In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [21]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch

### Data Preparation

In [45]:
# Load the raw climate-change article table from the project-relative data directory.
cc_df = pd.read_csv("data/ai_climate_israel_articles/climate-change_articles.csv")

In [46]:
# Clean the article text column in one idempotent chain (no in-place mutation):
#   1. Drop rows with missing text FIRST. Casting to str beforehand would turn NaN
#      into the literal string "nan", which dropna() can no longer detect.
#   2. Cast the remaining values to str so the .str accessor is safe to use.
#   3. Keep only articles with at least 512 characters of text.
cc_df = (
    cc_df
    .dropna(subset=['article_text'])
    .assign(article_text=lambda frame: frame['article_text'].astype(str))
    .loc[lambda frame: frame['article_text'].str.len() >= 512]
    .copy()
)

In [47]:
# Display (rows, columns) of the frame after the text-length filter.
cc_df.shape

(48900, 25)

In [49]:
# Keep only articles whose source_bias is not 'unknown'.
# NOTE(review): the earlier comment also mentioned removing duplicate-URL articles,
# but no de-duplication is performed in this cell — confirm whether a
# drop_duplicates step is missing.
cc_df = cc_df[cc_df['source_bias'] != 'unknown'].copy()
cc_df.shape

(19176, 25)

In [76]:
# Fixed-seed random sample of 100 articles used for the demo run below.
cc_df_demo = cc_df.sample(n=100, random_state=42)

### Pretrained Model for Bias Classification

In [None]:
# Hugging Face Hub repository of the pre-trained bias classifier.
repository = "premsa/political-bias-prediction-allsides-BERT"

model = AutoModelForSequenceClassification.from_pretrained(repository)
tokenizer = AutoTokenizer.from_pretrained(repository)

# Pick the device and move the model BEFORE building the pipeline, and tell the
# pipeline which device is in use. Creating the pipeline first (defaulting to CPU)
# and moving the model afterwards leaves the pipeline feeding CPU tensors to a
# model that may now live on the GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)

# Inference mode: disables dropout so repeated runs give the same probabilities.
model.eval()

In [132]:
# Class index -> human-readable bias label, used to decode the argmax of the
# averaged class probabilities.
# NOTE(review): assumed to match the checkpoint's label order — confirm against
# model.config.id2label.
label_mapping = dict(enumerate(("Left", "Center", "Right")))

In [133]:
# chunk texts
def chunk_token_ids(text, tokenizer, max_length=512):
    """Split ``text`` into consecutive token-id chunks of at most ``max_length`` ids.

    The text is encoded once without special tokens, cut into non-overlapping
    windows of ``max_length - 2`` ids, and each window is wrapped with the
    tokenizer's CLS and SEP ids so every chunk is a self-contained model input.

    Parameters
    ----------
    text : str
        Text to tokenize.
    tokenizer
        Object exposing ``encode(text, add_special_tokens=False)`` and the
        attributes ``cls_token_id`` and ``sep_token_id``.
    max_length : int, default 512
        Maximum chunk length including the two special tokens. Must be >= 3.

    Returns
    -------
    list[list[int]]
        One id list per chunk, in document order. Empty when the text encodes
        to zero tokens.

    Raises
    ------
    ValueError
        If ``max_length`` leaves no room for content tokens (``max_length < 3``).
    """
    # Two slots per chunk are reserved for the CLS and SEP ids.
    effective_length = max_length - 2
    if effective_length < 1:
        # Previously: max_length == 2 raised an opaque range() error and smaller
        # values silently produced no chunks at all.
        raise ValueError(
            f"max_length must be at least 3 to leave room for content tokens, got {max_length}"
        )

    encoded = tokenizer.encode(text, add_special_tokens=False)
    # Each window holds at most effective_length ids, so with the two special
    # tokens a chunk can never exceed max_length; no further truncation needed.
    return [
        [tokenizer.cls_token_id] + encoded[start:start + effective_length] + [tokenizer.sep_token_id]
        for start in range(0, len(encoded), effective_length)
    ]

In [134]:
# Tokenize every demo article into model-sized chunks: one list of chunks per article.
chunks_list = []
for article in tqdm(cc_df_demo["article_text"], desc="Chunking articles"):
    chunks_list.append(chunk_token_ids(article, tokenizer, max_length=512))

# Flatten the chunks into one list for batched inference while recording, for each
# article position, which positions of the flat list belong to it.
all_chunks = []
article_chunk_map = {}
current_index = 0
for idx, article_chunks in enumerate(chunks_list):
    first_position = current_index
    all_chunks.extend(article_chunks)
    current_index = len(all_chunks)
    article_chunk_map[idx] = list(range(first_position, current_index))

Chunking articles:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (917 > 512). Running this sequence through the model will result in indexing errors
Chunking articles: 100%|██████████| 100/100 [00:00<00:00, 477.67it/s]


In [135]:
batch_size = 32
all_probs = []  # one probability vector per chunk, in the same order as all_chunks

for i in tqdm(range(0, len(all_chunks), batch_size), desc="Processing chunks"):
    batch_chunks = all_chunks[i:i+batch_size]

    # Pad the chunks of this batch to a common length, then move the tensors to the
    # device the model lives on. Without this the forward pass raises a device
    # mismatch error whenever the model has been moved to the GPU.
    batch_inputs = tokenizer.pad({"input_ids": batch_chunks}, return_tensors="pt").to(model.device)

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**batch_inputs)
    logits = outputs.logits
    # Apply softmax to get class probabilities, then bring them back to the CPU.
    probs = torch.softmax(logits, dim=-1).cpu().numpy()
    all_probs.extend(probs)

Processing chunks:   0%|          | 0/8 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Processing chunks: 100%|██████████| 8/8 [00:37<00:00,  4.66s/it]


In [136]:
# Collapse chunk-level probabilities into a single label per article.
predicted_labels = []
for idx in range(len(cc_df_demo)):
    chunk_positions = article_chunk_map.get(idx, [])
    if chunk_positions:
        # Mean class probability over all chunks of the article; the winning
        # class index is decoded through label_mapping.
        mean_probs = np.mean([all_probs[pos] for pos in chunk_positions], axis=0)
        predicted_labels.append(label_mapping[int(np.argmax(mean_probs))])
    else:
        # Articles that produced no chunks get no prediction.
        predicted_labels.append(None)

cc_df_demo["article_bias"] = predicted_labels

In [137]:
# Count of each predicted article-level label in the demo sample.
cc_df_demo["article_bias"].value_counts()

article_bias
Left      77
Right     14
Center     9
Name: count, dtype: int64

In [138]:
# Count of each source_bias value in the demo sample.
cc_df_demo["source_bias"].value_counts()

source_bias
center       37
leanLeft     37
left          8
leanRight     8
right         7
farRight      3
Name: count, dtype: int64