In [1]:
!pip install kaggle

Collecting kaggle
  Downloading kaggle-1.6.17.tar.gz (82 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting python-slugify (from kaggle)
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting bleach (from kaggle)
  Using cached bleach-6.2.0-py3-none-any.whl.metadata (30 kB)
Collecting text-unidecode>=1.3 (from python-slugify->kaggle)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Using cached bleach-6.2.0-py3-none-any.whl (163 kB)
Downloading python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.6.17-py3-none-any.whl size=105839 sha256=a35e67651362caefba30a79a24fb0153ac167fa5109d9ac348d7c963ad161ff8
  Stored in directory: /Users/yuuvjauhari/Library/Caches/pip/wheels/9f/af/22/bf406f913dc7506a485e60dce8143741abd0a92a1

In [None]:
import os
import pandas as pd
#Download the dataset manually from Kaggle and place in the same folder as this jupyter notebook
data_dir = "archive"  

# Define expected categories and labels
expected_categories = {
    "Center Data": 0,
    "Right Data": 1,
    "Left Data": 2
}

data = []

# Loop through each category folder
for category, label in expected_categories.items():
    category_path = os.path.join(data_dir, category)

    # Make sure the folder exists before processing
    if not os.path.isdir(category_path):
        print(f"⚠ Skipping missing folder: {category}")
        continue

    # Walk through all subdirectories and files
    for root, _, files in os.walk(category_path):
        for filename in files:
            file_path = os.path.join(root, filename)

            try:
                # Try reading in UTF-8 first
                with open(file_path, "r", encoding="utf-8") as file:
                    text = file.read()
            except UnicodeDecodeError:
                try:
                    # If UTF-8 fails, try ISO-8859-1 
                    with open(file_path, "r", encoding="ISO-8859-1") as file:
                        text = file.read()
                except Exception as e:
                    print(f"❌ Error reading file: {file_path} | Skipping... Error: {e}")
                    continue  # Skip the problematic file

            # Append text and label to dataset list
            data.append({"text": text, "label": label})

df = pd.DataFrame(data)

print(df.head())
print(f"Loaded {len(df)} news articles.")



                                                text  label
0     Bud1                                 ...      0
1  The Trump administration is shattering a grues...      0
2                                                ...      0
3  WASHINGTON (AP) — AstraZeneca reported Monday ...      0
4  Donald Trump says the government should get a ...      0
✔ Successfully loaded 17365 news articles.


In [None]:
from collections import Counter

label_counts = Counter(df["label"])

for label, count in label_counts.items():
    print(f"Label {label}: {count} articles")

label_mapping = {0: "Center", 1: "Right", 2: "Left"}
for label, count in label_counts.items():
    print(f"{label_mapping[label]}: {count} articles")


Label 0: 3997 articles
Label 1: 5564 articles
Label 2: 7804 articles
Center: 3997 articles
Right: 5564 articles
Left: 7804 articles


In [None]:

target_size = min(label_counts.values())  # 3997 (Center articles)

# Downsample Right & Left articles
df_center = df[df["label"] == 0] 
df_right = df[df["label"] == 1].sample(target_size, random_state=42)
df_left = df[df["label"] == 2].sample(target_size, random_state=42)

df_balanced = pd.concat([df_center, df_right, df_left]).sample(frac=1, random_state=42)  # Shuffle

print(df_balanced["label"].value_counts())


label
0    3997
2    3997
1    3997
Name: count, dtype: int64


In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_balanced)

print(dataset)


  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 11991
})


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

print(tokenized_dataset)


Map: 100%|██████████| 11991/11991 [00:08<00:00, 1416.10 examples/s]

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 11991
})





In [None]:
from datasets import DatasetDict

train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

valid_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    "train": train_test_split["train"],
    "validation": valid_test_split["train"],
    "test": valid_test_split["test"]
})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 9592
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1199
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1200
    })
})


In [None]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"]
)

# Train the model
trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4908,0.299575
2,0.2538,0.278293
3,0.1909,0.305926


TrainOutput(global_step=1800, training_loss=0.28735706753200957, metrics={'train_runtime': 2938.6149, 'train_samples_per_second': 9.792, 'train_steps_per_second': 0.613, 'total_flos': 7571351708540928.0, 'train_loss': 0.28735706753200957, 'epoch': 3.0})

In [None]:

results = trainer.evaluate(dataset["test"])
print("Test set results:", results)


Test set results: {'eval_loss': 0.24801483750343323, 'eval_runtime': 41.7505, 'eval_samples_per_second': 28.742, 'eval_steps_per_second': 1.796, 'epoch': 3.0}


In [28]:
from transformers import pipeline

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

label_mapping = {
    "LABEL_0": "Center",
    "LABEL_1": "Right",
    "LABEL_2": "Left"
}

# Examples
examples = [
    "The new policy aims to support small businesses through tax incentives.",
    "Government overreach is destroying personal freedoms.",
    "A balanced approach to social programs is necessary.",
    "Tax cuts for the wealthy will improve economic growth, says administration.",
    "Protesters demand stronger action on climate change from the government.",
    "Finally, taxpayer money is being redirected away from these left-wing indoctrination centers. It is encouraging to see that Trump is not just targeting Ivy League schools but extending this crackdown to universities across the board.",
    "The US plans to impose a 25% tariff on steel imports, but UK shares rose instead of falling in response to the news.",
    "After the ceasefire in Gaza, West Bank Palestinians face more Israeli barriers, traffic and misery",
    "Man charged over 'attempted murder of police officer' in Clydebank"

]

for text in examples:
    prediction = classifier(text)
    
    predicted_label = prediction[0]["label"]

    readable_label = label_mapping[predicted_label]

    confidence = prediction[0]["score"]

    print(f"Text: {text}\nPredicted Bias: {readable_label} (Confidence: {confidence:.2f})\n")



Device set to use mps:0


Text: The new policy aims to support small businesses through tax incentives.
Predicted Bias: Left (Confidence: 0.74)

Text: Government overreach is destroying personal freedoms.
Predicted Bias: Left (Confidence: 0.47)

Text: A balanced approach to social programs is necessary.
Predicted Bias: Left (Confidence: 0.79)

Text: Tax cuts for the wealthy will improve economic growth, says administration.
Predicted Bias: Left (Confidence: 0.96)

Text: Protesters demand stronger action on climate change from the government.
Predicted Bias: Left (Confidence: 0.79)

Text: Finally, taxpayer money is being redirected away from these left-wing indoctrination centers. It is encouraging to see that Trump is not just targeting Ivy League schools but extending this crackdown to universities across the board.
Predicted Bias: Right (Confidence: 0.79)

Text: The US plans to impose a 25% tariff on steel imports, but UK shares rose instead of falling in response to the news.
Predicted Bias: Left (Confidence

In [25]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

preds = trainer.predict(dataset["test"])
y_preds = np.argmax(preds.predictions, axis=1)  
y_true = np.array(dataset["test"]["label"])  
accuracy = accuracy_score(y_true, y_preds)
print(f"Test Accuracy: {accuracy:.4f}")

print(classification_report(y_true, y_preds, target_names=["Center", "Right", "Left"]))


Test Accuracy: 0.9292
              precision    recall  f1-score   support

      Center       0.95      0.91      0.93       419
       Right       0.96      0.95      0.95       403
        Left       0.89      0.94      0.91       378

    accuracy                           0.93      1200
   macro avg       0.93      0.93      0.93      1200
weighted avg       0.93      0.93      0.93      1200



In [None]:
import news_signals
import json
import requests

NEWSAPI_APP_KEY = ""
NEWSAPI_APP_ID = ""
HEADERS = {
    'X-AYLIEN-NewsAPI-Application-ID': NEWSAPI_APP_ID,
    'X-AYLIEN-NewsAPI-Application-Key': NEWSAPI_APP_KEY
}
                               


In [None]:
params = {
    "published_at": "[1DAY-NOW/DAY TO NOW]",
    "language": "(en)",
    "categories": "{{taxonomy:aylien AND id:(ay.appsci) AND score:>=0.65}}",
    "source.rankings.alexa.rank.min": "1",
    "source.rankings.alexa.rank.max": 100,
    "per_page": 100,
}

response = requests.get(
    url='https://api.aylien.com/v6/news/stories',
    params=params,
    headers=HEADERS
)
result = json.loads(response.content)
for s in result['stories']:
    print(f"Published At: {s['published_at']}")
    print(f"Title: {s['title']}")
    print(f"Body: {s['body']}") 
    print('-' * 80)

Published At: 2025-02-10T17:22:21Z
Title: Top Stock Movers Now: Cleveland-Cliffs, Rockwell Automation, Onsemi, and More
Body: Top Stock Movers Now: Cleveland-Cliffs, Rockwell Automation, Onsemi, and More  Bill McColl  Mon, February 10, 2025 at 5:55 PM GMT+1
 1 min read
    In This Article
  ON
   ROK
   ^GSPC
   Dustin Franz / Bloomberg via Getty Images
  Key Takeaways
  U.S. equities advanced at midday as tech companies and steel and aluminum manufacturers drove the market higher.
  President Donald Trump said he would impose 25% tariffs on steel and aluminum imports.
  ON Semiconductor said it continues to face difficult market conditions.
  U.S. equities were higher at midday, lifted by tech shares and steel and aluminum producers. The Nasdaq added 1%, and the Dow Jones Industrial Average and S&P 500 were higher as well.
  Shares of Nucor ( NUE ), Cleveland-Cliffs ( CLF ), and Alcoa ( AA ) were among the big winners after President Donald Trump said he would be slapping 25% tariffs 

In [None]:
classified_articles = []

for s in result["stories"]:
    article_body = s.get("body", "") 

    if not article_body.strip():  
        continue

    prediction = classifier(article_body[:512])[0]  # Truncate to 512 tokens

    predicted_label = label_mapping[prediction["label"]]
    confidence = round(prediction["score"], 2)

    classified_articles.append({
        "Published At": s["published_at"],
        "Title": s["title"],
        "Bias": predicted_label,
        "Confidence": confidence
    })

df_results = pd.DataFrame(classified_articles)
print(df_results)

df_results.to_csv("news_bias_results.csv", index=False)
print("Results saved to 'news_bias_results.csv'.")

            Published At                                              Title  \
0   2025-02-10T17:22:21Z  Top Stock Movers Now: Cleveland-Cliffs, Rockwe...   
1   2025-02-10T17:22:03Z  Earth’s Inner Core Has Warped Over the Past Tw...   
2   2025-02-10T17:21:50Z  Startup Nation Central &amp; GrowingIL Launche...   
3   2025-02-10T17:20:07Z  AINewsBreaks – Meta Platforms Inc. (NASDAQ: ME...   
4   2025-02-10T17:17:31Z  TOTAL PLAY SUCCESSFULLY CONCLUDES DEBT EXCHANG...   
..                   ...                                                ...   
95  2025-02-10T15:37:34Z  Space telescope spots rare 'Einstein ring' of ...   
96  2025-02-10T15:37:33Z  Cloudflare, Inc. (NET): This Software Infrastr...   
97  2025-02-10T15:36:51Z  CompassMSP Recognized on CRN's Prestigious MSP...   
98  2025-02-10T15:36:48Z  New Research from athenahealth Shows Patient D...   
99  2025-02-10T15:36:25Z  Arkema Partners with ISN® to Enhance Contracto...   

      Bias  Confidence  
0   Center        0.96  
1