In [None]:
!pip install kaggle

In [11]:
import os
import pandas as pd
#Download the dataset manually from Kaggle and place in the same folder as this jupyter notebook
data_dir = "archive"  

# Define expected categories and labels
expected_categories = {
    "Center Data": 0,
    "Right Data": 1,
    "Left Data": 2
}

data = []

# Loop through each category folder
for category, label in expected_categories.items():
    category_path = os.path.join(data_dir, category)

    # Make sure the folder exists before processing
    if not os.path.isdir(category_path):
        print(f"⚠ Skipping missing folder: {category}")
        continue

    # Walk through all subdirectories and files
    for root, _, files in os.walk(category_path):
        for filename in files:
            file_path = os.path.join(root, filename)

            try:
                # Try reading in UTF-8 first
                with open(file_path, "r", encoding="utf-8") as file:
                    text = file.read()
            except UnicodeDecodeError:
                try:
                    # If UTF-8 fails, try ISO-8859-1 
                    with open(file_path, "r", encoding="ISO-8859-1") as file:
                        text = file.read()
                except Exception as e:
                    print(f"❌ Error reading file: {file_path} | Skipping... Error: {e}")
                    continue  # Skip the problematic file

            # Append text and label to dataset list
            data.append({"text": text, "label": label})

df = pd.DataFrame(data)

print(df.head())
print(f"Loaded {len(df)} news articles.")



                                                text  label
0     Bud1                                 ...      0
1  The Trump administration is shattering a grues...      0
2                                                ...      0
3  WASHINGTON (AP) — AstraZeneca reported Monday ...      0
4  Donald Trump says the government should get a ...      0
Loaded 17365 news articles.


In [12]:
from collections import Counter

label_counts = Counter(df["label"])

for label, count in label_counts.items():
    print(f"Label {label}: {count} articles")

label_mapping = {0: "Center", 1: "Right", 2: "Left"}
for label, count in label_counts.items():
    print(f"{label_mapping[label]}: {count} articles")


Label 0: 3997 articles
Label 1: 5564 articles
Label 2: 7804 articles
Center: 3997 articles
Right: 5564 articles
Left: 7804 articles


In [13]:

target_size = min(label_counts.values())  # 3997 (Center articles)

# Downsample Right & Left articles
df_center = df[df["label"] == 0] 
df_right = df[df["label"] == 1].sample(target_size, random_state=42)
df_left = df[df["label"] == 2].sample(target_size, random_state=42)

df_balanced = pd.concat([df_center, df_right, df_left]).sample(frac=1, random_state=42)  # Shuffle

print(df_balanced["label"].value_counts())


label
0    3997
2    3997
1    3997
Name: count, dtype: int64


In [14]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_balanced)

print(dataset)


  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 11991
})


In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

print(tokenized_dataset)


Map: 100%|██████████| 11991/11991 [00:08<00:00, 1424.01 examples/s]

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 11991
})





In [16]:
from datasets import DatasetDict

train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

valid_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    "train": train_test_split["train"],
    "validation": valid_test_split["train"],
    "test": valid_test_split["test"]
})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 9592
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1199
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1200
    })
})


In [26]:
import os
import pickle
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments, pipeline, AutoTokenizer

MODEL_PICKLE_FILE = "bias_classifier.pkl"
MODEL_NAME = "roberta-base"

def train_model():
    global classifier, model, tokenizer  # Ensure model and tokenizer are accessible globally

    print("Training the model...")

    model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # Define tokenizer here

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=500,
        save_strategy="epoch",
        load_best_model_at_end=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"]
    )
    trainer.train()

    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

    with open(MODEL_PICKLE_FILE, "wb") as f:
        pickle.dump(classifier, f)

    print("Model trained and saved as:", MODEL_PICKLE_FILE)

# Load or Train Model
if os.path.exists(MODEL_PICKLE_FILE):
    print("Loading pickled model...")
    with open(MODEL_PICKLE_FILE, "rb") as f:
        classifier = pickle.load(f)
    model = classifier.model
    tokenizer = classifier.tokenizer 
else:
    train_model()

print("Model is ready")



Loading pickled model...
Model is ready


In [23]:

results = trainer.evaluate(dataset["test"])
print("Test set results:", results)


NameError: name 'trainer' is not defined

In [27]:
from transformers import pipeline

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

label_mapping = {
    "LABEL_0": "Center",
    "LABEL_1": "Right",
    "LABEL_2": "Left"
}

# Examples
examples = [
    "The new policy aims to support small businesses through tax incentives.",
    "Government overreach is destroying personal freedoms.",
    "A balanced approach to social programs is necessary.",
    "Tax cuts for the wealthy will improve economic growth, says administration.",
    "Protesters demand stronger action on climate change from the government.",
    "Finally, taxpayer money is being redirected away from these left-wing indoctrination centers. It is encouraging to see that Trump is not just targeting Ivy League schools but extending this crackdown to universities across the board.",
    "The US plans to impose a 25% tariff on steel imports, but UK shares rose instead of falling in response to the news.",
    "After the ceasefire in Gaza, West Bank Palestinians face more Israeli barriers, traffic and misery",
    "Man charged over 'attempted murder of police officer' in Clydebank"

]

for text in examples:
    prediction = classifier(text)
    
    predicted_label = prediction[0]["label"]

    readable_label = label_mapping[predicted_label]

    confidence = prediction[0]["score"]

    print(f"Text: {text}\nPredicted Bias: {readable_label} (Confidence: {confidence:.2f})\n")



Device set to use mps:0


Text: The new policy aims to support small businesses through tax incentives.
Predicted Bias: Left (Confidence: 0.42)

Text: Government overreach is destroying personal freedoms.
Predicted Bias: Right (Confidence: 0.77)

Text: A balanced approach to social programs is necessary.
Predicted Bias: Left (Confidence: 0.48)

Text: Tax cuts for the wealthy will improve economic growth, says administration.
Predicted Bias: Left (Confidence: 0.62)

Text: Protesters demand stronger action on climate change from the government.
Predicted Bias: Left (Confidence: 0.67)

Text: Finally, taxpayer money is being redirected away from these left-wing indoctrination centers. It is encouraging to see that Trump is not just targeting Ivy League schools but extending this crackdown to universities across the board.
Predicted Bias: Right (Confidence: 0.72)

Text: The US plans to impose a 25% tariff on steel imports, but UK shares rose instead of falling in response to the news.
Predicted Bias: Left (Confidenc

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

preds = trainer.predict(dataset["test"])
y_preds = np.argmax(preds.predictions, axis=1)  
y_true = np.array(dataset["test"]["label"])  
accuracy = accuracy_score(y_true, y_preds)
print(f"Test Accuracy: {accuracy:.4f}")

print(classification_report(y_true, y_preds, target_names=["Center", "Right", "Left"]))


In [None]:
import news_signals
import json
import requests

NEWSAPI_APP_KEY = "3fe25605f6f24e2fd93430a4552db8f1"
NEWSAPI_APP_ID = "2e104416"
HEADERS = {
    'X-AYLIEN-NewsAPI-Application-ID': NEWSAPI_APP_ID,
    'X-AYLIEN-NewsAPI-Application-Key': NEWSAPI_APP_KEY
}
                               


In [None]:
params = {
    "published_at": "[1DAY-NOW/DAY TO NOW]",
    "language": "(en)",
    "categories": "{{taxonomy:aylien AND id:(ay.appsci) AND score:>=0.65}}",
    "source.rankings.alexa.rank.min": "1",
    "source.rankings.alexa.rank.max": 100,
    "per_page": 100,
}

response = requests.get(
    url='https://api.aylien.com/v6/news/stories',
    params=params,
    headers=HEADERS
)
result = json.loads(response.content)
for s in result['stories']:
    print(f"Author:{s['author']}")
    print(f"Published At: {s['published_at']}")
    print(f"Title: {s['title']}")
    print(f"Body: {s['body']}") 
    print('-' * 80)

In [None]:
classified_articles = []

for s in result["stories"]:
    article_body = s.get("body", "") 

    if not article_body.strip():  
        continue

    prediction = classifier(article_body[:512])[0]  # Truncate to 512 tokens

    predicted_label = label_mapping[prediction["label"]]
    confidence = round(prediction["score"], 2)

    classified_articles.append({
        "Published At": s["published_at"],
        "Title": s["title"],
        "Bias": predicted_label,
        "Confidence": confidence
    })

df_results = pd.DataFrame(classified_articles)
print(df_results)

df_results.to_csv("news_bias_results.csv", index=False)
print("Results saved to 'news_bias_results.csv'.")