In [None]:
!pip install --quiet torch transformers scikit-learn pandas numpy

In [None]:
# Cell 3: Imports & device check
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from transformers import BertTokenizer, BertModel

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


In [None]:
# Cell 5: Load & preprocess data
# Read the two splits (training first, then test) from the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("Training.csv", "Test.csv")
)

def preprocess(df):
    """Add the combined-text column and the word-count features to ``df``.

    Reads the ``title``, ``text`` and ``heading`` columns and writes three new
    columns: ``full_text``, ``text_word_count`` and ``heading_word_count``.
    The frame is modified in place and the same object is returned.

    Missing values are treated as empty strings throughout, so a missing
    ``text`` or ``heading`` yields a word count of 0 (stringifying a missing
    value first would count it as one word).
    """
    def _word_count(column):
        # Fill BEFORE casting to str: astype(str) alone turns NaN/None into the
        # literal text "nan"/"None", which would be counted as a word.
        return column.fillna('').astype(str).map(lambda cell: len(cell.split()))

    # 1) concatenate title + body
    df['full_text'] = df['title'].fillna('') + '. ' + df['text'].fillna('')
    # 2) simple numeric features
    df['text_word_count'] = _word_count(df['text'])
    # NOTE(review): the combined text uses 'title' while this count uses
    # 'heading' -- confirm that both columns exist and that this is intended.
    df['heading_word_count'] = _word_count(df['heading'])
    return df

# Derive the text and word-count columns for both splits (train first, then test).
train_df, test_df = (preprocess(split_df) for split_df in (train_df, test_df))


In [None]:
# Cell 6: Compute BERT embeddings
# Load the tokenizer and encoder from the same checkpoint, move the encoder to
# the selected device and switch it to evaluation mode.
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertModel.from_pretrained(bert_checkpoint)
bert_model = bert_model.to(device)
bert_model.eval()

def get_bert_embeddings(texts, batch_size=32):
    """Embed ``texts`` with the module-level BERT model, one batch at a time.

    Parameters
    ----------
    texts : iterable of str
        Documents to embed. Any iterable is accepted (list, tuple, pandas
        Series, ...); it is materialised into a list first so that each batch
        handed to the tokenizer is a plain list of strings.
    batch_size : int, default 32
        Number of texts tokenized and encoded per forward pass.

    Returns
    -------
    numpy.ndarray
        The per-batch ``pooler_output`` arrays stacked row-wise, one row per
        input text. For empty input an array of shape ``(0, hidden_size)`` is
        returned, so it can still be stacked with other feature matrices.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)
    if not texts:
        # np.vstack([]) raises, so return an explicitly shaped empty result.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    embs = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pad to the longest text in the batch; truncate at 512 tokens.
        enc = tokenizer(batch,
                        padding=True,
                        truncation=True,
                        max_length=512,
                        return_tensors="pt").to(device)
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            out = bert_model(**enc)
        # Move to host memory so the results can be stacked with numpy.
        embs.append(out.pooler_output.cpu().numpy())
    return np.vstack(embs)

# Embed the combined text of each split; the training split is processed first.
print("Computing training embeddings...")
train_texts = train_df["full_text"].tolist()
X_emb_train = get_bert_embeddings(train_texts)

print("Computing test embeddings...")
test_texts = test_df["full_text"].tolist()
X_emb_test = get_bert_embeddings(test_texts)


In [None]:
# Cell 7: Prepare categorical & numeric features
cat_cols = ["source"]
num_cols = ["text_word_count", "heading_word_count"]

# One-hot encode the categorical column(s) (categories unseen during fit are
# ignored at transform time) and standardise the numeric ones; every other
# column of the frame is dropped.
feature_steps = [
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", StandardScaler(), num_cols),
]
ohe_scaler = ColumnTransformer(feature_steps, remainder="drop")

# Fit on the training split only, then apply the fitted transformer to both splits.
ohe_scaler.fit(train_df)
X_extra_train, X_extra_test = (
    ohe_scaler.transform(split_df) for split_df in (train_df, test_df)
)

# convert sparse → dense
if hasattr(X_extra_train, "toarray"):
    X_extra_train, X_extra_test = X_extra_train.toarray(), X_extra_test.toarray()


In [None]:
# === Cell 8a: Prepare label encoder ===
from sklearn.preprocessing import LabelEncoder

# Learn the class -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
le = LabelEncoder().fit(train_df["bias_rating"])
y_train_enc = le.transform(train_df["bias_rating"])
y_test_enc = le.transform(test_df["bias_rating"])

# === Cell 8b/8c: Source-feature ablation (WITH vs. WITHOUT 'source') ===
# Both experiments go through one shared routine so that the only difference
# between them is the feature matrix.
MLP_PARAM_GRID = {"hidden_layer_sizes": [(512,256),(256,128)], "alpha":[1e-4,1e-3]}


def run_mlp_grid_search(X_train, y_train, X_test, y_test, label):
    """Grid-search an MLP on the training data and score the best model on the test data.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for prediction.
    y_train, y_test : array-like
        Encoded labels for the two splits.
    label : str
        Experiment name shown in the progress message.

    Returns
    -------
    tuple
        ``(grid, y_pred, accuracy)``: the fitted ``GridSearchCV`` object, the
        best estimator's predictions for ``X_test``, and their accuracy
        against ``y_test``.
    """
    mlp = MLPClassifier(max_iter=100, early_stopping=True, random_state=42)
    grid = GridSearchCV(
        mlp,
        MLP_PARAM_GRID,
        cv=3, scoring="f1_macro", verbose=0, n_jobs=-1, error_score="raise"
    )
    print(f"Tuning {label}…")
    grid.fit(X_train, y_train)
    y_pred = grid.best_estimator_.predict(X_test)
    return grid, y_pred, accuracy_score(y_test, y_pred)


# --- Experiment 1 – With source (original feature set) ---
X_train_with = np.hstack([X_emb_train, X_extra_train])
X_test_with  = np.hstack([X_emb_test,  X_extra_test])

grid_with, y_pred_with, acc_with = run_mlp_grid_search(
    X_train_with, y_train_enc, X_test_with, y_test_enc, "WITH source"
)
mlp_with = grid_with.estimator  # the unfitted base estimator given to the search

# --- Experiment 2 – WITHOUT source ---
# 1) Re-build the extra features from the numeric columns only (no 'source').
scaler_no = StandardScaler()
X_num_train = scaler_no.fit_transform(train_df[num_cols])
X_num_test  = scaler_no.transform(test_df[num_cols])

# 2) Stack embeddings + numeric only
X_train_no = np.hstack([X_emb_train, X_num_train])
X_test_no  = np.hstack([X_emb_test,  X_num_test])

# 3) Grid-search the same MLP
grid_no, y_pred_no, acc_no = run_mlp_grid_search(
    X_train_no, y_train_enc, X_test_no, y_test_enc, "WITHOUT source"
)
mlp_no = grid_no.estimator  # the unfitted base estimator given to the search

# === Cell 9: Compare results ===
print(f"WITH source:    Accuracy={acc_with:.4f}  BestParams={grid_with.best_params_}")
print(f"WITHOUT source: Accuracy={acc_no:.4f}  BestParams={grid_no.best_params_}\n")

# Pin the label set to every class the encoder knows. Without `labels=`, a class
# that is absent from both the test labels and the predictions makes the number
# of report rows differ from the number of target names, which raises.
report_labels = np.arange(len(le.classes_))
# The report expects string names; the cast is a no-op for string classes.
report_names = [str(class_name) for class_name in le.classes_]

print("Classification Report WITH source:")
print(classification_report(y_test_enc, y_pred_with,
                            labels=report_labels, target_names=report_names))

print("Classification Report WITHOUT source:")
print(classification_report(y_test_enc, y_pred_no,
                            labels=report_labels, target_names=report_names))


In [None]:
# Accuracy comparison bar chart
import matplotlib.pyplot as plt

# Side-by-side bars of test accuracy for the two feature sets, on a 0-1 axis.
fig, ax = plt.subplots()
ax.bar(['With source', 'Without source'], [acc_with, acc_no])
ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy: With vs. Without Source Feature')
fig.tight_layout()
plt.show()



In [None]:
# Cell Per-Class F1-Score Comparison
classes = le.classes_

# Pin the label set so each metric array has exactly one entry per class, in
# the same order as `classes`. Without `labels=`, a class that appears in
# neither the test labels nor the predictions is dropped and the bars no longer
# line up with the tick labels.
label_ids = np.arange(len(classes))

# compute per-class precision / recall / F1 for each experiment
prec_w, rec_w, f1_w, _ = precision_recall_fscore_support(
    y_test_enc, y_pred_with, labels=label_ids, average=None)
prec_n, rec_n, f1_n, _ = precision_recall_fscore_support(
    y_test_enc, y_pred_no, labels=label_ids, average=None)

x = range(len(classes))
width = 0.35  # bar width; the two bars of a class sit half a width either side of its tick

fig, ax = plt.subplots()
ax.bar([i - width/2 for i in x], f1_w, width, label='With source')
ax.bar([i + width/2 for i in x], f1_n, width, label='Without source')
ax.set_xticks(list(x))
ax.set_xticklabels(classes)
ax.set_ylabel('F1 Score')
ax.set_title('Per-Class F1: With vs. Without Source')
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
#  Confusion Matrix — With Source
labels = le.classes_
tick_positions = list(range(len(labels)))

# Pin the label set so the matrix is always len(labels) x len(labels). Without
# `labels=`, a class missing from both the test labels and the predictions
# shrinks the matrix and the annotation loop below indexes out of bounds.
cm = confusion_matrix(y_test_enc, y_pred_with, labels=np.arange(len(labels)))

fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest')
ax.set_xticks(tick_positions)
ax.set_xticklabels(labels, rotation=45)
ax.set_yticks(tick_positions)
ax.set_yticklabels(labels)
fig.colorbar(im, ax=ax)
ax.set_title('Confusion Matrix: With Source')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
# annotate every cell with its count (rows = true class, columns = predicted)
for i in tick_positions:
    for j in tick_positions:
        ax.text(j, i, cm[i, j], ha='center', va='center')
fig.tight_layout()
plt.show()


In [None]:
# Confusion Matrix — Without Source
# Defined here as well so this cell does not depend on a variable left behind
# by another plotting cell.
labels = le.classes_
tick_positions = list(range(len(labels)))

# Pin the label set so the matrix is always len(labels) x len(labels). Without
# `labels=`, a class missing from both the test labels and the predictions
# shrinks the matrix and the annotation loop below indexes out of bounds.
cm = confusion_matrix(y_test_enc, y_pred_no, labels=np.arange(len(labels)))

fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest')
ax.set_xticks(tick_positions)
ax.set_xticklabels(labels, rotation=45)
ax.set_yticks(tick_positions)
ax.set_yticklabels(labels)
fig.colorbar(im, ax=ax)
ax.set_title('Confusion Matrix: Without Source')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
# annotate every cell with its count (rows = true class, columns = predicted)
for i in tick_positions:
    for j in tick_positions:
        ax.text(j, i, cm[i, j], ha='center', va='center')
fig.tight_layout()
plt.show()


In [None]:
# Cell X: Class Distribution Counts & Bar Chart

import matplotlib.pyplot as plt

# 1) Per-class sample counts for each split, ordered by class label
train_counts = train_df['bias_rating'].value_counts().sort_index()
test_counts  = test_df['bias_rating'].value_counts().sort_index()

print("Training set class counts:\n", train_counts, "\n")
print("Test set class counts:\n",  test_counts)

# 2) + 3) One bar chart per split: training first (default colour), then test (orange)
for split_counts, split_name, bar_kwargs in (
    (train_counts, 'Training', {}),
    (test_counts,  'Test',     {'color': 'orange'}),
):
    plt.figure(figsize=(6,4))
    split_counts.plot(kind='bar', **bar_kwargs)
    plt.title(f'Class Distribution in {split_name} Set')
    plt.xlabel('Bias Rating')
    plt.ylabel('Number of Samples')
    plt.tight_layout()
    plt.show()
