In [1]:
!pip install transformers torch



In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
from tqdm.notebook import tqdm
import torch

print('System Version:', sys.version)
print('PyTorch version', torch.__version__)
print('Numpy version', np.__version__)
print('Pandas version', pd.__version__)

System Version: 3.10.12 (main, Jul 29 2024, 16:56:48) [GCC 11.4.0]
PyTorch version 2.4.0+cu121
Numpy version 1.26.4
Pandas version 2.1.4


## Step: Load Dataset

In [3]:
import pandas as pd

# Replace 'your_file.csv' with the name of your uploaded file
df = pd.read_csv('/content/mbic.csv')

# Display the first few rows of the DataFrame
df[["text", "label"]]

Unnamed: 0,text,label
0,The transgender effort to suppress any recogni...,Non-biased
1,Radical Virginia Citizens Defense League has o...,Non-biased
2,Miller is the architect of President Donald Tr...,Non-biased
3,"The House Democrats’ 1,400-page coronavirus re...",Non-biased
4,A specter is haunting the West; our elites see...,Non-biased
...,...,...
17770,Oregon first graders could attend gun safety c...,Biased
17771,But President Donald Trump and his campaign ad...,Biased
17772,The normalization effort included softened rhe...,Biased
17773,"And so, while demonstrations began to spread a...",Biased


In [4]:
texts = df["text"].tolist()  # List of sentences
labels = df["label"].tolist()  # Corresponding labels

In [5]:
# Clean the texts
import re

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

texts = [clean_text(text) for text in texts]

## Step: Prepare the Data

In [6]:
# Split data into train, test, validation
from sklearn.model_selection import train_test_split

# Split data into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Further split the training data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.25, random_state=42)
# 0.25 * 0.8 = 0.2 of the original data goes to validation

In [7]:
# Encode labels into numbers
from sklearn.preprocessing import LabelEncoder

# Convert labels to integers
label_encoder = LabelEncoder()
label_encoder.fit(labels)  # Fit on the entire set of labels

# Transform the labels for each dataset split
encoded_train_labels = label_encoder.transform(train_labels)
encoded_val_labels = label_encoder.transform(val_labels)
encoded_test_labels = label_encoder.transform(test_labels)

In [8]:
# Tokenize the texts
from transformers import DistilBertTokenizer

# Load the pre-trained BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize each texts set
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512, return_tensors='pt')
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512, return_tensors='pt')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [9]:
# Design Dataset to keep data
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
train_dataset = TextDataset(train_encodings, encoded_train_labels)
val_dataset = TextDataset(val_encodings, encoded_val_labels)
test_dataset = TextDataset(test_encodings, encoded_test_labels)

## Step: Training the Model

In [11]:
# Load the pre-trained model
from transformers import DistilBertForSequenceClassification

# Load the BERT model for sequence classification
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    learning_rate=1e-5,
    lr_scheduler_type='cosine'
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [13]:
# Train the model
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,0.6153,0.623433
2,0.6056,0.591291
3,0.6102,0.581972
4,0.521,0.588069


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=1336, training_loss=0.6060324609279633, metrics={'train_runtime': 452.5998, 'train_samples_per_second': 94.255, 'train_steps_per_second': 2.952, 'total_flos': 1158908630468400.0, 'train_loss': 0.6060324609279633, 'epoch': 4.0})

In [14]:
eval_results = trainer.evaluate(test_dataset)
print(eval_results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.5913288593292236, 'eval_runtime': 12.4795, 'eval_samples_per_second': 284.868, 'eval_steps_per_second': 8.975, 'epoch': 4.0}


In [15]:
# Prediction

def predict(text):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Tokenize the new data
    new_encodings = tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors='pt')

    # To cuda
    new_encodings = {key: tensor.to(device) for key, tensor in new_encodings.items()}
    model.to(device)

    with torch.no_grad():
        outputs = model(**new_encodings)
        logits = outputs.logits  # Raw predictions (logits)
        # Convert logits to predicted class labels
        predicted_class_ids = torch.argmax(logits, dim=1).to('cpu')
        # Optionally, convert class IDs back to class names
        predicted_labels = label_encoder.inverse_transform(predicted_class_ids.numpy())
    return predicted_labels

# Example usage
prediction = predict("Today, the senator gave a bad briefing to the press.")
print(f"Predicted bias: {prediction}")

Predicted bias: ['Biased']


In [16]:
# Perform predictions on the test dataset
predictions, label_ids, metrics = trainer.predict(test_dataset)

# Convert predictions (logits) to predicted class labels if needed
predicted_class_ids = np.argmax(predictions, axis=1)

# Optionally, if you have a label encoder and want to convert the class IDs back to class names
predicted_labels = label_encoder.inverse_transform(predicted_class_ids)

# Print out the metrics, such as accuracy, precision, recall, etc.
print(metrics)

# Optionally, print out the predictions and true labels for comparison
print("Predictions:", predicted_class_ids)
print("True Labels:", label_ids)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'test_loss': 0.5913288593292236, 'test_runtime': 12.5077, 'test_samples_per_second': 284.225, 'test_steps_per_second': 8.954}
Predictions: [1 0 0 ... 1 0 0]
True Labels: [0 1 0 ... 1 1 0]


In [24]:
# 0 = biased , 1 = non-biased

# ANSI escape codes for colors
RED = "\033[91m"
GREEN = "\033[92m"
RESET = "\033[0m"

correct = 0
to_test = test_texts
test_display_limits = 15

for i in range(len(to_test)):

  # Example with conditional coloring
  true_label = "Biased" if label_ids[i] == 0 else "Non-Biased"
  predicted_label = "Biased" if predicted_class_ids[i] == 0 else "Non-Biased"
  # Color red if the labels don't match, green if they do
  color = GREEN if true_label == predicted_label else RED

  # Detailed test result, but only print a few to not clutter the display
  if i < test_display_limits:
    print(f"{test_texts[i]}")
    print(f"Predict: {color}{predicted_label}{RESET} | True: {color}{true_label}{RESET}\n\n")

  # Correct counter
  if label_ids[i] == predicted_class_ids[i]:
    correct += 1

their work came on the heels of an ipcc report that warned of dire environmental economic and health consequences in the absence of any serious momentum toward decarbonization by 2030
Predict: [91mNon-Biased[0m | True: [91mBiased[0m


the gop controls both the legislature and the michigan state capitol commission  the two entities that could enact a ban  and neither appears willing to act
Predict: [91mBiased[0m | True: [91mNon-Biased[0m


authoritarianism experts who worried about trumps tendencies during his campaign and his first years in office are now sounding fresh alarms following the clearing of a park adjacent to the white house on monday using gas flashbang grenades pepper pellets and other aggressive tactics  all so he could stand in front of a church he does not attend and be photographed holding a bible
Predict: [92mBiased[0m | True: [92mBiased[0m


such a ban would have been of little impact because the key element in such mass attacks is time and the virginia 

In [25]:
print(f"Accuracy: {correct}/{len(to_test)}")
print(f"Percentage: {int( correct/len(to_test)*100 )}/100")

Accuracy: 2413/3555
Percentage: 67/100


In [19]:
# # Zip the results folder
# !zip -r results.zip ./results

# # Download the zipped folder
# from google.colab import files
# files.download('results.zip')