In [None]:
!pip install datasets

from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from transformers import BertTokenizer
from datasets import load_dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



In [1]:
!pip install pandas



In [2]:
import pandas as pd

def load_data_from_csv(file_path):
    data = pd.read_csv(file_path)

    texts = data['Text'].tolist()

    label_mapping = {'left': 0, 'right': 1, 'center': 2}
    labels = data['Leaning'].map(label_mapping).tolist()

    return texts, labels

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [4]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score, classification_report
import pandas as pd
import gc
import numpy as np

class BERT_LSTM_Model(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', lstm_hidden_size=128, num_classes=3):
        super(BERT_LSTM_Model, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.lstm = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=lstm_hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=False
        )
        self.fc = nn.Linear(lstm_hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state
        lstm_output, _ = self.lstm(sequence_output)
        lstm_last_hidden_state = lstm_output[:, -1, :]
        logits = self.fc(lstm_last_hidden_state)
        return logits

class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

def compute_metrics(preds, labels):
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=0)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "macro_f1": f1,
    }

def train(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss, correct, total = 0, 0, 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        _, preds = torch.max(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    accuracy = correct / total
    return avg_loss, accuracy

def evaluate(model, val_loader, criterion, device):
    model.eval()
    total_loss, correct, total = 0, 0, 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            total_loss += loss.item()
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    avg_loss = total_loss / len(val_loader)
    accuracy = correct / total
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    metrics = compute_metrics(preds=all_preds, labels=all_labels)
    return avg_loss, accuracy, metrics, all_preds, all_labels


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BERT_LSTM_Model().to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

chunk_size = 50
file_path = 'preprocessed_final_dataset.csv'
num_epochs = 4
batch_size = 32
max_length = 128

df = pd.read_csv(file_path)

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

all_preds, all_labels = [], []

all_val_texts, all_val_labels = [], []

for chunk_idx in range(0, len(df), chunk_size):
    chunk = df.iloc[chunk_idx:chunk_idx + chunk_size]
    print(f"Processing chunk {chunk_idx // chunk_size + 1}...")

    texts = chunk['Text'].tolist()
    labels = chunk['Leaning'].map({'left': 0, 'right': 1, 'center': 2}).tolist()

    print(f"Chunk texts: {texts}")
    print(f"Chunk labels: {labels}")

    split_idx = int(0.8 * len(texts))
    train_texts, val_texts = texts[:split_idx], texts[split_idx:]
    train_labels, val_labels = labels[:split_idx], labels[split_idx:]

    all_val_texts.extend(val_texts)
    all_val_labels.extend(val_labels)

    train_dataset = NewsDataset(train_texts, train_labels, tokenizer, max_length=max_length)
    val_dataset = NewsDataset(val_texts, val_labels, tokenizer, max_length=max_length)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc, val_metrics, val_preds, val_labels = evaluate(model, val_loader, criterion, device)

        print(f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}")
        print(f"Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_acc:.4f}")
        print(f"Validation Metrics: {val_metrics}")

        all_preds.extend(val_preds)
        all_labels.extend(val_labels)

    # checkpoint_path = f"bert_lstm_chunk_{chunk_idx // chunk_size + 1}.pth"
    # torch.save(model.state_dict(), checkpoint_path)
    print(f"Saved checkpoint to {checkpoint_path}")

    del train_dataset, val_dataset, train_loader, val_loader
    gc.collect()
    torch.cuda.empty_cache()

final_val_dataset = NewsDataset(all_val_texts, all_val_labels, tokenizer, max_length=max_length)
final_val_loader = DataLoader(final_val_dataset, batch_size=batch_size)

final_loss, final_acc, final_metrics, final_preds, final_labels = evaluate(model, final_val_loader, criterion, device)

print("\nFinal evaluation on entire validation dataset:")
print(f"Loss: {final_loss:.4f} | Accuracy: {final_acc:.4f}")
print(f"Final Metrics: {final_metrics}")

print("\nFinal report on entire validation dataset:")
print(classification_report(final_labels, final_preds, target_names=["left", "right", "center"]))

print("Training complete")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Validation Metrics: {'accuracy': 0.6, 'precision': 0.5833333333333334, 'recall': 0.6333333333333333, 'macro_f1': 0.5857142857142857}
Epoch 4/4
Train Loss: 0.3312 | Train Accuracy: 0.8750
Validation Loss: 1.2185 | Validation Accuracy: 0.6000
Validation Metrics: {'accuracy': 0.6, 'precision': 0.5833333333333334, 'recall': 0.6333333333333333, 'macro_f1': 0.5857142857142857}
Processing chunk 181...
Chunk texts: ['johnson has traipsed around iowa and new hampshire this winter telling anyone who will listen that bain is not the unblemished job-generating engine that romney makes it out to be they bought my plant in 94', 'biden: fbi agents found classified documents about afghanistan in bidens delaware garage in 2022, along with drafts of a handwritten memo biden sent to president barack obama to persuade obama not to send more troops into the country, hurs report said', 'hello, broward county(applause)inskeep: the clinton event

In [None]:
def predict(text, model):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)

    model.eval()

    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        logits = outputs
        predictions = torch.argmax(logits, dim=1).cpu().numpy()

    label_map = {0: 'left', 1: 'right', 2: 'center'}
    predicted_label = label_map[predictions[0]]

    return predicted_label



In [None]:
texts = [
    "Trump is the best",
    "Biden is the worst",
    "The Republican Party stands for freedom and capitalism.",
    "Democrats are champions of equality and social justice.",
    "Progressive policies are the way forward for America.",
    "Right-wing extremists are the biggest threat to democracy.",
    "Neither Democrats nor Republicans have all the answers.",
    "It's best to vote for the Republican candidate, as he will conserve America's values."
]

predicted_leanings = [predict(text, model) for text in texts]

for text, leaning in zip(texts, predicted_leanings):
    print(f"Text: {text}\nPredicted Leaning: {leaning}\n")


Text: Trump is the best
Predicted Leaning: right

Text: Biden is the worst
Predicted Leaning: right

Text: The Republican Party stands for freedom and capitalism.
Predicted Leaning: left

Text: Democrats are champions of equality and social justice.
Predicted Leaning: left

Text: Progressive policies are the way forward for America.
Predicted Leaning: left

Text: Right-wing extremists are the biggest threat to democracy.
Predicted Leaning: left

Text: Neither Democrats nor Republicans have all the answers.
Predicted Leaning: right

Text: It's best to vote for the Republican candidate, as he will conserve America's values.
Predicted Leaning: left



In [None]:
import os

# Save the trained model to disk
def save_model(model, path='bert_cnn_bilstm_model.pth'):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")

save_model(model, path='bert_cnn_bilstm_model.pth')


Model saved to bert_cnn_bilstm_model.pth
