<a href="https://colab.research.google.com/github/Aditya-Walia1/Bert-Research/blob/main/D3_Optimised_albert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers datasets scikit-learn pandas torch

import pandas as pd
from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from google.colab import files
import numpy as np
import random

# Set random seeds for reproducibility
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Value handed unchanged to each library's seeding function.
    """
    # Same order as before: Python's `random`, NumPy's global RNG, torch,
    # and finally torch's CUDA generators.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed)

import io  # only needed here, to wrap uploaded bytes for pandas

set_seed(42)


def _read_uploaded_csv(uploaded_files, filename):
    """Load *filename* from this run's upload, falling back to the copy on disk.

    When a file with the same name already exists, `files.upload()` saves the
    new upload under a de-duplicated name (e.g. 'gossipcop_fake (1).csv'), so
    reading the hard-coded path would silently pick up the older file.
    Reading the uploaded bytes directly avoids that.

    Args:
        uploaded_files: Dict returned by `files.upload()`; expected to map the
            uploaded filename to its raw bytes.
        filename: Name of the CSV file to load.

    Returns:
        The parsed pandas DataFrame.
    """
    if filename in uploaded_files:
        return pd.read_csv(io.BytesIO(uploaded_files[filename]))
    # Nothing uploaded under this name in this run: use the file already on disk
    return pd.read_csv(filename)


# Upload both the files (fake and real news)
uploaded = files.upload()

# Load the two datasets
df_fake = _read_uploaded_csv(uploaded, 'gossipcop_fake.csv')
df_real = _read_uploaded_csv(uploaded, 'gossipcop_real.csv')

# Add a 'label' column to each dataset
df_fake['label'] = 0  # Fake news
df_real['label'] = 1  # Real news

# Combine the datasets
df_combined = pd.concat([df_fake, df_real], ignore_index=True)

# Shuffle the dataset to mix fake and real news.
# No random_state is passed, so the permutation is drawn from NumPy's global
# RNG, which set_seed(42) seeded at the top of this block.
df_combined = df_combined.sample(frac=1).reset_index(drop=True)

# Optional: Save the combined dataset to a new CSV file
df_combined.to_csv('gossipcop_combined.csv', index=False)

# Quick look at the merged frame: the first rows, then the available columns
print(df_combined.head())
print(df_combined.columns)

# Columns fed to the model; change these if your dataset uses other names
text_column, label_column = 'title', 'label'

# Map the label values to integer codes (fit the encoder, then apply it)
label_encoder = LabelEncoder()
label_encoder.fit(df_combined[label_column])
df_combined[label_column] = label_encoder.transform(df_combined[label_column])

# Show which original label values the encoder learned (optional check)
print("Encoded labels:", label_encoder.classes_)

# Preprocess the dataset
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable collection of raw texts; each item is converted with
            ``str()`` before tokenizing.
        labels: Indexable collection of integer class labels, aligned with
            ``texts``.
        tokenizer: Callable tokenizer (Hugging Face style) that accepts the
            keyword arguments used in ``__getitem__``.
        max_len: Fixed sequence length requested from the tokenizer via
            ``max_length`` (with padding and truncation enabled).
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the example at ``idx``.

        Returns:
            Dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which takes the same arguments for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        # flatten() removes the leading batch dimension so each item is 1-D
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }
# Maximum token length every example is padded/truncated to
MAX_LEN = 128

# ALBERT tokenizer matching the checkpoint loaded below
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Hold out 15% of the rows for validation; stratifying on the label keeps the
# class proportions the same in both splits
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_combined[text_column].values,
    df_combined[label_column].values,
    stratify=df_combined[label_column],
    test_size=0.15,
    random_state=42,
)

# Wrap each split in a CustomDataset (train first, then validation)
train_dataset, val_dataset = (
    CustomDataset(split_texts, split_labels, tokenizer, MAX_LEN)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

# Pre-trained ALBERT with a 2-class sequence-classification head
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)

import inspect  # used to detect which keyword the installed transformers accepts

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# This notebook installs transformers unpinned, so use whichever keyword the
# installed TrainingArguments accepts.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=0,  # No learning-rate warmup
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,  # Log every 50 steps
    save_strategy="epoch",  # Save a checkpoint after every epoch
    load_best_model_at_end=True,  # Reload the best checkpoint when training ends
    metric_for_best_model="accuracy",  # "Best" means highest validation accuracy
    greater_is_better=True,  # Higher accuracy is better
    **{_eval_strategy_key: "epoch"},  # Evaluate after every epoch
)

import inspect  # used to detect which keyword the installed transformers accepts


def compute_metrics(eval_pred):
    """Compute the accuracy reported at each evaluation.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the true labels).

    Returns:
        Dict with a single ``"accuracy"`` entry.
    """
    predicted = np.argmax(eval_pred.predictions, axis=1)
    return {"accuracy": accuracy_score(eval_pred.label_ids, predicted)}


# Newer transformers releases deprecate Trainer's `tokenizer` argument in
# favour of `processing_class`; pass the tokenizer under whichever name the
# installed version accepts.
_tokenizer_key = (
    'processing_class'
    if 'processing_class' in inspect.signature(Trainer).parameters
    else 'tokenizer'
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# Train the model
trainer.train()

import pickle

# Run the Trainer's evaluation on the validation set
eval_result = trainer.evaluate()

# Raw model outputs for the validation data, reduced to predicted class ids
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)

# classification_report needs string class names
target_names = list(map(str, label_encoder.classes_))

# Detailed metrics: predictions compared against the held-out labels
accuracy = accuracy_score(val_labels, preds)
conf_matrix = confusion_matrix(val_labels, preds)
class_report = classification_report(
    val_labels,
    preds,
    target_names=target_names,
)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

# Optional: persist the label encoder so it can be reused later
with open('label_encoder.pkl', 'wb') as le_file:
    le_file.write(pickle.dumps(label_encoder))



Saving gossipcop_fake.csv to gossipcop_fake (1).csv
Saving gossipcop_real.csv to gossipcop_real (1).csv
                     id                                           news_url  \
0      gossipcop-900533  https://stylecaster.com/beauty/lea-michele-hai...   
1  gossipcop-1828135129  www.cbsnews.com/news/thomas-markle-prince-harr...   
2      gossipcop-899849  https://variety.com/2018/film/news/2019-sag-aw...   
3      gossipcop-939062  https://qz.com/1289668/meghan-markles-coat-of-...   
4      gossipcop-878772  https://www.dailymail.co.uk/tvshowbiz/article-...   

                                               title  \
0  Lea Michele’s Hairstylist’s Mixes Texture Spra...   
1  Thomas Markle on Prince Harry, politics, and m...   
2  2019 SAG Award Nominations: See Full List of N...   
3  See Meghan Markle’s royal coat of arms and all...   
4  Kylie Jenner visits shaman in Life of Kylie se...   

                                           tweet_ids  label  
0  943109597073756160\t94311

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3632,0.420989,0.812707
2,0.3395,0.410719,0.834989
3,0.2631,0.376148,0.85125
4,0.1369,0.44027,0.847636


Accuracy: 0.8512496236073472
Confusion Matrix:
 [[ 496  302]
 [ 192 2331]]
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.62      0.67       798
           1       0.89      0.92      0.90      2523

    accuracy                           0.85      3321
   macro avg       0.80      0.77      0.79      3321
weighted avg       0.85      0.85      0.85      3321

