<a href="https://colab.research.google.com/github/Aditya-Walia1/Bert-Research/blob/main/Optimised_Albert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The key optimizations made include:

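1. **Stratified Train/Validation Split (via Stratified K-Fold)**: Ensures balanced class distribution in training and validation splits. Note that only the first of the five stratified folds is used, so this is a single stratified 80/20 split rather than full cross-validation.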
2. **Increased Epochs**: Extends training time for better model fine-tuning.
3. **Evaluation Strategy**: Added F1 score tracking and configured the model to save and load the best version based on validation performance.
4. **Advanced Metric Calculation**: Focused on weighted F1 score, which is crucial for imbalanced datasets, alongside accuracy.

In [1]:
# Install required libraries
!pip install transformers datasets scikit-learn

import pandas as pd
from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import numpy as np

# Read the training CSV into a DataFrame
file_path = '/content/Constraint_Train.csv'
df = pd.read_csv(file_path)

# Preview the leading rows as a quick sanity check on the file's structure
print(df.head())

# Column names used by the rest of the script; change these if the CSV
# uses different headers for the text and the label
text_column = 'tweet'
label_column = 'label'

# Fit the encoder on the raw labels and overwrite the column with the
# resulting integer codes
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df[label_column])
df[label_column] = encoded_labels

# Show the original label values the encoder learned (optional check)
print("Encoded labels:", label_encoder.classes_)

# Preprocess the dataset
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Iterable of raw texts (list, NumPy array, pandas Series, ...).
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Object exposing an ``encode_plus`` method that accepts the
            keyword arguments used in ``__getitem__``.
        max_len: Length passed to the tokenizer as ``max_length``; combined
            with ``padding='max_length'`` and ``truncation=True``.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise as plain lists so that __getitem__ always indexes by
        # position. A pandas Series with a non-default index would otherwise
        # be looked up by label and fail or return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer
            output flattened to 1-D) and ``labels`` (a ``torch.long`` scalar).
        """
        # str() guards against non-string cells such as NaN or numbers
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            # return_tensors='pt' adds a leading batch dimension; flatten it
            # away so each item is a 1-D tensor
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# ALBERT tokenizer for the same checkpoint that is fine-tuned below
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Draw a single stratified train/validation split: StratifiedKFold builds
# five shuffled folds and only the first (train, validation) index pair is used
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_texts = df[text_column].values
all_labels = df[label_column].values
train_index, val_index = next(skf.split(all_texts, all_labels))

train_texts, train_labels = all_texts[train_index], all_labels[train_index]
val_texts, val_labels = all_texts[val_index], all_labels[val_index]

# Token length handed to CustomDataset for every example
MAX_LEN = 128

# Wrap each split in a CustomDataset
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, MAX_LEN)

# Pre-trained ALBERT with a classification head sized to the number of
# distinct label values
num_labels = len(df[label_column].unique())
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=num_labels)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,  # Increased epochs for better fine-tuning
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # NOTE(review): newer transformers releases appear to rename this
    # argument to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Checkpoint every epoch, matching the evaluation schedule
    load_best_model_at_end=True,  # Reload the best checkpoint once training finishes
    metric_for_best_model="eval_f1",  # "Best" is judged by validation F1, not loss
    greater_is_better=True,  # A higher F1 is better
)


def compute_metrics(p):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        p: Prediction object exposing ``predictions`` (per-class scores,
            arg-maxed over axis 1) and ``label_ids`` (true class ids).

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'``.
    """
    # Arg-max once and reuse it for both metrics
    predicted = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(p.label_ids, predicted),
        'f1': f1_score(p.label_ids, predicted, average='weighted'),
    }


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate on the validation set and show the metrics the Trainer reports
# (previously computed but never displayed)
evaluation_results = trainer.evaluate()
print("Evaluation results:", evaluation_results)

# Predictions for validation data: arg-max over the class dimension turns
# the per-class scores into predicted class ids
predictions = trainer.predict(val_dataset)
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1)

# Calculate and print detailed classification metrics
accuracy = accuracy_score(val_labels, preds)
f1 = f1_score(val_labels, preds, average='weighted')
conf_matrix = confusion_matrix(val_labels, preds)
# target_names restores the original label strings in the report
class_report = classification_report(val_labels, preds, target_names=label_encoder.classes_)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1681,0.243879,0.926791,0.92676
2,0.229,0.166424,0.943925,0.943947
3,0.2296,0.154219,0.964174,0.964152
4,0.0468,0.211355,0.961838,0.961827
5,0.0003,0.207836,0.965732,0.965724


Accuracy: 0.9657320872274143
F1 Score: 0.965723834418646
Confusion Matrix:
 [[587  25]
 [ 19 653]]
Classification Report:
               precision    recall  f1-score   support

        fake       0.97      0.96      0.96       612
        real       0.96      0.97      0.97       672

    accuracy                           0.97      1284
   macro avg       0.97      0.97      0.97      1284
weighted avg       0.97      0.97      0.97      1284

