<a href="https://colab.research.google.com/github/Aditya-Walia1/Bert-Research/blob/main/D2_Optimised_Albert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install transformers datasets scikit-learn pandas torch optuna -q

import pandas as pd
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from google.colab import files
import pickle
import optuna

# Upload the datasets
uploaded = files.upload()

# Load datasets
df_fake = pd.read_csv('politifact_fake.csv')
df_real = pd.read_csv('politifact_real.csv')

# Label the datasets
df_fake['label'] = 0  # Fake news
df_real['label'] = 1  # Real news

# Combine and shuffle datasets
df_combined = pd.concat([df_fake, df_real], ignore_index=True).sample(frac=1).reset_index(drop=True)

# Optional: Save the combined dataset to a new CSV file
df_combined.to_csv('politifact_combined.csv', index=False)

# Display first few rows and column names
print(df_combined.head())
print(df_combined.columns)

# Specify columns
text_column = 'title'
label_column = 'label'

# Encode labels
label_encoder = LabelEncoder()
df_combined[label_column] = label_encoder.fit_transform(df_combined[label_column])

# Verify encoding
print("Encoded labels:", label_encoder.classes_)

# Define dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load ALBERT tokenizer and model
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_combined[text_column].values,
    df_combined[label_column].values,
    test_size=0.2,
    random_state=42
)

# Define max token length and create datasets
MAX_LEN = 128
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, MAX_LEN)

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters to optimize
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 5)
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16, 32])
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
    warmup_steps = trial.suggest_int('warmup_steps', 0, 500)

    # Define training arguments with suggested hyperparameters
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=16,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=50,
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=1,
        report_to="none",
    )

    # Initialize Trainer with the current set of hyperparameters
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_result = trainer.evaluate()

    # Return evaluation metric to be optimized (e.g., eval_loss)
    return eval_result['eval_loss']

# Run the optimization with Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

# Get the best hyperparameters
best_trial = study.best_trial
print(f"Best trial: {best_trial.params}")

# Train the model with the best hyperparameters
best_params = best_trial.params

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=16,
    warmup_steps=best_params['warmup_steps'],
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    evaluation_strategy="epoch",
    learning_rate=best_params['learning_rate'],
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to="none",
)

# Initialize Trainer with the best hyperparameters
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Predictions for validation data
predictions = trainer.predict(val_dataset)
preds = torch.argmax(torch.tensor(predictions.predictions), axis=1)

# Calculate and print detailed classification metrics
accuracy = accuracy_score(val_labels, preds)
conf_matrix = confusion_matrix(val_labels, preds)
class_report = classification_report(val_labels, preds, target_names=[str(label) for label in label_encoder.classes_])

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

# Save label encoder
with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(label_encoder, le_file)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency reso

Saving politifact_real.csv to politifact_real.csv
Saving politifact_fake.csv to politifact_fake.csv
                id                                           news_url  \
0  politifact13949  http://www.neonnettle.com/videos/473-doctors-w...   
1   politifact3632  http://www.nytimes.com/2011/04/18/opinion/18kr...   
2   politifact1313  https://web.archive.org/web/20090913221204/htt...   
3  politifact14927  http://www.bbc.com/news/av/entertainment-arts-...   
4  politifact13815  our.news/2017/08/08/300000-pounds-of-counterfe...   

                                               title  \
0                                        Neon Nettle   
1                                 Let’s Not Be Civil   
2  Briefing by White House Press Secretary Robert...   
3        Oprah Winfrey: The Butler, racism and Obama   
4  300,000 Pounds of Counterfeit Rat Meat Has Bee...   

                                           tweet_ids  label  
0  692733096005914624\t692735522977026048\t692746...      0  


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2024-08-24 09:57:26,161] A new study created in memory with name: no-name-35962978-a28e-49b6-abcd-409501e8594c
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


Epoch,Training Loss,Validation Loss
1,0.6502,0.629038
2,0.3728,0.347856


[I 2024-08-24 09:58:11,532] Trial 0 finished with value: 0.34785589575767517 and parameters: {'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'learning_rate': 1.48000883698699e-05, 'warmup_steps': 268}. Best is trial 0 with value: 0.34785589575767517.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


Epoch,Training Loss,Validation Loss
1,0.2536,0.345412
2,0.2088,0.341106
3,0.1947,0.359329
4,0.1316,0.438651


[I 2024-08-24 09:59:37,425] Trial 1 finished with value: 0.34110620617866516 and parameters: {'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'learning_rate': 1.4560295632349633e-05, 'warmup_steps': 408}. Best is trial 1 with value: 0.34110620617866516.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


Epoch,Training Loss,Validation Loss
1,0.1747,0.370991
2,0.1482,0.437773


[I 2024-08-24 10:00:22,909] Trial 2 finished with value: 0.37099146842956543 and parameters: {'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'learning_rate': 3.8344350830890425e-05, 'warmup_steps': 406}. Best is trial 1 with value: 0.34110620617866516.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


Epoch,Training Loss,Validation Loss
1,No log,0.369447
2,0.119800,0.402577
3,0.119800,0.452026
4,0.057000,0.456083


[I 2024-08-24 10:01:47,525] Trial 3 finished with value: 0.3694468140602112 and parameters: {'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'learning_rate': 1.2383085880904117e-05, 'warmup_steps': 27}. Best is trial 1 with value: 0.34110620617866516.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


Epoch,Training Loss,Validation Loss
1,0.1015,0.488322
2,0.1012,0.54242
3,0.112,0.547048
4,0.0899,0.690972
5,0.0995,0.680411


[I 2024-08-24 10:03:37,273] Trial 4 finished with value: 0.4883219301700592 and parameters: {'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'learning_rate': 1.8794538344601482e-05, 'warmup_steps': 190}. Best is trial 1 with value: 0.34110620617866516.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


Epoch,Training Loss,Validation Loss
1,0.0895,0.600027
2,0.2136,0.396256


[I 2024-08-24 10:04:22,602] Trial 5 finished with value: 0.3962557911872864 and parameters: {'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'learning_rate': 2.7471994650436337e-05, 'warmup_steps': 98}. Best is trial 1 with value: 0.34110620617866516.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


Epoch,Training Loss,Validation Loss
1,No log,0.393827
2,0.131000,0.434046
3,0.131000,0.472183
4,0.080000,0.530004
5,0.080000,0.599855


[I 2024-08-24 10:06:08,012] Trial 6 finished with value: 0.3938266932964325 and parameters: {'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'learning_rate': 1.1339198106291918e-05, 'warmup_steps': 472}. Best is trial 1 with value: 0.34110620617866516.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


Epoch,Training Loss,Validation Loss
1,No log,0.482673
2,0.104200,0.643286
3,0.104200,0.624373
4,0.068300,0.626259


[I 2024-08-24 10:07:32,985] Trial 7 finished with value: 0.4826725721359253 and parameters: {'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'learning_rate': 1.6063838955514447e-05, 'warmup_steps': 91}. Best is trial 1 with value: 0.34110620617866516.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


Epoch,Training Loss,Validation Loss
1,0.0838,0.831246
2,0.0495,0.795893
3,0.1136,0.783611
4,0.0094,0.892995
5,0.0006,0.863195


[I 2024-08-24 10:09:26,277] Trial 8 finished with value: 0.7836108207702637 and parameters: {'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'learning_rate': 1.0387793858500975e-05, 'warmup_steps': 189}. Best is trial 1 with value: 0.34110620617866516.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


Epoch,Training Loss,Validation Loss
1,0.0128,0.85854
2,0.0088,0.810739


[I 2024-08-24 10:10:11,588] Trial 9 finished with value: 0.8107392191886902 and parameters: {'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'learning_rate': 1.9041025959793654e-05, 'warmup_steps': 468}. Best is trial 1 with value: 0.34110620617866516.


Best trial: {'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'learning_rate': 1.4560295632349633e-05, 'warmup_steps': 408}


Epoch,Training Loss,Validation Loss
1,0.0097,0.851128
2,0.0069,0.872972
3,0.0187,0.825317
4,0.0079,1.106452


Accuracy: 0.8867924528301887
Confusion Matrix:
 [[ 76  10]
 [ 14 112]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86        86
           1       0.92      0.89      0.90       126

    accuracy                           0.89       212
   macro avg       0.88      0.89      0.88       212
weighted avg       0.89      0.89      0.89       212

