<a href="https://colab.research.google.com/github/Aditya-Walia1/Bert-Research/blob/main/D2_Optimised_DistilBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install transformers datasets scikit-learn pandas torch optuna

import pandas as pd
import optuna
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from google.colab import files

# Upload both the files (fake and real news)
uploaded = files.upload()

# Load the two datasets
df_fake = pd.read_csv('politifact_fake.csv')
df_real = pd.read_csv('politifact_real.csv')

# Add a 'label' column to each dataset
df_fake['label'] = 0  # Fake news
df_real['label'] = 1  # Real news

# Combine the datasets
df_combined = pd.concat([df_fake, df_real], ignore_index=True)

# Shuffle the dataset to mix fake and real news
df_combined = df_combined.sample(frac=1).reset_index(drop=True)

# Update these variables with the actual column names in your dataset
text_column = 'title'  # The correct column name for the text data
label_column = 'label'  # The correct column name for the labels

# Encode the labels as integers
label_encoder = LabelEncoder()
df_combined[label_column] = label_encoder.fit_transform(df_combined[label_column])

# Preprocess the dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Split the dataset into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_combined[text_column].values,
    df_combined[label_column].values,
    test_size=0.2,
    random_state=42
)

# Define max token length
MAX_LEN = 128

# Create train and validation datasets
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, MAX_LEN)

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 5)
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16, 32])
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)

    # Load pre-trained DistilBERT model for sequence classification
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(df_combined[label_column].unique()))

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        learning_rate=learning_rate,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_result = trainer.evaluate()
    return eval_result['eval_loss']  # Or any other metric like 'eval_accuracy'

# Create a study object and optimize the objective function
study = optuna.create_study(direction='minimize')  # or 'maximize' depending on the metric
study.optimize(objective, n_trials=10)

# Get the best trial
best_trial = study.best_trial

# Output the best hyperparameters
print(f"Best trial: {best_trial.values}")
print(f"Best hyperparameters: {best_trial.params}")


Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (

Saving politifact_real.csv to politifact_real.csv
Saving politifact_fake.csv to politifact_fake.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

[I 2024-08-24 10:33:08,063] A new study created in memory with name: no-name-614de31f-4e89-481f-98e2-9af7730a1635
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6713,0.668791
2,0.5915,0.58088
3,0.4286,0.419224


[I 2024-08-24 10:33:47,456] Trial 0 finished with value: 0.41922420263290405 and parameters: {'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'learning_rate': 1.9036627018217643e-05}. Best is trial 0 with value: 0.41922420263290405.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.7014,0.686648
2,0.6639,0.642829
3,0.5662,0.537549


[I 2024-08-24 10:34:22,732] Trial 1 finished with value: 0.5375490784645081 and parameters: {'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'learning_rate': 4.888697435585204e-05}. Best is trial 0 with value: 0.41922420263290405.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.7016,0.687283
2,0.666,0.645724
3,0.5751,0.547061


[I 2024-08-24 10:34:53,923] Trial 2 finished with value: 0.5470612645149231 and parameters: {'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'learning_rate': 4.647282185501742e-05}. Best is trial 0 with value: 0.41922420263290405.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6989,0.690213
2,0.6647,0.655656
3,0.583,0.569851
4,0.431,0.45704
5,0.3202,0.366996


[I 2024-08-24 10:35:48,264] Trial 3 finished with value: 0.3669959604740143 and parameters: {'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'learning_rate': 1.172087290667836e-05}. Best is trial 3 with value: 0.3669959604740143.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6776,0.67616
2,0.6065,0.593239
3,0.4219,0.402103
4,0.2531,0.360747
5,0.2,0.364857


[I 2024-08-24 10:36:44,078] Trial 4 finished with value: 0.36485710740089417 and parameters: {'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'learning_rate': 2.0759641144958267e-05}. Best is trial 4 with value: 0.36485710740089417.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6958,0.689293
2,0.6843,0.681003


[I 2024-08-24 10:37:07,168] Trial 5 finished with value: 0.6810032725334167 and parameters: {'num_train_epochs': 2, 'per_device_train_batch_size': 32, 'learning_rate': 1.0988920034798684e-05}. Best is trial 4 with value: 0.36485710740089417.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6643,0.670154
2,0.5752,0.570159
3,0.4127,0.422873


[I 2024-08-24 10:37:42,588] Trial 6 finished with value: 0.4228734076023102 and parameters: {'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'learning_rate': 2.321707798871539e-05}. Best is trial 4 with value: 0.36485710740089417.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6302,0.621832
2,0.3457,0.366029


[I 2024-08-24 10:38:14,743] Trial 7 finished with value: 0.3660292625427246 and parameters: {'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'learning_rate': 2.5563689615302023e-05}. Best is trial 4 with value: 0.36485710740089417.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6886,0.685981
2,0.6765,0.673574
3,0.6497,0.641467
4,0.6062,0.583537
5,0.5154,0.509659


[I 2024-08-24 10:39:09,062] Trial 8 finished with value: 0.5096594095230103 and parameters: {'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'learning_rate': 1.5382963335770125e-05}. Best is trial 4 with value: 0.36485710740089417.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6606,0.651997
2,0.4474,0.41506


[I 2024-08-24 10:39:36,367] Trial 9 finished with value: 0.4150596261024475 and parameters: {'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'learning_rate': 4.731849927359191e-05}. Best is trial 4 with value: 0.36485710740089417.


Best trial: [0.36485710740089417]
Best hyperparameters: {'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'learning_rate': 2.0759641144958267e-05}


In [3]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Load the best model
best_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(df_combined[label_column].unique()))

# Set up training arguments with the best hyperparameters
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2.0759641144958267e-05,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

# Initialize Trainer with the best model and the best training arguments
trainer = Trainer(
    model=best_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()

# Generate predictions on the validation set
val_preds = trainer.predict(val_dataset)
val_preds_labels = np.argmax(val_preds.predictions, axis=1)

# Calculate evaluation metrics
accuracy = accuracy_score(val_labels, val_preds_labels)
classification_rep = classification_report(val_labels, val_preds_labels, target_names=label_encoder.classes_)
conf_matrix = confusion_matrix(val_labels, val_preds_labels)

# Output results
print(f"Evaluation Loss: {eval_result['eval_loss']}")
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(conf_matrix)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.666,0.67251
2,0.5894,0.584286
3,0.435,0.439928
4,0.2813,0.367816
5,0.182,0.369681


TypeError: object of type 'numpy.int64' has no len()

In [4]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Load the best model
best_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(df_combined[label_column].unique()))

# Set up training arguments with the best hyperparameters
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2.0759641144958267e-05,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

# Initialize Trainer with the best model and the best training arguments
trainer = Trainer(
    model=best_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()

# Generate predictions on the validation set
val_preds = trainer.predict(val_dataset)
val_preds_labels = np.argmax(val_preds.predictions, axis=1)

# Convert label classes to a list of strings
target_names = [str(label) for label in label_encoder.classes_]

# Calculate evaluation metrics
accuracy = accuracy_score(val_labels, val_preds_labels)
classification_rep = classification_report(val_labels, val_preds_labels, target_names=target_names)
conf_matrix = confusion_matrix(val_labels, val_preds_labels)

# Output results
print(f"Evaluation Loss: {eval_result['eval_loss']}")
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(conf_matrix)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.7063,0.682724
2,0.5942,0.58388
3,0.4464,0.456755
4,0.3013,0.379394
5,0.186,0.356993


Evaluation Loss: 0.35699260234832764
Accuracy: 0.8301886792452831
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81        90
           1       0.89      0.80      0.84       122

    accuracy                           0.83       212
   macro avg       0.83      0.83      0.83       212
weighted avg       0.84      0.83      0.83       212

Confusion Matrix:
[[78 12]
 [24 98]]
