<a href="https://colab.research.google.com/github/Aditya-Walia1/Bert-Research/blob/main/D2_Optimised_Based_Uncased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets scikit-learn pandas torch optuna

import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from google.colab import files
import optuna

# Upload both the files (fake and real news)
uploaded = files.upload()

# Load the two datasets
df_fake = pd.read_csv('politifact_fake.csv')
df_real = pd.read_csv('politifact_real.csv')

# Add a 'label' column to each dataset
df_fake['label'] = 0  # Fake news
df_real['label'] = 1  # Real news

# Combine the datasets
df_combined = pd.concat([df_fake, df_real], ignore_index=True)

# Shuffle the dataset to mix fake and real news
df_combined = df_combined.sample(frac=1).reset_index(drop=True)

# Optional: Save the combined dataset to a new CSV file
df_combined.to_csv('politifact_combined.csv', index=False)

# Display the first few rows to understand the structure
print(df_combined.head())

# Inspect the column names to identify the text and label columns
print(df_combined.columns)

# Update these variables with the actual column names in your dataset
text_column = 'title'  # The correct column name for the text data
label_column = 'label'  # The correct column name for the labels

# Encode the labels as integers
label_encoder = LabelEncoder()
df_combined[label_column] = label_encoder.fit_transform(df_combined[label_column])

# Verify the encoding (optional)
print("Encoded labels:", label_encoder.classes_)

# Preprocess the dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split the dataset into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_combined[text_column].values,
    df_combined[label_column].values,
    test_size=0.2,
    random_state=42
)

# Define max token length
MAX_LEN = 128

# Create train and validation datasets
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, MAX_LEN)

# Hyperparameter tuning with Optuna
def model_init():
    return BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df_combined[label_column].unique()))

def objective(trial):
    # Define hyperparameters to tune
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=trial.suggest_int('num_train_epochs', 2, 5),
        per_device_train_batch_size=trial.suggest_categorical('per_device_train_batch_size', [8, 16, 32]),
        per_device_eval_batch_size=16,
        warmup_steps=trial.suggest_int('warmup_steps', 0, 500),
        weight_decay=trial.suggest_float('weight_decay', 0.01, 0.1, log=True),
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=trial.suggest_float('learning_rate', 5e-5, 5e-4, log=True)
    )

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer
    )

    trainer.train()

    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

# Run Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# Best hyperparameters found by Optuna
print(f"Best hyperparameters: {study.best_params}")

# Train the model with the best hyperparameters
best_params = study.best_params
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=16,
    warmup_steps=best_params['warmup_steps'],
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    learning_rate=best_params['learning_rate']
)

trainer = Trainer(
    model=model_init(),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()

# Evaluate the model
trainer.evaluate()

# Predictions for validation data
predictions = trainer.predict(val_dataset)
preds = torch.argmax(torch.tensor(predictions.predictions), axis=1)

# Convert label classes to strings (necessary for classification_report)
target_names = [str(label) for label in label_encoder.classes_]

# Calculate and print detailed classification metrics
accuracy = accuracy_score(val_labels, preds)
conf_matrix = confusion_matrix(val_labels, preds)
class_report = classification_report(val_labels, preds, target_names=target_names)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

# Optional: Save the label encoder for later use
import pickle
with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(label_encoder, le_file)


Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (

Saving politifact_real.csv to politifact_real.csv
Saving politifact_fake.csv to politifact_fake.csv
                id                                           news_url  \
0   politifact4028  http://nobelprize.org/nobel_prizes/peace/laure...   
1  politifact14893  http://info.nct.news/2017/12/30/breaking-fbi-j...   
2  politifact15604    http://www.worldfactsftw.com/nasa-will-pay-you/   
3  politifact15539  http://twentyeightmay.pw/kurt-russel-just-show...   
4  politifact15368  http://nyeveningnews.com/2018/05/09/eric-schne...   

                                               title  \
0                             All Nobel Peace Prizes   
1  BREAKING: FBI Just Raided The White House, 6 P...   
2  NASA Will Pay You $100,000 To Stay In Bed For ...   
3  Kurt Russel Just SHOWED Overpaid Celebs It’s T...   
4  Eric Schneiderman Helped NXIVM Sell Child Sex ...   

                                           tweet_ids  label  
0               52555689318563841\t52746422436630528      1  


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

[I 2024-08-24 07:00:24,096] A new study created in memory with name: no-name-ba8146fa-5922-4580-bf81-8eb862fa1f8c


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5046,0.424502
2,0.2981,0.32495
3,0.1065,0.310652
4,0.1945,0.354402


[I 2024-08-24 07:03:14,592] Trial 0 finished with value: 0.3544016182422638 and parameters: {'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_steps': 300, 'weight_decay': 0.018534861758659954, 'learning_rate': 0.00017338103186265052}. Best is trial 0 with value: 0.3544016182422638.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3796,0.313578
2,0.3984,0.338241
3,0.3164,0.582152
4,0.3821,0.571322
5,0.4675,0.462552


[I 2024-08-24 07:06:57,340] Trial 1 finished with value: 0.4625524580478668 and parameters: {'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_steps': 454, 'weight_decay': 0.020630997999442516, 'learning_rate': 0.00018068604322783749}. Best is trial 0 with value: 0.3544016182422638.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5953,0.516781
2,0.3338,0.332538
3,0.206,0.274946
4,0.2241,0.354598
5,0.0484,0.512375


[I 2024-08-24 07:10:07,815] Trial 2 finished with value: 0.512374997138977 and parameters: {'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_steps': 430, 'weight_decay': 0.04485794814303694, 'learning_rate': 9.792448401289902e-05}. Best is trial 0 with value: 0.3544016182422638.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4351,0.343156
2,0.2668,0.326033
3,0.3298,0.306946
4,0.1577,0.443291
5,0.2479,0.669764


[I 2024-08-24 07:13:34,795] Trial 3 finished with value: 0.66976398229599 and parameters: {'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_steps': 303, 'weight_decay': 0.017752818111872577, 'learning_rate': 0.0002582509573205438}. Best is trial 0 with value: 0.3544016182422638.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5376,0.345743
2,0.2122,0.271695


[I 2024-08-24 07:14:58,200] Trial 4 finished with value: 0.2716948091983795 and parameters: {'num_train_epochs': 2, 'per_device_train_batch_size': 32, 'warmup_steps': 148, 'weight_decay': 0.025529584347071255, 'learning_rate': 0.000462384693258056}. Best is trial 4 with value: 0.2716948091983795.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.8775,0.535404
2,0.6795,0.539785
3,0.7122,0.706616
4,0.6883,0.684416
5,0.7067,0.686926


[I 2024-08-24 07:18:49,653] Trial 5 finished with value: 0.6869261264801025 and parameters: {'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_steps': 81, 'weight_decay': 0.08791787840411223, 'learning_rate': 0.00020551785525472674}. Best is trial 4 with value: 0.2716948091983795.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4078,0.315746
2,0.4134,0.39761
3,0.7068,0.646245


[I 2024-08-24 07:20:51,480] Trial 6 finished with value: 0.6462451815605164 and parameters: {'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'warmup_steps': 144, 'weight_decay': 0.04861898823751357, 'learning_rate': 0.0003843299404604821}. Best is trial 4 with value: 0.2716948091983795.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6045,0.435258
2,0.2666,0.279287
3,0.3089,0.323035


[I 2024-08-24 07:22:50,768] Trial 7 finished with value: 0.3230353891849518 and parameters: {'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'warmup_steps': 214, 'weight_decay': 0.04576678169192153, 'learning_rate': 0.00033554648639202533}. Best is trial 4 with value: 0.2716948091983795.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3903,0.359413
2,0.2369,0.420959
3,0.2727,0.488317
4,0.318,0.520602


[I 2024-08-24 07:25:45,856] Trial 8 finished with value: 0.5206022262573242 and parameters: {'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_steps': 7, 'weight_decay': 0.021082094799566797, 'learning_rate': 0.00013824139621917194}. Best is trial 4 with value: 0.2716948091983795.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5584,0.489243
2,0.2764,0.277255
3,0.1624,0.395817
4,0.1309,0.452435


[I 2024-08-24 07:28:29,922] Trial 9 finished with value: 0.4524351954460144 and parameters: {'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_steps': 178, 'weight_decay': 0.01834536245356989, 'learning_rate': 5.781675898922756e-05}. Best is trial 4 with value: 0.2716948091983795.


Best hyperparameters: {'num_train_epochs': 2, 'per_device_train_batch_size': 32, 'warmup_steps': 148, 'weight_decay': 0.025529584347071255, 'learning_rate': 0.000462384693258056}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5493,0.391418
2,0.2383,0.302896


Accuracy: 0.8915094339622641
Confusion Matrix:
 [[ 74  18]
 [  5 115]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.80      0.87        92
           1       0.86      0.96      0.91       120

    accuracy                           0.89       212
   macro avg       0.90      0.88      0.89       212
weighted avg       0.90      0.89      0.89       212

