<a href="https://colab.research.google.com/github/Aditya-Walia1/Bert-Research/blob/main/D4_Bertweet_Optimised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna


In [None]:
# Install necessary libraries
!pip install transformers datasets scikit-learn pandas torch emoji==0.6.0 optuna

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import optuna
import pickle

# Load the two source datasets; 'Fake.csv' and 'True.csv' are expected in the
# current working directory.
df_fake = pd.read_csv('Fake.csv')
df_real = pd.read_csv('True.csv')

# Tag every row with its class before merging the frames.
df_fake['label'] = 0  # Fake news
df_real['label'] = 1  # Real news

# Combine the datasets
df_combined = pd.concat([df_fake, df_real], ignore_index=True)

# Shuffle the rows with a fixed seed. An unseeded shuffle changes the row
# order on every run, which makes the saved CSV and the later seeded
# train_test_split irreproducible.
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the combined dataset to a new CSV file (optional)
df_combined.to_csv('combined_news.csv', index=False)

# Display the first few rows to understand the structure
print(df_combined.head())

# Inspect the column names to identify the text and label columns
print(df_combined.columns)

# Columns used as model input and as classification target.
text_column = 'text'  # Assuming the text column is named 'text'
label_column = 'label'  # Column created above holding the 0/1 class

# Encode the labels as integers. The fitted encoder is also reused later in
# the file for the classification-report class names and is pickled.
label_encoder = LabelEncoder()
df_combined[label_column] = label_encoder.fit_transform(df_combined[label_column])

# Verify the encoding (optional)
print("Encoded labels:", label_encoder.classes_)

# Preprocess the dataset
class CustomDataset(Dataset):
    """Dataset that tokenizes a single text on every item lookup.

    Stores parallel, integer-indexable collections of texts and labels
    together with the tokenizer and the fixed sequence length used to
    encode each text.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded sample at ``idx`` as a dict of tensors.

        The dict holds the flattened ``input_ids`` and ``attention_mask``
        produced by the tokenizer and the label as a ``torch.long`` tensor
        under ``labels``.
        """
        raw_text = str(self.texts[idx])  # str() copes with non-string cells
        target = self.labels[idx]

        # Pad/truncate every text to exactly ``max_len`` tokens.
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_kwargs)

        # 'pt' tensors carry a leading batch dimension; flatten removes it.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Fixed token length every sample is padded/truncated to.
MAX_LEN = 128

# BERTweet checkpoint: slow tokenizer plus a sequence-classification model
# whose label count equals the number of distinct labels in the data.
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base', use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(
    'vinai/bertweet-base',
    num_labels=len(df_combined[label_column].unique()),
)

# Hold out 20% of the rows for validation with a fixed split seed.
features = df_combined[text_column].values
targets = df_combined[label_column].values
train_texts, val_texts, train_labels, val_labels = train_test_split(
    features, targets, test_size=0.2, random_state=42
)

# Wrap both partitions so samples are tokenized when they are indexed.
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, MAX_LEN)

# Define the objective function for Optuna
def objective(trial):
    """Fine-tune a fresh BERTweet classifier with sampled hyperparameters.

    Args:
        trial: Optuna trial used to sample the learning rate, the number of
            training epochs and the per-device training batch size.

    Returns:
        Accuracy of the trained model on the validation set.
    """
    # Define the hyperparameters to tune
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 5)
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16, 32])

    # Start every trial from the pretrained checkpoint. Passing the shared
    # module-level `model` to each Trainer would carry the weights fine-tuned
    # in earlier trials into later ones, so trials would not be comparable.
    trial_model = AutoModelForSequenceClassification.from_pretrained(
        'vinai/bertweet-base',
        num_labels=len(label_encoder.classes_),
    )

    # Update the TrainingArguments with Optuna's suggestions
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        # Trial checkpoints are never reloaded, so do not write them to disk.
        save_strategy="no",
        learning_rate=learning_rate
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=trial_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer
    )

    # Train the model; validation loss is already reported once per epoch.
    trainer.train()

    # Use accuracy as the metric for optimization: argmax over the class
    # dimension of the predicted logits gives the predicted label ids.
    preds = trainer.predict(val_dataset).predictions.argmax(-1)
    accuracy = accuracy_score(val_labels, preds)

    return accuracy

# Run the Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)  # You can increase n_trials for a more thorough search

# Print the best hyperparameters
best_params = study.best_trial.params
print("Best hyperparameters: ", best_params)

# Reload the pretrained checkpoint before the final run. The `model` object
# created earlier may already have been fine-tuned during the search, so
# training it again would not reflect the best hyperparameters alone.
model = AutoModelForSequenceClassification.from_pretrained(
    'vinai/bertweet-base',
    num_labels=len(label_encoder.classes_),
)

# Train the final model with the best hyperparameters
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    learning_rate=best_params['learning_rate']
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the final model
trainer.train()

# Evaluate the final model and show the resulting metrics
eval_result = trainer.evaluate()
print("Evaluation metrics:", eval_result)

# Predictions for validation data: argmax over the class dimension of the
# logits, kept as a NumPy array so it can be passed straight to scikit-learn.
predictions = trainer.predict(val_dataset)
preds = predictions.predictions.argmax(axis=-1)

# Convert label classes to strings (necessary for classification_report)
target_names = [str(label) for label in label_encoder.classes_]

# Calculate and print detailed classification metrics
accuracy = accuracy_score(val_labels, preds)
conf_matrix = confusion_matrix(val_labels, preds)
class_report = classification_report(val_labels, preds, target_names=target_names)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

# Optional: Save the label encoder for later use
with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(label_encoder, le_file)


Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2024-08-28 09:31:48,617] A new study created in memory with name: no-name-7d56a2ad-9db2-4f91-8e48-bef726a123be


Epoch,Training Loss,Validation Loss
1,0.0002,0.004142
2,0.0005,0.001831
3,0.0,8.6e-05


[I 2024-08-28 09:49:14,677] Trial 0 finished with value: 1.0 and parameters: {'learning_rate': 3.296808487005066e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 1.0.


Epoch,Training Loss,Validation Loss
1,0.0001,6.8e-05
2,0.0,0.000584


[I 2024-08-28 10:00:08,098] Trial 1 finished with value: 0.9998886414253898 and parameters: {'learning_rate': 2.2332540380095077e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 1.0.


Epoch,Training Loss,Validation Loss
1,0.0,0.001504
2,0.0,6e-06
3,0.0,7e-06


[I 2024-08-28 10:15:54,773] Trial 2 finished with value: 1.0 and parameters: {'learning_rate': 1.1293934716541742e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 1.0.


Epoch,Training Loss,Validation Loss
1,0.0001,0.004413
2,0.0001,0.004142
3,0.0,0.001225
4,0.0499,0.001297


[I 2024-08-28 10:38:23,832] Trial 3 finished with value: 0.9998886414253898 and parameters: {'learning_rate': 3.5693012747017084e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 1.0.


Epoch,Training Loss,Validation Loss
1,0.0,0.001804
2,0.0,0.001832


[I 2024-08-28 10:50:12,151] Trial 4 finished with value: 0.9998886414253898 and parameters: {'learning_rate': 1.645337205192521e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 1.0.


Epoch,Training Loss,Validation Loss
1,0.3561,0.318054
2,0.6636,0.686929
3,0.0005,0.010772
4,0.1173,0.005262


[I 2024-08-28 11:16:43,603] Trial 5 finished with value: 0.9994432071269488 and parameters: {'learning_rate': 3.904778462342574e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 1.0.


Epoch,Training Loss,Validation Loss
1,0.0002,0.005236
2,0.0003,0.005636
3,0.0002,0.006077
4,0.0284,0.005149
5,0.0001,0.005209


[I 2024-08-28 11:42:13,147] Trial 6 finished with value: 0.9994432071269488 and parameters: {'learning_rate': 1.7412580419909152e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 1.0.


Epoch,Training Loss,Validation Loss
1,0.0336,0.005524
2,0.0,0.005674


[I 2024-08-28 11:53:10,649] Trial 7 finished with value: 0.9994432071269488 and parameters: {'learning_rate': 1.3209601002400526e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 1.0.


Epoch,Training Loss,Validation Loss
1,0.0,0.006835
2,0.0,0.005069


[I 2024-08-28 12:07:08,082] Trial 8 finished with value: 0.999554565701559 and parameters: {'learning_rate': 1.5558682443526842e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 1.0.


Epoch,Training Loss,Validation Loss
1,0.0078,0.014946
2,0.0245,0.008839
3,0.1441,1.270069
4,0.698,0.68388
