# Importing Necessary Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import torch
from transformers import DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import get_linear_schedule_with_warmup
import numpy as np
import random
import optuna

# Load the Dataset

In [2]:
# Load the dataset
data = pd.read_excel(r"C:\Users\alish\OneDrive\Documents\Alishbah\DASC5309_DATA SCIENCE CAPSTONE PROJECT\dataset\classification.xlsx")

# Display the first few rows of the dataset
data.head()

Unnamed: 0,Sentence,Class
0,Chen Kaige followed up the Unprecedented succe...,yes
1,Promoted by Wakefield Poole with an advertisin...,yes
2,"the Jean Renoir film ""La Grande Illusion"", an ...",yes
3,Sing Your Song shows not only Harry Belafonte'...,no
4,What makes Jennifer Connelly so Remarkable isn...,no


# DistilBERT

In [3]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, SubsetRandomSampler, Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define your custom dataset class
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

data = pd.DataFrame(data)

# Convert class labels to integers
label_to_id = {'yes': 1, 'no': 0}
data['Label'] = data['Class'].map(label_to_id)

# Split the dataset into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.15, random_state=42)

# Initialize tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the training and testing data
train_encodings = tokenizer(list(train_data['Sentence']), truncation=True, padding=True, return_tensors="pt")
train_labels = torch.tensor(train_data['Label'].values)

test_encodings = tokenizer(list(test_data['Sentence']), truncation=True, padding=True, return_tensors="pt")
test_labels = torch.tensor(test_data['Label'].values)

# Convert to torch datasets
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

# Define ranges for hyperparameters
learning_rates = [1e-5, 2e-5, 3e-5]
batch_sizes = [16, 32, 64]
num_epochs_options = [3, 5, 10]

# Initialize a list to store the results
results = []

for lr in learning_rates:
    for batch_size in batch_sizes:
        for num_epochs in num_epochs_options:
            # Initialize the model
            model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
            model.to(device)

            # Initialize optimizer
            optimizer = AdamW(model.parameters(), lr=lr)

            # Define KFold Cross-Validation
            kfold = KFold(n_splits=5, shuffle=True, random_state=42)

            # Initialize metrics
            fold_metrics = {
                'accuracy': [],
                'precision': [],
                'recall': [],
                'f1_score': []
            }

            for fold, (train_ids, val_ids) in enumerate(kfold.split(train_dataset)):
                # Data loaders for the current fold
                train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=SubsetRandomSampler(train_ids))
                val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=SubsetRandomSampler(val_ids))

                # Training loop
                for epoch in range(num_epochs):
                    model.train()
                    for batch in train_loader:
                        batch = {k: v.to(device) for k, v in batch.items()}
                        outputs = model(**batch)
                        loss = outputs.loss
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                # Evaluation loop
                model.eval()
                predictions = []
                real_values = []
                with torch.no_grad():
                    for batch in val_loader:
                        batch = {k: v.to(device) for k, v in batch.items()}
                        outputs = model(**batch)
                        logits = outputs.logits
                        predictions.extend(torch.argmax(logits, dim=1).tolist())
                        real_values.extend(batch['labels'].tolist())

                # Calculate metrics
                accuracy = accuracy_score(real_values, predictions)
                precision = precision_score(real_values, predictions, average='weighted')
                recall = recall_score(real_values, predictions, average='weighted')
                f1 = f1_score(real_values, predictions, average='weighted')

                fold_metrics['accuracy'].append(accuracy)
                fold_metrics['precision'].append(precision)
                fold_metrics['recall'].append(recall)
                fold_metrics['f1_score'].append(f1)

            # Calculate average metrics across folds
            avg_metrics = {metric: sum(values) / len(values) for metric, values in fold_metrics.items()}

            # Store the results
            result = {
                'learning_rate': lr,
                'batch_size': batch_size,
                'num_epochs': num_epochs,
                'avg_accuracy': avg_metrics['accuracy'],
                'avg_precision': avg_metrics['precision'],
                'avg_recall': avg_metrics['recall'],
                'avg_f1_score': avg_metrics['f1_score']
            }
            results.append(result)

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Print the results
print(results_df)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-strea

In [6]:
import torch
import numpy as np
import random
from torch.utils.data import DataLoader, SubsetRandomSampler, Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import optuna
import pandas as pd

# Set seed for reproducibility
def set_seed(seed=42):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Custom dataset class
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Define the objective function for Optuna
def objective(trial):
    # Hyperparameters to tune
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    num_epochs = trial.suggest_categorical('num_epochs', [3, 5, 10])

    # Prepare data
    label_to_id = {'yes': 1, 'no': 0}
    data['Label'] = data['Class'].map(label_to_id)
    train_data, test_data = train_test_split(data, test_size=0.15, random_state=42)

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    train_encodings = tokenizer(list(train_data['Sentence']), truncation=True, padding=True, return_tensors="pt")
    train_labels = torch.tensor(train_data['Label'].values)

    train_dataset = TextDataset(train_encodings, train_labels)

    # KFold Cross-Validation
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_list = []

    # Set the seed and device
    set_seed()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_to_id))
    model.to(device)

    for fold, (train_ids, val_ids) in enumerate(kfold.split(train_dataset)):
        # Data loaders for the current fold
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=SubsetRandomSampler(train_ids))
        val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=SubsetRandomSampler(val_ids))

        # Model and optimizer
        optimizer = AdamW(model.parameters(), lr=lr)

        # Training loop
        for epoch in range(num_epochs):
            model.train()
            for batch in train_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Evaluation
        model.eval()
        predictions = []
        real_values = []
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
                real_values.extend(batch['labels'].tolist())

        # Calculate accuracy for the current fold
        accuracy = accuracy_score(real_values, predictions)
        accuracy_list.append(accuracy)

    # Return the average accuracy over the folds
    return np.mean(accuracy_list)

# Create an Optuna study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)  # Adjust the number of trials as necessary

# Print the results
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2023-11-25 18:06:07,325] A new study created in memory with name: no-name-dfa1211c-61d9-4556-abbf-04b72d4eb879
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2023-11-25 19:18:10,728] Trial 0 finished with value: 0.5291666666666667 and parameters: {'lr': 0.0003192813554104076, 'batch_size': 16, 'num_epochs': 10}. Best is trial 0 with value: 0.5291666666666667.
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_cl

Number of finished trials: 10
Best trial: {'lr': 0.00010462147123907267, 'batch_size': 16, 'num_epochs': 10}
