In this workbook, I preprocess and clean text data, vectorize it using TF-IDF, and train a Logistic Regression model to predict which of two model responses wins (or whether they tie). Additionally, I fine-tune a BERT model for the same task. The purpose of this workbook is to compare the performance of traditional machine learning models with transformer-based models in the context of large language model (LLM) projects. I use log loss as the evaluation metric because it scores the predicted probabilities themselves, rewarding well-calibrated predictions rather than only correct class labels.

# Libraries

In [None]:
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import log_loss
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression

In [None]:
# Fetch the NLTK resources used later for tokenization, stopword removal and lemmatization
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Initialize the lemmatizer
# One shared WordNetLemmatizer instance; clean_text() uses it to lemmatize each token.
lemmatizer = WordNetLemmatizer()

# Load Training Data

In [None]:
# Read the training set from a Parquet file (pyarrow engine) and preview its first rows
train_data = pd.read_parquet('train.parquet', engine='pyarrow')
train_data.head()

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    '''Return the NLTK English stopwords as a frozenset, loaded once and then cached.'''
    return frozenset(stopwords.words('english'))


def clean_text(text):

    '''
    Cleans the input text by performing the following steps:
    1. Converts text to lowercase.
    2. Tokenizes the text into words.
    3. Removes punctuation and non-alphabetic tokens.
    4. Removes stopwords.
    5. Lemmatizes the tokens.
    6. Joins the tokens back into a single string.

    The stopword set is fetched through a cached helper, so it is read from
    the corpus only once even though this function is applied row by row.

    Parameters:
    text (str): The input text to be cleaned.

    Returns:
    str: The cleaned text (tokens separated by single spaces).
    '''

    stop_words = _english_stop_words()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text.lower())
        # Keep purely alphabetic tokens that are not stopwords
        if word.isalpha() and word not in stop_words
    ]
    return ' '.join(tokens)

In [None]:
# Run clean_text over every free-text column of the training set
for column in ('prompt', 'response_a', 'response_b'):
    train_data[column] = train_data[column].apply(clean_text)

In [None]:
# Preview the first rows after cleaning
train_data.head()

In [None]:
# Combine responses for TF-IDF vectorization
# The two cleaned responses are concatenated (A first, then B) with a single space between them.
train_data['combined_responses'] = train_data['response_a'] + " " + train_data['response_b']

In [None]:
# Preview both responses next to their concatenation
train_data[['response_a', 'response_b', 'combined_responses']].head()

In [None]:
# Sample the data
# Draw a reproducible subset of at most 10,000 rows. Capping n at the frame
# length keeps DataFrame.sample from raising when fewer rows are available.
sampled_data = train_data.sample(n=min(10000, len(train_data)), random_state=42)

In [None]:
# Display the sampled subset
sampled_data

In [None]:
# Vectorize text using TF-IDF
# A TfidfVectorizer with default settings is fitted on every sampled row and
# used to transform them (this runs before the train/validation split).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sampled_data['combined_responses'])

In [None]:
# Target variable
# Inspect the three outcome columns as an array, one row per sampled example
sampled_data[['winner_model_a', 'winner_model_b', 'winner_tie']].values

In [None]:
# Encode target variable: the outcome column holding the row maximum is
# translated into an integer class id (A -> 0, B -> 1, tie -> 2)
winner_codes = {'winner_model_a': 0, 'winner_model_b': 1, 'winner_tie': 2}
sampled_data['winner'] = sampled_data[list(winner_codes)].idxmax(axis=1).map(winner_codes)

In [None]:
# Target variable: the encoded winner column as an array
y = sampled_data['winner'].values

In [None]:
# Train/validation split (80/20, fixed seed)
# The split is made on the text rather than on a matrix vectorized beforehand,
# and TF-IDF is fitted on the training portion only. That way validation
# documents cannot influence the vocabulary or IDF weights, which would
# otherwise make the validation log loss optimistically biased.
train_docs, val_docs, y_train, y_val = train_test_split(
    sampled_data['combined_responses'], y, test_size=0.2, random_state=42
)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_docs)
X_val = vectorizer.transform(val_docs)

In [None]:
# Check the shapes of the feature matrices and label arrays for both splits
X_train.shape, X_val.shape, y_train.shape, y_val.shape

### Model 1: Logistic Regression

In [None]:
# Initialize the model
# `multi_class` is deprecated in recent scikit-learn releases. With the default
# lbfgs solver a multi-class target is already fitted as a multinomial
# (softmax) model, so dropping the argument keeps the same model while
# remaining compatible with newer versions.
model_LR = LogisticRegression(max_iter=1000)

# Train the model
model_LR.fit(X_train, y_train)

In [None]:
# Make predictions on the validation set
# predict_proba is used (not predict) because log loss is computed from class probabilities.
y_pred_LR = model_LR.predict_proba(X_val)

In [None]:
# Calculate log loss
# The model's class order is passed explicitly so the probability columns are
# matched to the right labels even if a class happens to be absent from y_val.
log_loss_score_LR = log_loss(y_val, y_pred_LR, labels=model_LR.classes_)
print(f'Log Loss: {log_loss_score_LR}')

In [None]:
# Load the tokenizer and model
# Both are created from the 'bert-base-uncased' checkpoint; num_labels=3 matches
# the three encoded outcomes (model A wins, model B wins, tie).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

In [None]:
# Tokenize the text data
def encode_data(texts):
    """
    Run the module-level BERT tokenizer over a batch of texts.

    Padding and truncation are both enabled, and the tokenizer is asked to
    return PyTorch tensors.

    Parameters:
    texts (list of str): The texts to tokenize.

    Returns:
    The tokenizer's output for the whole batch, a dict-like mapping of field
    names to tensors.
    """
    tokenizer_options = {'padding': True, 'truncation': True, 'return_tensors': 'pt'}
    return tokenizer(texts, **tokenizer_options)

# The full sample is deliberately not tokenized here: `train_encodings` is
# built from the training split in the following cell before anything reads
# it, so encoding every row first would only cost time and memory.

# Convert the winner column to a tensor of labels
labels = torch.tensor(sampled_data['winner'].values, dtype=torch.long)

In [None]:
# Hold out 20% of the sampled texts, together with their labels, for validation
all_texts = sampled_data['combined_responses']
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, labels, test_size=0.2, random_state=42
)

# Tokenize each split separately
train_text_list = train_texts.tolist()
val_text_list = val_texts.tolist()
train_encodings = encode_data(train_text_list)
val_encodings = encode_data(val_text_list)

In [None]:
# Define a custom dataset
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels, example by example."""

    def __init__(self, encodings, labels):
        """
        Keep references to the encodings and labels (nothing is copied).

        Parameters:
        encodings (dict): Mapping from field name to an indexable batch of values.
        labels (torch.Tensor): One label per encoded example.
        """
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """
        Returns the number of examples, taken from the number of labels.
        """
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Assemble the example stored at the given index.

        Parameters:
        idx (int): Position of the example to fetch.

        Returns:
        dict: Every encoding field sliced at idx, plus the label under 'labels'.
        """
        example = {}
        for field, values in self.encodings.items():
            example[field] = values[idx]
        example['labels'] = self.labels[idx]
        return example

In [None]:
# Create a CustomDataset object for the training and validation sets,
# each pairing a split's encodings with that split's labels
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

In [None]:
# Define training arguments
# Settings passed to the Trainer: 3 epochs, a per-device batch size of 8 for
# both training and evaluation, 500 warmup steps, weight decay of 0.01,
# logging every 10 steps under ./logs, and ./results as the output directory.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

In [None]:
# Initialize the Trainer
# Ties together the BERT classifier, the training arguments, and the
# training/validation datasets built from the 80/20 split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Move model to GPU if available
# `device` is CUDA when torch reports it as available, otherwise CPU; the same
# `device` is reused later when the test encodings are moved before inference.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

In [None]:
# Train the model: fine-tune BERT on train_dataset with the arguments defined above
trainer.train()

In [None]:
# Evaluate the model on the Trainer's eval_dataset (val_dataset);
# the returned metrics are shown as the cell output
trainer.evaluate()

### Test Data

In [None]:
# Load the test data from Parquet (the text cleaning happens in the following cell)
test_data = pd.read_parquet('test.parquet', engine='pyarrow')

In [None]:
# Apply the same cleaning to the test text columns, then join the two responses
for column in ('prompt', 'response_a', 'response_b'):
    test_data[column] = test_data[column].apply(clean_text)
test_data['combined_responses'] = test_data['response_a'] + " " + test_data['response_b']

In [None]:
# Vectorize test data
# Only transform() is called, so the already-fitted vectorizer is reused rather than refitted.
X_test = vectorizer.transform(test_data['combined_responses'])

In [None]:
# Make predictions on the test data using Logistic Regression model
# Class probabilities are computed for every test row and printed.
y_test_pred_LR = model_LR.predict_proba(X_test)
print(f'Test Predictions (Logistic Regression): {y_test_pred_LR}')

In [None]:
# Tokenize the test data with the same encode_data helper used for the training texts
test_encodings = encode_data(test_data['combined_responses'].tolist())

In [None]:
# Move test encodings to device
# Rebuilds the encodings as a plain dict whose values have each been moved to `device`.
test_encodings = {key: val.to(device) for key, val in test_encodings.items()}

In [None]:
# Make predictions on the test data using BERT model
# Inference runs in mini-batches: sending the whole test set through BERT in a
# single forward pass can exhaust memory once the set grows beyond a few rows.
inference_batch_size = 32
num_test_rows = len(test_encodings['input_ids'])
probability_chunks = []
model.eval()
with torch.no_grad():
    for start in range(0, num_test_rows, inference_batch_size):
        stop = start + inference_batch_size
        # Slice every encoding field to the same window of rows
        batch = {key: val[start:stop] for key, val in test_encodings.items()}
        outputs = model(**batch)
        # Convert logits to probabilities and move them off the device right away
        probability_chunks.append(torch.nn.functional.softmax(outputs.logits, dim=-1).cpu())
# Stack the per-batch probabilities back into one (rows x classes) array
predictions = torch.cat(probability_chunks).numpy()

In [None]:
# Prepare the submission file
# Column order follows the class encoding used for training
# (0 = winner_model_a, 1 = winner_model_b, 2 = winner_tie).
submission = pd.DataFrame(predictions, columns=['winner_model_a', 'winner_model_b', 'winner_tie'])
# Insert the ids by position (raw values) rather than as a Series: a Series is
# aligned on index labels and would produce NaN ids whenever test_data's index
# is not the default 0..n-1 range.
submission.insert(0, 'id', test_data['id'].values)

In [None]:
# Save the submission file as CSV (without the DataFrame index) and confirm on stdout
submission.to_csv('submission.csv', index=False)
print('Submission file created!')