In [17]:
# Import necessary libraries
import numpy as np
import time
import datetime
import gc
import random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup
from transformers import RobertaTokenizer
from transformers import RobertaForSequenceClassification

In [2]:
# Mount Google Drive to access data stored in it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df = pd.read_csv('/content/drive/My Drive/Hackthon/train.csv').dropna()

In [6]:
# Display the first few rows of the dataset to understand its structure
df.head()

Unnamed: 0,id,Date,review ID,reviewer ID,product ID,rating_Helpful,rating_Thanks,rating_LoveThis,rating_OhNo,reviews,Label
0,0,5/17/2009,0dFa6egshOwhusL8aSMw-Q,8GC6cFcby0stKarnzL9i2w,dKcO9OQ44RPRlkWe-vToFA,0,0,0,4,Just got back from Shaw's. Great oysters. They...,Y
1,1,10/25/2011,htQgJ_Z0ADA_QHeKthfeFw,88KSdQ5IMdpCkOidmq1udg,NkOir65b_YAAQVlJR_zmJA,0,0,0,2,Back from friday & saturday nite stays in King...,Y
2,2,8/23/2009,2RsvT8p0SuAC25bhAi3EIw,bMKlvA-zWF4jU3OJCVbVlA,cQnY_VneZisfUAqcbuEuKg,0,0,0,5,It is a beautiful Saturday afternoon and my wi...,N
3,3,10/28/2011,LM-zONQMUNnAuf6NBISrow,9DMoXd0afrTIdpcwcDDVsw,WBU0yq9J8qiYQfI_fh2P1Q,0,1,1,5,A friend told me about this place but I have t...,N
4,4,6/18/2010,-DoQeDcNYFdmhOYcgx2MjQ,PyUn2FeMuLdmyB6xxMe4NA,-pO0hsi0xlF4FwqLGJUizg,0,2,0,5,I went to Uncommon Ground for brunch on a Sund...,N


In [7]:
# Mapping 'N' to 0 and 'Y' to 1
df['Label'] = df['Label'].map({'N': 0, 'Y': 1})

In [8]:
# Ensure that the 'reviews' column is of string type
# This is important for text processing and model training
df['reviews'] = df['reviews'].astype(str)

In [9]:
# Check if CUDA (GPU) is available; if not, use the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [10]:
# Extract the reviews and labels from the dataframe
# The reviews will be used as input to the model, and the labels will be used as targets
tweets = df["reviews"].values
labels = df["Label"].values

In [11]:
# Load the RoBERTa tokenizer
# We can change the tokenizer as needed, for example, to BERT or DistilBERT
# For BERT: BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# For DistilBERT: DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [14]:
input_ids = []
attention_masks = []


for tweet in tweets:

    encoded_dict = tokenizer.encode_plus(
                        tweet,
                        add_special_tokens = True,
                        max_length = 128,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )


    input_ids.append(encoded_dict['input_ids'])


    attention_masks.append(encoded_dict['attention_mask'])


input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)


print('Original: ', tweets[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  Just got back from Shaw's. Great oysters. They have like 12 varieties to choose from. We sat at the bar in back and enjoyed 1/2 off oysters between 4 - 6 PM. Great deal.

Token IDs: tensor([    0,  6785,   300,   124,    31,  8390,    18,     4,  2860, 23136,
         9230,     4,   252,    33,   101,   316, 17576,     7,  2807,    31,
            4,   166,  4005,    23,     5,  2003,    11,   124,     8,  3776,
          112,    73,   176,   160, 23136,  9230,   227,   204,   111,   231,
         2784,     4,  2860,   432,     4, 50118,     2,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1

In [15]:
# Create a TensorDataset combining input IDs, attention masks, and labels
# This dataset will be used for training and evaluation
dataset = TensorDataset(input_ids, attention_masks, labels)

# Define the size for the training and validation sets
# In this case, we use 90% of the dataset for training and the remaining for validation
train_size = int(0.9 * len(dataset))  # 90% of the dataset for training
val_size = len(dataset) - train_size    # Remaining data for validation

# Split the dataset into training and validation sets
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Print the number of samples in the training and validation sets
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

# Note: For the final submission, we will use the entire dataset for training
# The validation split is only for model evaluation during development

42,458 training samples
4,718 validation samples


In [16]:
# Define the batch size for training and evaluation
batch_size = 64

# Create the DataLoader for the training dataset
# Uses a RandomSampler to shuffle the data and batches the data into the specified batch size
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),  # Randomly shuffle the training data
    batch_size=batch_size                   # Number of samples per batch
)

# Create the DataLoader for the validation dataset
# Uses a SequentialSampler to pull out batches sequentially (no shuffling)
# This is useful for evaluation to ensure consistency
validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset), # Sequentially sample the validation data
    batch_size=batch_size                   # Number of samples per batch
)

In [18]:
# Load the pre-trained RoBERTa model for sequence classification
# You can replace 'roberta-base' with other models such as 'bert-base-uncased', 'distilbert-base-uncased', or any BERT variant
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",   # Replace with other model names as needed
    num_labels=2,     # Number of labels (binary classification in this case)
    output_attentions=False,  # Whether to output attentions
    output_hidden_states=False, # Whether to output hidden states
)

# Move the model to the specified device (CUDA or CPU)
# Uncomment the following lines if you need to manually move the model to GPU
# if device == "cuda:0":
#     model = model.cuda()

model = model.to(device)  # Move the model to the specified device

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Create the AdamW optimizer for the model
# AdamW is a variant of the Adam optimizer with weight decay
optimizer = AdamW(
    model.parameters(),  # Parameters of the model to optimize
    lr=5e-5,             # Learning rate for the optimizer
    eps=1e-8             # Epsilon value for numerical stability (prevents division by zero)
)



In [20]:
# Define the number of training epochs
# 2 epochs is often sufficient for fine-tuning large models like RoBERTa
# due to their pre-trained knowledge and large capacity
epochs = 2

# Calculate the total number of training steps
# Total steps = number of batches per epoch x number of epochs
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler with a linear warmup
# The scheduler adjusts the learning rate according to the number of training steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,                   # Optimizer to use with the scheduler
    num_warmup_steps=0,         # Number of warmup steps (default value used here)
    num_training_steps=total_steps  # Total number of training steps
)

In [21]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [22]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [23]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        output = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        loss = output.loss
        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()

    model.eval()
    # Tracking variables
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            output= model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()
        # Move logits and labels to CPU if we are using GPU
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    #print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 0.32
  Training epcoh took: 0:14:40

Running Validation...
  Accuracy: 0.90

Training...

  Average training loss: 0.27
  Training epcoh took: 0:14:41

Running Validation...
  Accuracy: 0.90

Training complete!
Total training took 0:30:36 (h:mm:ss)


In [24]:
# Load the trained model
# The file is named 'bert_model', but it actually contains the RoBERTa model or any other model i tested.
# Just a quirky naming choice from experimenting with different models :)
model = torch.load('bert_model')

In [25]:
# Switch the model to evaluation mode
# This tells the model that we are in inference mode, not training mode
# This is important for things like dropout and batch normalization to behave correctly during evaluation
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [26]:
# Load the test dataset from a CSV file
# This file contains the reviews we will use to make predictions
df_test = pd.read_csv('/content/drive/My Drive/Hackthon/test.csv')

In [27]:
# This ensures that all review texts are in the correct format for processing
df_test['reviews'] = df_test['reviews'].astype(str)

In [28]:
# Extract the review texts from the test dataset
# These reviews will be used for making predictions
test_tweets = df_test['reviews'].values

In [29]:
test_input_ids = []
test_attention_masks = []
for tweet in test_tweets:
    encoded_dict = tokenizer.encode_plus(
                        tweet,
                        add_special_tokens = True,
                        max_length = 128,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)



In [30]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

#Preparing Predictions and Creating Submission File

In [31]:
predictions = []

# Set the threshold for binary classification
threshold = 0.5

for batch in test_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)

    with torch.no_grad():
        output = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask)

        logits = output.logits
        logits = logits.detach().cpu().numpy()

        # Apply threshold for binary predictions
        binary_predictions = (logits[:, 1] > threshold).astype(int)

        predictions.extend(list(binary_predictions))

In [32]:
predictions[:5]

[0, 0, 0, 0, 0]

In [33]:
df_test['Label'] = predictions

In [34]:
df_test['Label'] = df_test['Label'].map({0: 'N', 1: 'Y'})

In [35]:
# List of columns to be dropped
columns_to_drop = ['Date', 'review ID', 'reviewer ID', 'product ID', 'rating_Helpful', 'rating_Thanks', 'rating_LoveThis', 'rating_OhNo', 'reviews']

# Drop the specified columns
df_test.drop(columns=columns_to_drop, inplace=True)

In [36]:
old_column_name = 'id'
new_column_name = 'ID'

# Rename the column
df_test.rename(columns={old_column_name: new_column_name}, inplace=True)

In [37]:
df_test.head()

Unnamed: 0,ID,Label
0,0,N
1,1,N
2,2,N
3,3,N
4,4,N


# Saving and Downloading the Test File

In [38]:
from google.colab import files

df_test.to_csv('sub_file.csv', index=False)
# Download the file
files.download('sub_file.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>