In [1]:
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


Loading the datasets:

In [2]:
#Lodaing the training data:
import pandas as pd

# Replace with the actual path to your dataset in Google Drive
file_path = '/content/drive/MyDrive/sentiment-train.csv'
df_train = pd.read_csv(file_path)

# Inspect the dataset
print(df_train.head())
print(df_train.info())


   gold_label                                               text      target
0           1  dear @Microsoft the newOoffice for Mac is grea...  @microsoft
1           0  @Microsoft how about you make a system that do...  @microsoft
2           1  I may be ignorant on this issue but... should ...  @microsoft
3           1  Thanks to @user I just may be switching over t...  @microsoft
4           2  If I make a game as a #windows10 Universal App...  @microsoft
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26632 entries, 0 to 26631
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   gold_label  26632 non-null  int64 
 1   text        26632 non-null  object
 2   target      26632 non-null  object
dtypes: int64(1), object(2)
memory usage: 624.3+ KB
None


In [3]:
# Validation data
# Replace with the actual path to your dataset in Google Drive
file_path = '/content/drive/MyDrive/sentiment-validation.csv'
df_val = pd.read_csv(file_path)

# Inspect the dataset
print(df_val.head())
print(df_val.info())


   gold_label                                               text  \
0           2  05 Beat it - Michael Jackson - Thriller (25th ...   
1           3  Jay Z joins Instagram with nostalgic tribute t...   
2           2  Michael Jackson: Bad 25th Anniversary Edition ...   
3           3  I liked a @YouTube video {URL} One Direction s...   
4           3  18th anniv of Princess Diana's death. I still ...   

            target  
0  michael jackson  
1  michael jackson  
2  michael jackson  
3  michael jackson  
4  michael jackson  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   gold_label  4000 non-null   int64 
 1   text        4000 non-null   object
 2   target      4000 non-null   object
dtypes: int64(1), object(2)
memory usage: 93.9+ KB
None


In [4]:
#Test data

# Replace with the actual path to your dataset in Google Drive
file_path = '/content/drive/MyDrive/sentiment-test.csv'
df_test = pd.read_csv(file_path)

# Inspect the dataset
print(df_test.head())
print(df_test.info())

   gold_label                                               text  \
0           2  #ArianaGrande Ari By Ariana Grande 80% Full {U...   
1           3  Ariana Grande KIIS FM Yours Truly CD listening...   
2           3  Ariana Grande White House Easter Egg Roll in W...   
3           3  #CD #Musics Ariana Grande Sweet Like Candy 3.4...   
4           3  SIDE TO SIDE 😘 @user #sidetoside #arianagrande...   

          target  
0  #ArianaGrande  
1  #ArianaGrande  
2  #ArianaGrande  
3  #ArianaGrande  
4  #ArianaGrande  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12379 entries, 0 to 12378
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   gold_label  12379 non-null  int64 
 1   text        12379 non-null  object
 2   target      12379 non-null  object
dtypes: int64(1), object(2)
memory usage: 290.3+ KB
None


### Preprocessing the text:

In [5]:
# Verify the label distribution
print(df_train['gold_label'].value_counts())

gold_label
2    11735
3    10984
1     2869
4      819
0      225
Name: count, dtype: int64


In [6]:
print(df_train.isnull().sum())

gold_label    0
text          0
target        0
dtype: int64


In [7]:
import re

# Function to preprocess text for BERT
def preprocess_text_for_bert(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)

    # Replace mentions with placeholder [USER]
    text = re.sub(r"@\w+", '[USER]', text)

    # Preserve hashtags, casing, and punctuation

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Apply the preprocessing function to the text column
df_train['cleaned_text'] = df_train['text'].apply(preprocess_text_for_bert)
df_val['cleaned_text'] = df_val['text'].apply(preprocess_text_for_bert)
df_test['cleaned_text'] = df_test['text'].apply(preprocess_text_for_bert)


# Display a few rows
df_train[['text', 'cleaned_text']].head()


Unnamed: 0,text,cleaned_text
0,dear @Microsoft the newOoffice for Mac is grea...,dear [USER] the newOoffice for Mac is great an...
1,@Microsoft how about you make a system that do...,[USER] how about you make a system that doesn'...
2,I may be ignorant on this issue but... should ...,I may be ignorant on this issue but... should ...
3,Thanks to @user I just may be switching over t...,Thanks to [USER] I just may be switching over ...
4,If I make a game as a #windows10 Universal App...,If I make a game as a #windows10 Universal App...


In [8]:
# Drop unnecessary columns from all datasets
df_train = df_train.drop(columns=['text', 'target'])
df_val = df_val.drop(columns=['text', 'target'])
df_test = df_test.drop(columns=['text', 'target'])

# Verify the preprocessed datasets
print(df_train.head())

   gold_label                                       cleaned_text
0           1  dear [USER] the newOoffice for Mac is great an...
1           0  [USER] how about you make a system that doesn'...
2           1  I may be ignorant on this issue but... should ...
3           1  Thanks to [USER] I just may be switching over ...
4           2  If I make a game as a #windows10 Universal App...


In [9]:
print(df_train.columns)
print(df_val.columns)
print(df_test.columns)

Index(['gold_label', 'cleaned_text'], dtype='object')
Index(['gold_label', 'cleaned_text'], dtype='object')
Index(['gold_label', 'cleaned_text'], dtype='object')


### Tokenization:

In [10]:
from transformers import DistilBertTokenizer, DataCollatorWithPadding

# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenization function (without padding)
def tokenize_function(text):
    return tokenizer(
        text,
        truncation=True,       # Truncate sequences longer than max_length
        max_length=128,        # Optional max_length cap
    )

# Apply tokenization
tokenized_data = df_train['cleaned_text'].apply(tokenize_function)
tokenized_val = df_val['cleaned_text'].apply(tokenize_function)
tokenized_test = df_test['cleaned_text'].apply(tokenize_function)

# View the first tokenized result (no padding applied yet)
print(tokenized_data.iloc[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

{'input_ids': [101, 6203, 1031, 5310, 1033, 1996, 2047, 21511, 8873, 3401, 2005, 6097, 2003, 2307, 1998, 2035, 1010, 2021, 2053, 1048, 6038, 2278, 10651, 1029, 1039, 1005, 12256, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


### Initialising the model:

In [11]:
from transformers import DistilBertForSequenceClassification

# Load the DistilBERT model with a classification head
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",  # Pre-trained DistilBERT
    num_labels=5               # Number of sentiment labels (0-4)
)

# Check model architecture
print(model)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


Weighted loss function and optimizer:

In [12]:
from sklearn.utils.class_weight import compute_class_weight
import torch
import numpy as np
from torch.optim import AdamW  # Import AdamW from PyTorch


# Calculate class weights based on the training data
class_weights = compute_class_weight(
    class_weight="balanced",  # Balances based on class frequency
    classes=np.array([0, 1, 2, 3, 4]),  # Sentiment classes
    y= df_train['gold_label']  # Replace with your training labels (actual numbers)
)

# Convert weights to PyTorch tensor
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Define weighted loss function
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# Example: Print class weights to confirm
print(f"Class Weights: {class_weights}")


# Replace Hugging Face's AdamW with PyTorch's AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

Class Weights: tensor([23.6729,  1.8565,  0.4539,  0.4849,  6.5035])


Specifying the settings of the model : (Define training arguements)

In [13]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save model checkpoints
    evaluation_strategy="epoch",    # Evaluate after each epoch
    save_strategy="epoch",          # Save model after each epoch
    logging_dir="./logs",           # Directory for training logs
    learning_rate=5e-5,             # Learning rate
    per_device_train_batch_size=16, # Batch size per device
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=3,             # Number of training epochs
    weight_decay=0.01,              # Weight decay for regularization
    logging_steps=10,               # Log training metrics every 10 steps
    save_total_limit=2,             # Keep only 2 latest checkpoints
    load_best_model_at_end=True,    # Automatically load the best model
    metric_for_best_model="accuracy", # Metric to determine the best model
    report_to="none"                # Avoid reporting to any third-party service
)

# Print the configuration to confirm
print(training_args)


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_



### Preparing the training dataset to train using the trainer function:

In [14]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [15]:
from datasets import Dataset

def create_dataset(tokenized_data, labels):
  input_ids = [x['input_ids'] for x in tokenized_data]
  attention_mask = [x['attention_mask'] for x in tokenized_data]
  return Dataset.from_dict({
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    })

# Use this function for all datasets
train_data = create_dataset(tokenized_data, df_train['gold_label'].tolist())
val_data = create_dataset(tokenized_val, df_val['gold_label'].tolist())
test_data = create_dataset(tokenized_test, df_test['gold_label'].tolist())

## Training:

In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer
from transformers import DataCollatorWithPadding
# Create a data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define compute_metrics function
def compute_metrics(pred):
    predictions, labels = pred
    predictions = predictions.argmax(axis=1)  # Get the class with highest probability
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Update the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    #tokenizer=tokenizer,
    data_collator=data_collator,  # Enables dynamic padding
    compute_metrics=compute_metrics  # Add the metrics function here
)

# Start training
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6801,1.035549,0.557,0.539365,0.557,0.534405
2,0.6277,1.190973,0.546,0.550783,0.546,0.534998
3,0.3457,1.521673,0.53625,0.541935,0.53625,0.526922


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=4995, training_loss=0.6012527651018328, metrics={'train_runtime': 430.8347, 'train_samples_per_second': 185.445, 'train_steps_per_second': 11.594, 'total_flos': 880727639781600.0, 'train_loss': 0.6012527651018328, 'epoch': 3.0})

In [17]:
  # Evaluate the model on the test dataset
val_results = trainer.evaluate(val_data)

# Print the Vailidation results
print("Validation Set Results:")
for key, value in val_results.items():
    print(f"{key}: {value}")


Validation Set Results:
eval_loss: 1.0355490446090698
eval_accuracy: 0.557
eval_precision: 0.5393649223948135
eval_recall: 0.557
eval_f1: 0.5344050808322578
eval_runtime: 5.0265
eval_samples_per_second: 795.782
eval_steps_per_second: 49.736
epoch: 3.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
  # Evaluate the model on the test dataset
test_results = trainer.evaluate(test_data)

# Print the test results
print("Test Set Results:")
for key, value in test_results.items():
    print(f"{key}: {value}")


Test Set Results:
eval_loss: 0.8832032084465027
eval_accuracy: 0.5931012198077389
eval_precision: 0.5916389531740106
eval_recall: 0.5931012198077389
eval_f1: 0.5867535011946092
eval_runtime: 15.8597
eval_samples_per_second: 780.531
eval_steps_per_second: 48.803
epoch: 3.0


Saving the model:

In [19]:
# Save the model and tokenizer
model.save_pretrained("./fine_tuned_distilbert")
tokenizer.save_pretrained("./fine_tuned_distilbert")

print("Model and tokenizer saved successfully!")

from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

'''# Load the model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_distilbert")
tokenizer = DistilBertTokenizer.from_pretrained("./fine_tuned_distilbert")

print("Model and tokenizer loaded successfully!") '''


Model and tokenizer saved successfully!


'# Load the model and tokenizer\nmodel = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_distilbert")\ntokenizer = DistilBertTokenizer.from_pretrained("./fine_tuned_distilbert")\n\nprint("Model and tokenizer loaded successfully!") '