<a href="https://colab.research.google.com/github/CurioLytics/Emotional-Analysis/blob/main/fine-tuning_pretrained-model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch scikit-learn



In [None]:
# Install necessary libraries
!pip install transformers datasets

# Import required modules
import torch
import random
import numpy as np
from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score

# Seed every random number generator used in this notebook so that runs are
# repeatable: Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).
seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)


Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.0-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

## Load Dataset


In [None]:
# Read the prepared emotion dataset (the CSV file has a header row) and give
# its two columns the names used throughout the rest of the notebook.
import pandas as pd

column_names = ['text', 'emotion']
df = pd.read_csv('emotion_data_prep.csv')
df.columns = column_names

## Preprocess for fine-tuning


In [None]:
# Load the DistilBERT tokenizer. The import cell only brings in
# DistilBertTokenizerFast, so that is the class used here; referencing the
# un-imported DistilBertTokenizer would raise NameError on a fresh kernel.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenization helper with an up-front input type check
def tokenize_text(text):
    """Tokenize `text` with the notebook-level `tokenizer`.

    Args:
        text: A string or a list; any other type is rejected.

    Returns:
        The tokenizer output for `text`, truncated and padded to 128 tokens.

    Raises:
        TypeError: If `text` is neither a `str` nor a `list`.
    """
    # Reject unsupported input types first so the happy path stays flat.
    if not isinstance(text, (str, list)):
        raise TypeError("Input to tokenize_text must be a string, a list of strings, or a list of lists of strings.")

    return tokenizer(text, truncation=True, padding='max_length', max_length=128)

# Tokenize every usable entry of the 'text' column. Values that are neither a
# string nor a list (for example NaN coming from an empty CSV cell) become None.
tokenized_data = df['text'].apply(lambda x: tokenize_text(x) if isinstance(x, (str, list)) else None)

# Drop the rows that could not be tokenized
tokenized_data = tokenized_data.dropna()

# Expand the tokenizer outputs ('input_ids', 'attention_mask', ...) into
# columns. The original row labels are kept on purpose: without `index=...`
# the new frame would be renumbered 0..n-1, and the index-aligned concat below
# would attach encodings to the wrong text/emotion whenever a row was dropped.
tokenized_df = pd.DataFrame(tokenized_data.tolist(), index=tokenized_data.index)

# Combine with the original dataframe; rows without a tokenization keep NaN in
# the token columns.
tokenized_df = pd.concat([df, tokenized_df], axis=1)

# Inspect the resulting tokenized DataFrame
tokenized_df.head()

                                                text  emotion  \
0                                ok sound goood hehe        3   
1  egg holder make ash wood shop link egg holder ...        3   
2  buy book review help get amazon new release su...        4   
3               eeek come im soo excite see thursday        1   
4                      hate broken wrist pe next ugh        4   

                                      attention_mask  \
0  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...   
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...   
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...   

                                           input_ids  
0  [101, 7929, 2614, 27571, 7716, 2002, 5369, 102...  
1  [101, 8288, 9111, 2191, 6683, 3536, 4497, 4957...  
2  [101, 4965, 2338, 3319, 2393, 2131, 9733, 2047...  
3  [101, 25212, 5937, 2272, 10047, 17111, 4654, 1...

In [None]:
# Show which columns the tokenized DataFrame ended up with
print(tokenized_df.columns)


Index(['text', 'emotion', 'attention_mask', 'input_ids'], dtype='object')


## Fine-tuning DistilBERT model

In [None]:
# Set up the Hugging Face datasets
from datasets import Dataset

# `tokenized_df` is expected to hold the columns
# ['text', 'emotion', 'attention_mask', 'input_ids'].

# Drop rows that have no tokenizer output
filtered_df = tokenized_df.dropna(subset=['input_ids', 'attention_mask'])

# Create the Hugging Face Dataset from the filtered DataFrame
dataset = Dataset.from_pandas(filtered_df)

# There is no separate test set, so split into 80% train / 20% validation.
# Shuffle once with a fixed seed and slice the result; this yields the same
# split as shuffling twice with the same seed, without the duplicate work.
shuffled_dataset = dataset.shuffle(seed=42)
split_index = int(0.8 * len(shuffled_dataset))
train_dataset = shuffled_dataset.select(range(split_index))
val_dataset = shuffled_dataset.select(range(split_index, len(shuffled_dataset)))

# Helper for counting missing values in a dataset column
def check_for_none(dataset, column_name):
    """Count the examples whose `column_name` value is None.

    Args:
        dataset: Object exposing `filter(fn)` whose result has `num_rows`.
        column_name: Key looked up in each example.

    Returns:
        The number of examples with a None value; a message is printed when
        that number is greater than zero.
    """
    def _is_missing(example):
        return example[column_name] is None

    none_count = dataset.filter(_is_missing).num_rows
    if none_count <= 0:
        return none_count

    print(f"Found {none_count} examples with None values in column '{column_name}'")
    return none_count

# Count missing tokenizer outputs for each split, column by column
token_columns = ('input_ids', 'attention_mask')
none_count_input_ids_train, none_count_attention_mask_train = [
    check_for_none(train_dataset, column) for column in token_columns
]
none_count_input_ids_val, none_count_attention_mask_val = [
    check_for_none(val_dataset, column) for column in token_columns
]

# Show the first training example as a sanity check
first_train_example = train_dataset[0]
print(first_train_example)





# Define the training arguments
import inspect

from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old name is rejected
# by recent versions. Pick whichever name the installed version accepts so the
# cell works with an unpinned install.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',              # output directory where the model checkpoints will be saved
    num_train_epochs=3,                  # number of training epochs
    per_device_train_batch_size=8,       # batch size per device during training
    per_device_eval_batch_size=8,        # batch size for evaluation
    warmup_steps=500,                    # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                   # strength of weight decay
    logging_dir='./logs',                # directory for storing logs
    logging_steps=10,                    # log every 10 steps
    save_strategy="epoch",               # save model checkpoint every epoch
    load_best_model_at_end=True,         # load the best model when finished training
    **{eval_strategy_key: "epoch"},      # evaluate every epoch (must match save_strategy)
)


# Initialize the model and prepare the datasets for the Trainer


# Load pre-trained DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)  # Adjust num_labels as per your task


# Check the column names in the dataset
print(f"Dataset Columns: {train_dataset.column_names}")

# Rename the 'emotion' column to 'labels'. The membership check keeps the cell
# re-runnable: a second run would otherwise fail because 'emotion' is gone.
if "emotion" in train_dataset.column_names:
    train_dataset = train_dataset.rename_column("emotion", "labels")
if "emotion" in val_dataset.column_names:
    val_dataset = val_dataset.rename_column("emotion", "labels")

# Check the updated columns
print(f"Updated Dataset Columns: {train_dataset.column_names}")

# Remove columns that are not part of the model's signature. Only columns that
# actually exist are removed: '__index_level_0__' is present only when the
# pandas index was not a plain RangeIndex (i.e. rows were dropped earlier), and
# asking to remove a missing column raises an error.
unused_columns = ("text", "__index_level_0__")
train_drop = [name for name in unused_columns if name in train_dataset.column_names]
val_drop = [name for name in unused_columns if name in val_dataset.column_names]
if train_drop:
    train_dataset = train_dataset.remove_columns(train_drop)
if val_drop:
    val_dataset = val_dataset.remove_columns(val_drop)

# Check the columns again
print(f"Columns after removal: {train_dataset.column_names}")

# Collect the Trainer configuration in one place, then build the Trainer
trainer_kwargs = {
    "model": model,                    # model to fine-tune
    "args": training_args,             # training configuration
    "train_dataset": train_dataset,    # training split
    "eval_dataset": val_dataset,       # validation split
    "tokenizer": tokenizer,            # tokenizer handed to the Trainer
    # A compute_metrics function is attached later, in the evaluation section.
}
trainer = Trainer(**trainer_kwargs)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model()

# save model

Epoch,Training Loss,Validation Loss


## Evaluate model

In [None]:
# Evaluate the fine-tuned model on the validation split with the Trainer.
# At this point the Trainer was created without a compute_metrics callback.
eval_results = trainer.evaluate()

# Print evaluation results
print(eval_results)




# Computing additional metrics
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define a function to compute evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation run.

    Args:
        eval_pred: A pair `(logits, labels)`; the class axis of `logits` is
            the last one.

    Returns:
        dict: Keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Convert logits to predicted class labels

    # Calculate metrics
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 yields the same values as the default, but without an
    # undefined-metric warning whenever a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}




# Re-create the Trainer with the compute_metrics function attached, reusing the
# same model, arguments, datasets and tokenizer as before
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Add this function to compute additional metrics
)



# Run evaluation again, this time with the additional metrics
eval_results = trainer.evaluate()
print(eval_results)


README.md:   0%|          | 0.00/9.40k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/350k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

## Generate Prediction

In [None]:
# Function to predict the emotion of a text
import torch
import torch.nn.functional as F

# Mapping from numeric class ids (0-5) to human-readable emotion names.
# NOTE(review): in the sample rows printed earlier, an excited-sounding text
# carries label 1, which this map calls "Sad" - confirm the order below against
# the dataset's actual label encoding.
label_map = dict(enumerate((
    "Happy",
    "Sad",
    "Angry",
    "Excited",
    "Neutral",
    "Surprise",
)))

def predict_emotion(text):
    """Predict the emotion expressed by a single text.

    Uses the notebook-level `tokenizer`, `model` and `label_map`.

    Args:
        text: The input string to classify.

    Returns:
        tuple: `(predicted_emotion, confidence_score, class_probabilities)`
        where `predicted_emotion` is the label name (or "Unknown Emotion" if
        the class id is not in `label_map`), `confidence_score` is the
        probability of that class as a percentage, and `class_probabilities`
        maps every class name to its percentage rounded to two decimals.
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    # Move the inputs to the model's device. After training on a GPU the model
    # lives on CUDA, and feeding it CPU tensors raises a device-mismatch error.
    inputs = inputs.to(model.device)

    # Perform inference
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        logits = model(**inputs).logits  # Get model output (logits)

    # Apply softmax to get probabilities for the (single) input text
    probabilities = F.softmax(logits, dim=-1)[0].cpu()

    # Get the predicted class
    predicted_class = torch.argmax(probabilities, dim=-1).item()

    # Get confidence score of the predicted class
    confidence_score = probabilities[predicted_class].item() * 100  # Convert to percentage

    # Convert numeric prediction to human-readable label
    predicted_emotion = label_map.get(predicted_class, "Unknown Emotion")

    # Store all class probabilities in a dictionary. Iterating over the model's
    # actual outputs (instead of len(label_map)) avoids an index error or
    # silently missing classes if the two sizes ever disagree.
    class_probabilities = {
        label_map.get(class_id, f"Class {class_id}"): round(probability * 100, 2)
        for class_id, probability in enumerate(probabilities.tolist())
    }

    return predicted_emotion, confidence_score, class_probabilities



# Try the predictor on one example sentence
sample_text = "I am so excited about this opportunity!"
predicted_label = predict_emotion(sample_text)

# predict_emotion returns a (label, confidence %, per-class %) tuple; unpack it
# so the label is not printed together with the rest of the tuple.
emotion, confidence, class_probabilities = predicted_label
print(f"Predicted Emotion: {emotion} ({confidence:.2f}% confidence)")
print(f"Class probabilities: {class_probabilities}")



OSError: Incorrect path_or_model_id: './fine_tuned_distilbert_goemotions'. Please provide either the path to a local folder or the repo_id of a model on the Hub.