<a href="https://colab.research.google.com/github/AzlinRusnan/Optimizing_CSAT_Through_Sentiment-Analysis_and_Predictive-Modeling/blob/main/9_12_Fine_tuning_a_pre_trained_BERT_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch scikit-learn
!pip install transformers datasets torch scikit-learn pandas openpyxl

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Import libraries
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

In [29]:
# Load data
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from transformers import BertConfig, BertForSequenceClassification

file_path = '/content/gdrive/MyDrive/Capstone Project/fine-tuned-no-neutral.xlsx'
data = pd.read_excel(file_path)

# Map sentiment labels to integers
label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
data['Label'] = data['Sentiment'].map(label_mapping)

# Drop rows with missing or duplicate comments
data = data.dropna(subset=['Comment', 'Label']).drop_duplicates(subset=['Comment'])

# Check data
print(data.head())

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['Comment'].tolist(),
    data['Label'].tolist(),
    test_size=0.2,
    random_state=42
)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# Define dataset class
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create datasets
train_dataset = SentimentDataset(train_texts, train_labels)
val_dataset = SentimentDataset(val_texts, val_labels)

model = BertForSequenceClassification.from_pretrained(
    "nlptown/bert-base-multilingual-uncased-sentiment",
    num_labels=3,
    ignore_mismatched_sizes=True
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",  # Disables W&B
    load_best_model_at_end=True
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

                                             Comment Sentiment  Label
0  Thanks for punctual and quick service resolvin...  Positive      2
1                                    Prompt response  Positive      2
2                                      Quick support  Positive      2
3  Andre provided excellent support He used Teams...  Positive      2
4  Assistance was immediate and resolved my issue...  Positive      2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.7568,0.257832
2,0.2325,0.095448
3,0.1795,0.070842
4,0.0315,0.05828


TrainOutput(global_step=44, training_loss=0.27794994989579375, metrics={'train_runtime': 111.5535, 'train_samples_per_second': 5.916, 'train_steps_per_second': 0.394, 'total_flos': 43413713925120.0, 'train_loss': 0.27794994989579375, 'epoch': 4.0})

In [30]:
# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json')

In [31]:
#evaluate the model on the validation set

from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Get predictions and labels
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# Print classification report
print(classification_report(labels, preds, target_names=['Negative','Positive']))

# Calculate overall accuracy
accuracy = accuracy_score(labels, preds)
print(f"Validation Accuracy: {accuracy:.2f}")

              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00        22
    Positive       1.00      1.00      1.00        20

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42

Validation Accuracy: 1.00


In [38]:
# test the fine-tuned model

from transformers import pipeline

fine_tuned_model = BertForSequenceClassification.from_pretrained('./fine_tuned_model')
fine_tuned_tokenizer = BertTokenizer.from_pretrained('./fine_tuned_model')

# Create a sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model=fine_tuned_model, tokenizer=fine_tuned_tokenizer)

# Test with new comments
test_comments = ["Prompt efficient and friendly support – much appreciated 😊"]
results = sentiment_analyzer(test_comments)
print(results)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'LABEL_2', 'score': 0.9868602156639099}]


##### **Download the fine-tuned model**

In [None]:
import os

# Check if the folder exists
print(os.path.exists('./fine_tuned_model'))

# List files in the folder
print(os.listdir('./fine_tuned_model'))

True
['model.safetensors', 'config.json', 'vocab.txt', 'tokenizer_config.json', 'special_tokens_map.json']


In [None]:
!zip -r fine_tuned_model.zip ./fine_tuned_model

  adding: fine_tuned_model/ (stored 0%)
  adding: fine_tuned_model/model.safetensors (deflated 7%)
  adding: fine_tuned_model/config.json (deflated 54%)
  adding: fine_tuned_model/vocab.txt (deflated 48%)
  adding: fine_tuned_model/tokenizer_config.json (deflated 76%)
  adding: fine_tuned_model/special_tokens_map.json (deflated 42%)


In [None]:
from google.colab import files
files.download('fine_tuned_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##### **Test on Real Dataset**

In [28]:
# Load the real dataset for testing
test_file_path = '/content/gdrive/MyDrive/Capstone Project/Sentiment Analysis_cleaned_dataset_4.0_Sept.xlsx'
test_data = pd.read_excel(test_file_path)

# Ensure the dataset has a column for comments
test_comments = test_data['USS Comment'].dropna().tolist()

# Perform predictions on the test dataset
results = sentiment_analyzer(test_comments)

# Add results to the original dataset
test_data['Predicted Sentiment'] = [res['label'] for res in results]
test_data['Confidence'] = [res['score'] for res in results]

test_data.to_excel('CSAT_SASEPT_DATASET.xlsx', index=False)

from google.colab import files
files.download('CSAT_SASEPT_DATASET.xlsx')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>