In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the labelled sentiment workbook into a DataFrame
file_path = "C:/Users/User/UKM - Nur Azlin Binti Rusnan/Sem 3/Capstone Project/Dataset/1. Sentiment Analysis Dataset/10-12/fine-tuned-no-neutral_1113.xlsx"
data = pd.read_excel(file_path)

# Encode each sentiment name as an integer class id; names that are not in
# the mapping become NaN and are removed by the dropna below
label_mapping = dict(Negative=0, Neutral=1, Positive=2)
data['Label'] = data['Sentiment'].map(label_mapping)

# Discard rows without a comment or without a mapped label, then remove
# repeated comments
data = data.dropna(subset=['Comment', 'Label'])
data = data.drop_duplicates(subset=['Comment'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['Comment'].tolist(), data['Label'].tolist(), random_state=42, test_size=0.2
)

# Preview the prepared frame
print(data.head())

                                             Comment Sentiment  Label
0  Thanks for punctual and quick service resolvin...  Positive      2
1                                    Prompt response  Positive      2
2                                     problem solved  Positive      2
3                                      problem fixed  Positive      2
4                                kudos to lisa reed!  Positive      2


In [2]:
from transformers import BertTokenizer, BertForSequenceClassification

# Hub checkpoint shared by the tokenizer and the model
checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"

# Tokenizer matching the checkpoint's vocabulary
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# The checkpoint's classifier head has 5 outputs (see the load warning below);
# a 3-output head is requested here, so ignore_mismatched_sizes lets the load
# proceed with a freshly initialised classifier that must be trained.
model = BertForSequenceClassification.from_pretrained(
    checkpoint,
    ignore_mismatched_sizes=True,
    num_labels=3,  # For Positive, Negative, Neutral
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import torch

# Define dataset class for processing data
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one comment per item on access.

    Args:
        texts: Sequence of comments. Each entry is converted with ``str()``
            before tokenization so non-string cells (e.g. numbers read from
            a spreadsheet) do not crash the tokenizer.
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Optional tokenizer callable. When ``None`` (the default),
            the module-level ``tokenizer`` is looked up each time an item is
            fetched, which is what the original implementation did.
        max_length: Fixed sequence length every item is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.max_length = max_length
        self._tokenizer = tokenizer

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for sample ``idx``."""
        # Fall back to the notebook-level tokenizer only when none was injected
        active_tokenizer = self._tokenizer if self._tokenizer is not None else tokenizer
        encoding = active_tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' adds a leading batch axis of size 1; drop it
            # so the DataLoader can stack samples itself
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap each split's texts and labels in a SentimentDataset
train_dataset = SentimentDataset(texts=train_texts, labels=train_labels)
val_dataset = SentimentDataset(texts=val_texts, labels=val_labels)


In [4]:
from transformers import TrainingArguments

# Collect the hyper-parameters in one mapping, then build the arguments object
training_config = dict(
    output_dir='./results',
    logging_dir='./logs',  # Directory for logging
    num_train_epochs=4,
    per_device_train_batch_size=16,
    learning_rate=2e-5,  # Learning rate for optimization
    weight_decay=0.01,  # Regularization to prevent overfitting
    # NOTE(review): newer transformers releases deprecate `evaluation_strategy`
    # in favour of `eval_strategy` - confirm against the installed version
    evaluation_strategy="epoch",  # Evaluate after every epoch
    save_strategy="epoch",  # Save model after every epoch
    load_best_model_at_end=True,
    logging_steps=10,  # Log every 10 steps
    report_to="none",  # Disable reporting to external services like W&B
)
training_args = TrainingArguments(**training_config)




In [5]:
from transformers import Trainer

# Bind the model, its training configuration and both data splits together
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)




In [6]:
# Run fine-tuning and keep the summary so it is displayed as the cell result
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss
1,0.6028,0.321003
2,0.2549,0.196247
3,0.1217,0.148258
4,0.0638,0.158194


TrainOutput(global_step=44, training_loss=0.24428633871403607, metrics={'train_runtime': 140.1257, 'train_samples_per_second': 4.967, 'train_steps_per_second': 0.314, 'total_flos': 45781734684672.0, 'train_loss': 0.24428633871403607, 'epoch': 4.0})

In [7]:
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Run the fine-tuned model over the validation split
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)
# Ground-truth ids as returned by the trainer, aligned row-for-row with preds
labels = predictions.label_ids

# The model has three output classes, but the report names only two of them.
# Passing the class ids explicitly keeps target_names aligned with ids 0 and 2;
# without `labels=`, a single prediction of class 1 would make the number of
# observed classes (3) disagree with target_names (2) and raise a ValueError.
report_labels = [0, 2]
print(classification_report(labels, preds, labels=report_labels, target_names=['Negative','Positive']))

# Calculate overall accuracy (counts every sample, whatever class was predicted)
accuracy = accuracy_score(labels, preds)
print(f"Validation Accuracy: {accuracy:.2f}")

              precision    recall  f1-score   support

    Negative       1.00      0.96      0.98        25
    Positive       0.95      1.00      0.97        19

    accuracy                           0.98        44
   macro avg       0.97      0.98      0.98        44
weighted avg       0.98      0.98      0.98        44

Validation Accuracy: 0.98


In [8]:
# Persist the model weights/config and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained
save_dir = './fine_tuned_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('./fine_tuned_model\\tokenizer_config.json',
 './fine_tuned_model\\special_tokens_map.json',
 './fine_tuned_model\\vocab.txt',
 './fine_tuned_model\\added_tokens.json')

In [9]:
# test the fine-tuned model

from transformers import pipeline

# Reload the saved artefacts from disk rather than reusing the in-memory objects
fine_tuned_tokenizer = BertTokenizer.from_pretrained('./fine_tuned_model')
fine_tuned_model = BertForSequenceClassification.from_pretrained('./fine_tuned_model')

# Build a text-classification pipeline around the reloaded model
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model=fine_tuned_model,
    tokenizer=fine_tuned_tokenizer,
)

# Smoke-test the pipeline on a single hand-written comment
test_comments = ["no comment"]
results = sentiment_analyzer(test_comments)
print(results)


Device set to use cpu


[{'label': 'LABEL_2', 'score': 0.8158328533172607}]


In [10]:
# Load the real dataset for testing
test_file_path = "C:/Users/User/UKM - Nur Azlin Binti Rusnan/Sem 3/Capstone Project/Dataset/2. Predictive Modeling Dataset/Predictive_Modeling_cdataset.xlsx"
test_data = pd.read_excel(test_file_path)

# Only rows that actually contain a comment can be scored. Remember which rows
# those are so each prediction is written back to the row it came from; simply
# dropping NaNs and assigning the shorter result list to the full frame raises
# a length-mismatch error as soon as one comment is blank.
has_comment = test_data['USS Comment'].notna()
scored_index = test_data.index[has_comment]
# Cast to str so numeric cells read from Excel do not crash the tokenizer
test_comments = test_data.loc[has_comment, 'USS Comment'].astype(str).tolist()

# Perform predictions on the test dataset. truncation=True stops comments that
# exceed the tokenizer's maximum length from raising inside the model; the
# empty-list guard avoids calling the pipeline with nothing to score.
results = sentiment_analyzer(test_comments, truncation=True) if test_comments else []

# Add results to the original dataset, aligned on the row index; rows without
# a comment are left empty (NaN)
test_data['Predicted Sentiment'] = pd.Series(
    [res['label'] for res in results], index=scored_index, dtype=object
)
test_data['Confidence'] = pd.Series(
    [res['score'] for res in results], index=scored_index, dtype=float
)

# Save the modified dataset locally
output_file_path = "C:/Users/User/UKM - Nur Azlin Binti Rusnan/Sem 3/Capstone Project/Dataset/2. Predictive Modeling Dataset/pm_test4.xlsx"
test_data.to_excel(output_file_path, index=False)

print(f"File saved at {output_file_path}")

File saved at C:/Users/User/UKM - Nur Azlin Binti Rusnan/Sem 3/Capstone Project/Dataset/2. Predictive Modeling Dataset/pm_test4.xlsx


In [5]:
import keras

# Report which Keras release is installed in this kernel
print(f"{keras.__version__}")

3.7.0


In [6]:
!pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting tensorflow<2.19,>=2.18 (from tf-keras)
  Downloading tensorflow-2.18.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow<2.19,>=2.18->tf-keras)
  Downloading tensorflow_intel-2.18.0-cp311-cp311-win_amd64.whl.metadata (4.9 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow-intel==2.18.0->tensorflow<2.19,>=2.18->tf-keras)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting keras>=3.5.0 (from tensorflow-intel==2.18.0->tensorflow<2.19,>=2.18->tf-keras)
  Downloading keras-3.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting ml-dtypes<0.5.0,>=0.4.0 (from tensorflow-intel==2.18.0->tensorflow<2.19,>=2.18->tf-keras)
  Downloading ml_dtypes-0.4.1-cp311-cp311-win_amd64.whl.metadata (20 kB)
Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   -- --------

  You can safely remove it manually.
  You can safely remove it manually.


In [1]:
!pip install transformers
!pip install torch
!pip install scikit-learn
!pip install pandas



In [14]:
!pip install --upgrade accelerate
!pip install --upgrade transformers[torch]

Collecting transformers[torch]
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.5 kB ? eta -:--:--
     ----------------- -------------------- 20.5/43.5 kB 217.9 kB/s eta 0:00:01
     ----------------------------------- -- 41.0/43.5 kB 393.8 kB/s eta 0:00:01
     -------------------------------------- 43.5/43.5 kB 266.4 kB/s eta 0:00:00
Collecting tokenizers<0.22,>=0.21 (from transformers[torch])
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   - -------------------------------------- 0.1/2.4 MB 2.0 MB/s eta 0:00:02
   ------- -------------------------------- 0.4/2.4 MB 5.3 MB/s eta 0:00:01
   ------------------ --------------------- 1.1/2.4 MB 9.8 MB/s eta 0:00:01
   -------------------------- ------------- 1.6/2.4 MB 10.2 MB/s eta 0:00:01
   --------------

  You can safely remove it manually.
