# Setup

In [1]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn

Collecting transformers
  Downloading transformers-4.44.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.44.1-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.42.4
    Uninstalling transformers-4.42.4:
      Successfully uninstalled transformers-4.42.4
Successfully installed transformers-4.44.1
Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
 

In [2]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install beautifulsoup4
!pip install textblob
!pip install mlxtend


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-5igip6mx
  Running command git clone --filter=blob:none --quiet https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-5igip6mx
  Resolved https://github.com/laxmime

In [4]:
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


# Data Loading and Preprocessing


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [8]:
df = pd.read_excel("/content/drive/MyDrive/BERT_Sentiment_Analysis/Labeled_Data.xlsx")
df.head()

Unnamed: 0,Reviewee Name,Rating,Question,Review,Review Date,label
0,Chloe B,5,Brilliant service!,"The removal men were quick, polite and very fl...",2 days ago,1
1,FL,5,Great before/during/after service,What a pleasant overall experience. Thank you ...,3 days ago,1
2,Adetokunbo Jolaoso,5,Superb one-off cleaning,Archie (apologies if spelt incorrectly) was ju...,2 days ago,1
3,Haynes,5,Fantastic Service indeed,I recently hired Fantastic Services to tame my...,4 days ago,1
4,Ms Alison Turner,5,Great job from the handyman!,The handyman from Fantastic Services was terri...,5 days ago,1


In [9]:
df.drop(['Reviewee Name' , 'Rating' , 'Question' , 'Review Date'] , axis = 1,inplace = True)

In [10]:
df.head()

Unnamed: 0,Review,label
0,"The removal men were quick, polite and very fl...",1
1,What a pleasant overall experience. Thank you ...,1
2,Archie (apologies if spelt incorrectly) was ju...,1
3,I recently hired Fantastic Services to tame my...,1
4,The handyman from Fantastic Services was terri...,1


In [12]:
df.isnull().sum()

Unnamed: 0,0
Review,3040
label,0


In [13]:
df = df.dropna(subset=['Review'])

In [14]:
df.isnull().sum()

Unnamed: 0,0
Review,0
label,0


In [15]:
df.shape

(28653, 2)

# Data Preparation

In [16]:
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

In [28]:
class CustomDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=512):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = torch.tensor(self.labels[idx])

    encoding = self.tokenizer(text, truncation=True, padding="max_length",
                              max_length=self.max_len)

    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'labels': label
    }


In [29]:
# prepare tokenizer and model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'distilbert-base-uncased'
device = "cuda"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
df = df.reset_index(drop=True)
X = df['Review'].tolist()

y = df['label']

dataset = CustomDataset(X, y, tokenizer)


In [31]:
dataset[0].keys()


dict_keys(['input_ids', 'attention_mask', 'labels'])

In [32]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

In [37]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(example):
  labels = example.label_ids
  preds = example.predictions.argmax(-1)

  f1 = f1_score(labels, preds, average="weighted")
  acc = accuracy_score(labels, preds)

  return {'accuracy': acc, "f1": f1}

# Training

In [38]:
from transformers import Trainer, TrainingArguments
batch_size = 16
model_name = "distilbert_finetuned_setiment"

args = TrainingArguments(
    output_dir = "output",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size = batch_size,
    learning_rate = 2e-5,
    num_train_epochs = 2,
    evaluation_strategy = 'epoch'
)



In [39]:
trainer = Trainer(model=model,
                  args=args,
                  train_dataset = train_dataset,
                  eval_dataset = test_dataset,
                  compute_metrics=compute_metrics,
                  tokenizer = tokenizer)

In [40]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0393,0.111632,0.974525,0.974491
2,0.0254,0.128866,0.97592,0.975768


TrainOutput(global_step=2866, training_loss=0.030872125186374447, metrics={'train_runtime': 2408.7397, 'train_samples_per_second': 19.032, 'train_steps_per_second': 1.19, 'total_flos': 6072835423985664.0, 'train_loss': 0.030872125186374447, 'epoch': 2.0})

In [41]:
# Define the directory where you want to save the model
save_directory = '/content/drive/MyDrive/BERT_Sentiment_Analysis'

# Save the model to the specified directory
trainer.save_model(save_directory)
