# MultiClass Text-Sentiment-Analysis using Distil BERT cased

In [1]:
!pip install transformers[torch] datasets comet_ml tensorboard evaluate --upgrade --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m709.7/709.7 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.8 MB/s[0m eta 

## 1. Download and Load the dataset

In [2]:
!kaggle datasets download -d yasserh/twitter-tweets-sentiment-dataset
!kaggle datasets download -d tirendazacademy/fifa-world-cup-2022-tweets
!unzip twitter-tweets-sentiment-dataset.zip
!unzip fifa-world-cup-2022-tweets.zip

Dataset URL: https://www.kaggle.com/datasets/yasserh/twitter-tweets-sentiment-dataset
License(s): CC0-1.0
Downloading twitter-tweets-sentiment-dataset.zip to /content
  0% 0.00/1.23M [00:00<?, ?B/s]
100% 1.23M/1.23M [00:00<00:00, 137MB/s]
Dataset URL: https://www.kaggle.com/datasets/tirendazacademy/fifa-world-cup-2022-tweets
License(s): CC0-1.0
Downloading fifa-world-cup-2022-tweets.zip to /content
  0% 0.00/1.38M [00:00<?, ?B/s]
100% 1.38M/1.38M [00:00<00:00, 145MB/s]
Archive:  twitter-tweets-sentiment-dataset.zip
  inflating: Tweets.csv              
Archive:  fifa-world-cup-2022-tweets.zip
  inflating: fifa_world_cup_2022_tweets.csv  


In [1]:
import pandas as pd
import numpy as np

df_1 = pd.read_csv("Tweets.csv")
df_2 = pd.read_csv("fifa_world_cup_2022_tweets.csv")

df_1.sample(5)

Unnamed: 0,textID,text,selected_text,sentiment
20789,d11bc8ece8,_ebru thank you for the link...very cool...see...,cool..,positive
10332,e3b9593fe8,U-Verse is up and blazing at 25Mbps. I don`t ...,U-Verse is up and blazing at 25Mbps. I don`t ...,neutral
20102,237589b163,I hate when my bf beats da dogs. But I guess t...,I hate,negative
15632,d74c09c180,part 2: social networking??.. there is even r...,part 2: social networking??.. there is even ro...,neutral
14448,ffd9503b5c,starting tm alex and i are doing a whole week ...,starting tm alex and i are doing a whole week ...,neutral


In [2]:
df_1 = df_1.dropna()
df_1.isnull().sum()

Unnamed: 0,0
textID,0
text,0
selected_text,0
sentiment,0


In [3]:
# Rename the columns 'Tweet' to 'text' and 'Sentiment' to 'sentiment'
df_2 = df_2.rename(columns={'Tweet': 'text', 'Sentiment': 'sentiment'})
df_2

Unnamed: 0.1,Unnamed: 0,Date Created,Number of Likes,Source of Tweet,text,sentiment
0,0,2022-11-20 23:59:21+00:00,4,Twitter Web App,What are we drinking today @TucanTribe \n@MadB...,neutral
1,1,2022-11-20 23:59:01+00:00,3,Twitter for iPhone,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive
2,2,2022-11-20 23:58:41+00:00,1,Twitter for iPhone,Worth reading while watching #WorldCup2022 htt...,positive
3,3,2022-11-20 23:58:33+00:00,1,Twitter Web App,Golden Maknae shinning bright\n\nhttps://t.co/...,positive
4,4,2022-11-20 23:58:28+00:00,0,Twitter for Android,"If the BBC cares so much about human rights, h...",negative
...,...,...,...,...,...,...
22519,22519,2022-11-20 00:00:21+00:00,1,Twitter Web App,Here We go World cup 2022 #WorldCup2022,positive
22520,22520,2022-11-20 00:00:03+00:00,0,DenetPro,Anderlecht confirms former Viborg FF's Jesper ...,neutral
22521,22521,2022-11-20 00:00:01+00:00,2,Twitter for iPhone,Great thread to read before the start of #Worl...,positive
22522,22522,2022-11-20 00:00:00+00:00,11,Twitter Web App,Raphinha wants Brazil to be united at the #Wor...,positive


In [4]:
# Selecting only the 'text' and 'sentiment' columns from both DataFrames
df_1_limited = df_1[['text', 'sentiment']]
df_2_limited = df_2[['text', 'sentiment']]

# Concatenating the two DataFrames row-wise
df_combined = pd.concat([df_1_limited, df_2_limited], ignore_index=True)
df = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

df

Unnamed: 0,text,sentiment
0,Just got home from another amazing night,positive
1,: Am getting upset listening to the say now I...,negative
2,You can dislike BTS but don’t lie and just acc...,positive
3,I think I could game for Qatar.\n\nI would say...,neutral
4,"is done painting all the bedroom furniture, I ...",negative
...,...,...
49999,i wish paramore would come to ireland,neutral
50000,I got Serbia in the sweepstake #WorldCup2022 h...,neutral
50001,Vive la France but wouldn’t mind Messi taking ...,positive
50002,http://twitpic.com/4jken - fire and urban at r...,neutral


In [5]:
df['text'][0]

'Just got home from another amazing night'

## 2. Text Pre-Processing

- Cleaning up the text data by removing punctuation, extra spaces, and numbers.
- Transform sentences into individual words. Remove some common words that could be in each sentiment like `worldcup` or `#worldcup`.

In [6]:
import re

# Precompile regular expressions for faster preprocessing
non_word_chars_pattern = re.compile(r"[^\w\s\*]")  # Exclude the '*' symbol
whitespace_pattern = re.compile(r"\s+")
digits_pattern = re.compile(r"\d")
username_pattern = re.compile(r"@\S+")
hashtags_pattern = re.compile(r"#(\w+)")
html_url_pattern = re.compile(r'<.*?>|http\S+')
contractions_pattern = re.compile(r"\b(can't|won't|n't|'re|'s|'d|'ll|'t|'ve|'m)\b")

# Expand common contractions
contractions_dict = {
    "can't": "cannot", "won't": "will not", "n't": "not", "'re": "are", "'s": "is",
    "'d": "would", "'ll": "will", "'t": "not", "'ve": "have", "'m": "am"
}

# Remove substrings of words containing fifa|worldcup|qatar|football
specific_words_pattern = re.compile(r"\b\w*(worldcup|fifa|qatar|football|ecuador)\w*\b", re.IGNORECASE)

def expand_contractions(text):
    return contractions_pattern.sub(lambda x: contractions_dict.get(x.group()), text)

def preprocess_text(text):
    # Remove HTML tags and URLs
    text = html_url_pattern.sub('', text)
    # Lowercase text
    text = text.lower()
    # Expand contractions
    text = expand_contractions(text)
    # Remove specific words
    text = specific_words_pattern.sub('', text)
    # Remove hashtags but retain the word
    text = hashtags_pattern.sub(r'\1', text)
    # Remove usernames
    text = username_pattern.sub('', text)
    # Remove non-word characters except '*' symbol
    text = non_word_chars_pattern.sub(' ', text)
    # Replace whitespaces with a single space
    text = whitespace_pattern.sub(' ', text)
    # Remove digits
    text = digits_pattern.sub('', text)

    return text.strip()

In [7]:
pd.set_option('display.max_colwidth', 200)

# Replaces the null values in the data with an empty string
df = df.where((pd.notnull(df)),'')

# Apply preprocessing function to your text column
df['cleaned_text'] = df['text'].apply(preprocess_text)
df = df[['text', 'cleaned_text', 'sentiment']]
df.sample(10)

Unnamed: 0,text,cleaned_text,sentiment
46133,would like to go back to bed. horrible headache pounding behind my eyes and all over my skull,would like to go back to bed horrible headache pounding behind my eyes and all over my skull,negative
30854,"The #WorldCup starts soon. Enjoy it. It may be weird, it may be mid November, but its the World Cup nonethless\n\n#WorldCup2022 #WorldcupQatar2022",the starts soon enjoy it it may be weird it may be mid november but its the world cup nonethless,positive
11195,So glad my kids are home for Thanksgiving and we get to watch the World Cup together over the break! #worldcup2022 #family https://t.co/OqO8JTiVf8,so glad my kids are home for thanksgiving and we get to watch the world cup together over the break family,positive
18471,i wish my drems could come true,i wish my drems could come true,positive
4560,Omg i`m going to robinson with tyler wfm. And i freakin miss anthony ugh today kinda sucks. Lex<3,omg i m going to robinson with tyler wfm and i freakin miss anthony ugh today kinda sucks lex,negative
26716,@danbullock While My Qatar Gently Weeps #WorldCup2022 #Qatar2022 #QatarEcuador,while my gently weeps,neutral
9271,Going to school =[ I`m actually not so tired today tho,going to school i m actually not so tired today tho,positive
31304,I really wish I could convince Brandon to move somewhere like that. All he wants to do is move to Lexington.,i really wish i could convince brandon to move somewhere like that all he wants to do is move to lexington,positive
20162,dude i am so sorry!!!!! I never got that number for you my fail i just remembered,dude i am so sorry i never got that number for you my fail i just remembered,negative
38203,why am I weird?,why am i weird,negative


In [8]:
# Remove rows where 'cleaned_text' is empty or contains only whitespace
df = df[df['cleaned_text'].str.strip() != '']
df.shape

(50001, 3)

In [9]:
df['cleaned_text'][0], df['sentiment'][0]

('just got home from another amazing night', 'positive')

## 3. Mapping `sentiment` column to numeric values

In [10]:
df['sentiment'] = df['sentiment'].replace({'positive':2, 'neutral': 1, 'negative': 0})
df.head(10)

  df['sentiment'] = df['sentiment'].replace({'positive':2, 'neutral': 1, 'negative': 0})


Unnamed: 0,text,cleaned_text,sentiment
0,Just got home from another amazing night,just got home from another amazing night,2
1,: Am getting upset listening to the say now I want to speak to you Mitchel but my darn phone got no money on x,am getting upset listening to the say now i want to speak to you mitchel but my darn phone got no money on x,0
2,You can dislike BTS but don’t lie and just accept that the best song of this year’s #WorldCup2022 is #DreamersbyJungkook Jungkookie we 💜 you,you can dislike bts but don t lie and just accept that the best song of this year s is dreamersbyjungkook jungkookie we you,2
3,I think I could game for Qatar.\n\nI would say I’m fully expecting Qatar to be awarded a penalty too but they do need to get in the Ecuador box for that to happen. \n\n#WorldCup2022,i think i could game for i would say i m fully expecting to be awarded a penalty too but they do need to get in the box for that to happen,1
4,"is done painting all the bedroom furniture, I still have to do the table but it will wait until after the move. Uggh moving in the heat",is done painting all the bedroom furniture i still have to do the table but it will wait until after the move uggh moving in the heat,0
5,Having one of my bad days....Migraine today. My 1st since my neck surgery about 2 1/2 months,having one of my bad days migraine today my st since my neck surgery about months,0
6,Yes!!!! Go #ECU Should really be 0-2 but VAR my arse!#WorldCup2022,yes go ecu should really be but var my arse,2
7,that sux but mayb 4 the btr u nvr know,that sux but mayb the btr u nvr know,1
8,Hey #YEG !!!!! Anyone goin to the Edmonton Energy game and wanna do some live updates? PLEASE!!!! They dont post live scores,hey yeg anyone goin to the edmonton energy game and wanna do some live updates please they dont post live scores,1
9,The Doha they are not going to show you.... #WorldCup2022\nhttps://t.co/zwZyTKH78W,the doha they are not going to show you,0


## 4. Spliiting datasets into train and test

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'],
                                                    df['sentiment'],
                                                    test_size=0.2,
                                                    random_state=42)

len(X_train), len(X_test)

(40000, 10001)

In [12]:
X_train, X_test, y_train, y_test = list(X_train), list(X_test), list(y_train), list(y_test)
X_train[:2], y_train[:2]

(['hubby needs a vacation thank god we re leaving for myrtle beach in a week',
  'moms everywhere stop what you are doing and get a good night sleep for tomorrow its your childrens turn to pamper you enjoy your day'],
 [2, 2])

## 5. Preparing data using custom dataloader

In [13]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# Setting device agnostic code
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [14]:
class data(Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, index):
    item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[index])
    return item

  def __len__(self):
    return len(self.labels)

## 6. Load PreTrained Distil BERT base cased Model

In [15]:
from huggingface_hub import notebook_login

# Paste hugging face token with write permission enabled and log in
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
import comet_ml
from comet_ml import Experiment

comet_ml.login(project_name="sentiment-analysis-transformer")

In [17]:
model_name = "distilbert-base-cased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, model_max_length=256)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## 7. Tokenize and Create Encoded Dataset

In [18]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)

# Tokenize with truncation and padding and create dataset from tokenized data
train_encoding = tokenizer(X_train, truncation=True, padding=True)
test_encoding = tokenizer(X_test, truncation=True, padding=True)

train_dataset = data(train_encoding, y_train)
test_dataset = data(test_encoding, y_test)

## 8. Fine-Tuning Distil BERT Cased

In [19]:
batch_size = 64
epochs = 5

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-sentiment",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=1,                   # adjust if needed for larger batch sizes
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    num_train_epochs=epochs,                         # specify the number of epochs
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="epoch",                           # Perform evaluation at the end of each epoch
    per_device_eval_batch_size=batch_size,
    save_strategy="epoch",                           # Save model at the end of each epoch
    save_total_limit=1,                              # Only keep the best model (limit to 1 checkpoint)
    logging_strategy="epoch",
    report_to=["comet_ml", "tensorboard"],           # Experiment Tracker: CometML or others
    load_best_model_at_end=True,                     # Load the best model at the end of training
    metric_for_best_model="accuracy",               # Use eval_loss as the metric to track the best model
    greater_is_better=True,                         # Lower eval_loss is better
    push_to_hub=True,                                # Automatically push the best model to Hugging Face Hub
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.0
)

## 9. Train the Fine-Tuned BERT Model

In [20]:
import evaluate
from sklearn.metrics import confusion_matrix

accuracy_metric = evaluate.load("accuracy")

LABELS = ['negative', 'neutral', 'positive']
exp = comet_ml.Experiment()

# Compute_metrics function with confusion matrix logging
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Calculate accuracy
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)

    # Calculate confusion matrix
    cm = confusion_matrix(labels, predictions)

    # Log the confusion matrix to Comet ML
    exp.log_confusion_matrix(matrix=cm, labels=LABELS, file_name="confusion-matrix.json")

    return accuracy



In [21]:
# Label mapping
label_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}

model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Override the model configuration for custom labels
model.config.id2label = label_mapping
model.config.label2id = {v: k for k, v in label_mapping.items()}


trainer = Trainer(
    model=model,                        # The instantiated Transformers model to be trained
    args=training_args,                 # Training arguments, defined above
    train_dataset=train_dataset,        # Training dataset
    eval_dataset=test_dataset,          # Evaluation dataset
    tokenizer=tokenizer,                # Tokenizer
    data_collator=data_collator,        # Data collator
    compute_metrics=compute_metrics,    # Function to compute metrics
    callbacks=[early_stopping_callback] # Early Stop Callback
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [23]:
from accelerate import Accelerator

# Initialize Accelerator and Trainer
Accelerator()
trainer.train()

[1;38;5;39mCOMET INFO:[0m An experiment with the same configuration options is already running and will be reused.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7352,0.560801,0.764324
2,0.5016,0.532228,0.780822
3,0.3913,0.550758,0.785221
4,0.3087,0.589546,0.782622
5,0.2502,0.63266,0.780722


TrainOutput(global_step=3125, training_loss=0.4374175439453125, metrics={'train_runtime': 730.8145, 'train_samples_per_second': 273.667, 'train_steps_per_second': 4.276, 'total_flos': 6002536046400000.0, 'train_loss': 0.4374175439453125, 'epoch': 5.0})

In [24]:
exp.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : shiny_expense_7052
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/luluw8071/sentiment-analysis-transformer/cb37a3a42b1e4a8b93a45e4e4059fb5d
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     eval/accuracy [10]             : (0.7643235676432357, 0.7852214778522147)
[1;38;5;39mCOMET INFO:[0m     eval/loss [10]                 : (0.532228410243988, 0.6326600313186646)
[1;38;5;39mCOMET INFO:[0m     eval/runtime [10]              : (7.5709, 8.4288)
[1;38;5;39mCOMET INFO:[0m     eval/sam

In [25]:
trainer.evaluate()

{'eval_loss': 0.550757884979248,
 'eval_accuracy': 0.7852214778522147,
 'eval_runtime': 7.7718,
 'eval_samples_per_second': 1286.826,
 'eval_steps_per_second': 20.201,
 'epoch': 5.0}

In [26]:
kwargs = {
    "dataset": "Twitter Sentiment Datasets",
    # "dataset_args": "config: hi, split: test",
    "language": "en",
    "finetuned_from": model_name,
    "tasks": "multi-sentiment-classification",
}

trainer.push_to_hub(**kwargs)

events.out.tfevents.1732272097.cac2bbe946c1.3456.0:   0%|          | 0.00/8.22k [00:00<?, ?B/s]

events.out.tfevents.1732272847.cac2bbe946c1.3456.1:   0%|          | 0.00/411 [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/luluw/distilbert-base-cased-finetuned-sentiment/commit/0ae42f68d9e7c623723ff147c9641e3f3e0b7718', commit_message='End of training', commit_description='', oid='0ae42f68d9e7c623723ff147c9641e3f3e0b7718', pr_url=None, repo_url=RepoUrl('https://huggingface.co/luluw/distilbert-base-cased-finetuned-sentiment', endpoint='https://huggingface.co', repo_type='model', repo_id='luluw/distilbert-base-cased-finetuned-sentiment'), pr_revision=None, pr_num=None)

## 10. Sentiment Prediction using custom text


In [27]:
# Tokenize text, get output from model and predict
def predict_sentiment(model, tokenizer, text, device):
    tokenized = tokenizer(text, truncation=True, padding=True, return_tensors='pt').to(device)
    outputs = model(**tokenized)
    probs = F.softmax(outputs.logits, dim=-1)
    preds = torch.argmax(outputs.logits, dim=-1).item()
    probs_max = probs.max().detach().cpu().numpy()

    prediction = "Positive" if preds == 2 else "Neutral" if preds == 1 else "Negative"
    print(f'{text}\nSentiment: {prediction}\tProbability: {probs_max*100:.2f}%\n', end="-"*50 + "\n")

In [29]:
texts = ["The fitness tracker is sleek, comfortable to wear, and provides accurate step counts. However, the heart rate monitor is unreliable, and syncing with the mobile app is inconsistent.",
         "The gym has state-of-the-art equipment and a wide range of classes. On the other hand, the facilities are often overcrowded during peak hours, making it difficult to get a workout in.",
         "The movie had an intriguing plot and captivating visuals, but the sound quality was poor, making it difficult to fully enjoy the experience."]

for text in texts:
    predict_sentiment(model, tokenizer, text, device)

The fitness tracker is sleek, comfortable to wear, and provides accurate step counts. However, the heart rate monitor is unreliable, and syncing with the mobile app is inconsistent.
Sentiment: Neutral	Probability: 69.90%
--------------------------------------------------
The gym has state-of-the-art equipment and a wide range of classes. On the other hand, the facilities are often overcrowded during peak hours, making it difficult to get a workout in.
Sentiment: Neutral	Probability: 61.56%
--------------------------------------------------
The movie had an intriguing plot and captivating visuals, but the sound quality was poor, making it difficult to fully enjoy the experience.
Sentiment: Negative	Probability: 70.21%
--------------------------------------------------


In [30]:
texts = ["The fitness tracker is sleek, comfortable to wear, and provides accurate step counts.",
         "However, the heart rate monitor is unreliable, and syncing with the mobile app is inconsistent.",
         "The gym has state-of-the-art equipment and a wide range of classes.",
         "On the other hand, the facilities are often overcrowded during peak hours, making it difficult to get a workout in."]

for text in texts:
  predict_sentiment(model, tokenizer, text, device)

The fitness tracker is sleek, comfortable to wear, and provides accurate step counts.
Sentiment: Positive	Probability: 97.29%
--------------------------------------------------
However, the heart rate monitor is unreliable, and syncing with the mobile app is inconsistent.
Sentiment: Negative	Probability: 91.43%
--------------------------------------------------
The gym has state-of-the-art equipment and a wide range of classes.
Sentiment: Positive	Probability: 70.97%
--------------------------------------------------
On the other hand, the facilities are often overcrowded during peak hours, making it difficult to get a workout in.
Sentiment: Negative	Probability: 65.93%
--------------------------------------------------


## Load the fine-tuned model from hugging face

In [31]:
%%writefile inference.py
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

model_name = "luluw/distilbert-base-cased-finetuned-sentiment"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, max_length=256)

# Example usage
text = "Just finished organizing my desk. Got a few tasks lined up for the afternoon."

inputs = tokenizer(text, return_tensors='pt')
outputs = model(**inputs)
preds = torch.argmax(outputs.logits, dim=-1)
prediction = "Positive" if preds == 2 else "Neutral" if preds == 1 else "Negative"

print(text)
print(prediction)

Writing inference.py


In [34]:
!python3 inference.py

Just finished organizing my desk. Got a few tasks lined up for the afternoon.
Neutral
