In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import torch
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


# Data Acquisition

In [125]:
# Load the IMDB reviews CSV (expected in the working directory) into a DataFrame.
df = pd.read_csv("IMDB Dataset.csv")

In [126]:
df.head() # show the first 5 rows of the raw dataset using df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [127]:
df.shape  # (number of rows, number of columns) of the raw dataset

(50000, 2)

##### The dataset has 50000 rows and two columns; the column names are review and sentiment. 
* sentiments are either positive or negative

# Data Processing and Cleaning

In [128]:
def clean_text(review:str):
    """Strip HTML markup from a review and return its plain text.

    Args:
        review: Raw review text, possibly containing HTML tags such as
            ``<br />``.

    Returns:
        The text content of ``review`` with all tags removed. A value that is
        not a string (e.g. ``None`` or a NaN float from a missing CSV cell) is
        returned unchanged so that the later ``df.dropna()`` step can remove
        that row instead of this function crashing on it.
    """
    # Missing values arrive here before dropna() has run; pass them through.
    if not isinstance(review, str):
        return review

    # NOTE: for short reviews without any markup BeautifulSoup may warn that
    # the input "looks more like a filename than markup"; the text is still
    # returned as-is in that case.
    text = BeautifulSoup(review, "html.parser")
    return text.get_text()

# Build the tag-free text column by running clean_text over every review.
df['clean_text'] = df['review'].map(clean_text)


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



In [129]:
print(df.shape)  # shape before removing incomplete rows
# Drop rows with a missing value in any column. Reassigning (instead of
# inplace=True) keeps the cell free of in-place mutation of the frame.
df = df.dropna()

(50000, 3)


In [130]:
# Maximum review length, measured in CHARACTERS of the whitespace-trimmed text.
# It is used as a conservative stand-in for the model's 512-token input limit.
# NOTE(review): characters and tokens are not the same unit - confirm this
# aggressive filter is intended rather than truncating at tokenisation time.
MAX_REVIEW_CHARS = 512

# True for every review that is too long to keep.
condition = df['clean_text'].str.strip().str.len() > MAX_REVIEW_CHARS
# .copy() gives an independent frame, so adding columns to df later does not
# raise pandas' SettingWithCopyWarning.
df = df[~condition].copy()
print("\nDataFrame after dropping rows:")
df.head()


DataFrame after dropping rows:


Unnamed: 0,review,sentiment,clean_text
9,If you like original gut wrenching laughter yo...,positive,If you like original gut wrenching laughter yo...
14,This a fantastic movie of three prisoners who ...,positive,This a fantastic movie of three prisoners who ...
22,"What an absolutely stunning movie, if you have...",positive,"What an absolutely stunning movie, if you have..."
36,The plot is about the death of little children...,negative,The plot is about the death of little children...
46,Protocol is an implausible movie whose only sa...,negative,Protocol is an implausible movie whose only sa...


In [None]:
# Summarise the character length of the cleaned reviews (count, mean, min,
# max, quartiles) instead of printing one line per row.
df['clean_text'].str.len().describe()

##### class distribution of the dataset

In [131]:
df['sentiment'].value_counts()  # number of remaining reviews per sentiment label

sentiment
positive    2929
negative    2436
Name: count, dtype: int64

In [132]:
# Aggregate to one row per sentiment first, so the chart draws a single bar per
# class (with its count as the label) rather than one mark per review.
sentiment_counts = (
    df['sentiment']
    .value_counts()
    .rename_axis('sentiment')
    .reset_index(name='count')
)
fig = px.bar(sentiment_counts, x='sentiment', y='count', color='sentiment',
             text='count',  # the text styling below only applies when text is set
             title="Bar chart of positive and negative sentiments",
             width=500)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

` We can see here that the classes are roughly balanced but not equal: there are 2929 positive reviews (about 55%) and 2436 negative reviews (about 45%) in the filtered dataset.`

#### visualizing distribution using pie chart

In [133]:
# Tally how many reviews carry each sentiment label (Counter is imported in
# the notebook's top import cell).
counter = Counter(df['sentiment'])
counter

Counter({'positive': 2929, 'negative': 2436})

In [134]:
# One record per sentiment label, holding the label and its review count.
data_list = [dict(sentiment=label, count=total) for label, total in counter.items()]
fig = px.pie(data_list, names='sentiment', values='count', title='Sentiment Pie Chart')
fig.show()

In [135]:
# Encode each class as an integer so metric functions can consume it:
# 'positive' -> 1, anything else -> 0.
df['class_int_value'] = (df['sentiment'] == 'positive').astype('int64')
actual_y = df['class_int_value'].tolist()
actual_y[:3]

[1, 1, 1]

####  Sentiment Analysis Implementation

In [12]:
# Hugging Face checkpoint shared by the tokenizer and the classifier
# (torch and the DistilBert classes are imported in the top import cell).
MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)  # load the tokenizer
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME)  # load the sequence-classification model


tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 2.83kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.05MB/s]
config.json: 100%|██████████| 629/629 [00:00<00:00, 216kB/s]
model.safetensors: 100%|██████████| 268M/268M [03:02<00:00, 1.47MB/s] 


'POSITIVE'

### Prediction of sentiments from the clean_text column

In [143]:
##predict the sentiments 
def pred(text):
    """Predict the sentiment of a single piece of text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: One review / sentence as a string.

    Returns:
        A tuple ``(label, class_id)`` where ``label`` is the lower-cased entry
        of ``model.config.id2label`` for the highest-scoring class and
        ``class_id`` is 1 when that label is ``'positive'`` and 0 otherwise.

    Raises:
        Any exception raised by the tokenizer or the model is propagated.
        Previously such errors were printed and ``None`` was returned, which
        made callers fail later with an unrelated ``TypeError`` when indexing
        the result.
    """
    # truncation=True asks the tokenizer to cut over-long inputs down to the
    # maximum length it is configured for, instead of passing them on as-is.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # Inference only: no gradients are needed.
    with torch.inference_mode():
        logits = model(**inputs).logits

    # Highest-scoring class along the class axis; .item() requires exactly one
    # input sequence, so a batch fails here rather than giving a wrong index.
    predicted_class_id = logits.argmax(dim=-1).item()

    # Map the class index to its label, e.g. 'POSITIVE' -> 'positive'.
    pred_sentiment = model.config.id2label[predicted_class_id].lower()

    return (pred_sentiment, 1 if pred_sentiment == 'positive' else 0)


In [137]:
# Predicted integer class (second element of pred's result) for every cleaned
# review, in DataFrame row order.
pred_y = [pred(review_text)[1] for review_text in df['clean_text']]

In [142]:
# classification_report returns one pre-formatted multi-line string; print()
# renders it as a table, whereas pprint() shows the quoted repr with '\n'.
# (classification_report is imported in the notebook's top import cell.)
print(classification_report(actual_y, pred_y))


('              precision    recall  f1-score   support\n'
 '\n'
 '           0       0.90      0.89      0.90      2436\n'
 '           1       0.91      0.92      0.92      2929\n'
 '\n'
 '    accuracy                           0.91      5365\n'
 '   macro avg       0.91      0.91      0.91      5365\n'
 'weighted avg       0.91      0.91      0.91      5365\n')


` The classification report provides an evaluation of a binary classification model's performance. With an accuracy of 91%, the model demonstrates high precision and recall for both classes (0 and 1), indicating effective predictions. The weighted average F1-score of 0.91 reflects the model's balanced performance across classes, yielding robust overall predictions on a dataset of 5365 instances. `

# Results Visualization

In [157]:
# Raw tally of predicted class ids, e.g. {1: ..., 0: ...}.
pre_counter = dict(Counter(pred_y))
print(pre_counter)

# Re-key the tally by readable label. .get(..., 0) keeps this working even if
# one class was never predicted.
label_names = {1: 'positive', 0: 'negative'}
pre_counter = {name: pre_counter.get(class_id, 0) for class_id, name in label_names.items()}

data_list = [{'sentiment': key, 'count': value} for key, value in pre_counter.items()]
fig = px.pie(data_list, values='count', names='sentiment', title='prediction Sentiment Pie Chart')
fig.show()


{1: 2960, 0: 2405}


#### In the prediction chart we can see that the positive share is slightly higher than in the actual pie chart (about 55.2% vs 54.6%), so the negative share has fallen from about 45.4% to about 44.8%

# Make prediction

In [145]:
# Try the classifier on one ad-hoc sentence and show only the text label.
input_text = "what is photosynthesis"
predicted_label, _ = pred(input_text)
print(predicted_label)

positive


# Write-up
` In this project we first import the necessary libraries, including pandas, numpy, plotly, BeautifulSoup and sentence-transformers. We then load the IMDB dataset with pandas; it contains 50000 movie reviews and the sentiment of each review. The reviews are the input variable and the sentiment is a binary target (output) class: positive or negative. We then applied the following preprocessing steps:`
* Remove html tags from the review texts using beautifulsoup 
* Remove any rows that contain missing (None/NaN) values
* Remove rows where the text is longer than 512 characters, since the model's maximum input size is 512 tokens

`After applying the preprocessing steps we visualized the data using plotly to see the distribution of the classes, and we found that around 55% of the reviews are positive and around 45% are negative. After that we used the dataset to evaluate how well our model is doing. I used a pretrained Hugging Face model, distilbert-base-uncased-finetuned-sst-2-english, via the transformers library, and predicted the texts under torch.inference_mode(). I then stored the prediction results in a list named pred_y to produce the classification report using the sklearn library. I got 91% accuracy using this model; after that I plotted the prediction results in a pie chart and found that the model performed quite well.`