In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
!pip install pyarrow




In [3]:
df=pd.read_parquet('imdb_processed.parquet')
df.head()


Unnamed: 0,review,sentiment,review_clean_basic,tokens,lemmas
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[reviewers, mentioned, watching, oz, episode, ...","[reviewer, mention, watch, oz, episode, ll, ho..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...,"[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su...","[think, wonderful, way, spend, time, hot, summ..."
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, s, family, little, boy, jake, thin...","[basically, s, family, little, boy, jake, thin..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,"[petter, mattei, s, love, time, money, visuall...","[petter, mattei, s, love, time, money, visuall..."


In [4]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   review              50000 non-null  str   
 1   sentiment           50000 non-null  str   
 2   review_clean_basic  50000 non-null  str   
 3   tokens              50000 non-null  object
 4   lemmas              50000 non-null  object
dtypes: object(2), str(3)
memory usage: 124.3+ MB


In [5]:
df.isnull().sum()

review                0
sentiment             0
review_clean_basic    0
tokens                0
lemmas                0
dtype: int64

No missing values found 

In [6]:
# check missing values
df[df.isnull().any(axis=1)]

Unnamed: 0,review,sentiment,review_clean_basic,tokens,lemmas


In [7]:
df.shape
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [8]:
df.columns

Index(['review', 'sentiment', 'review_clean_basic', 'tokens', 'lemmas'], dtype='str')

In [9]:
df.head()

Unnamed: 0,review,sentiment,review_clean_basic,tokens,lemmas
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[reviewers, mentioned, watching, oz, episode, ...","[reviewer, mention, watch, oz, episode, ll, ho..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...,"[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su...","[think, wonderful, way, spend, time, hot, summ..."
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, s, family, little, boy, jake, thin...","[basically, s, family, little, boy, jake, thin..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,"[petter, mattei, s, love, time, money, visuall...","[petter, mattei, s, love, time, money, visuall..."


In [10]:
# Convert sentiment to numeric positive as 1 label neg as 0 label
df['sentiment_label']=df['sentiment'].map({'positive':1,'negative':0})

In [11]:
df.head()

Unnamed: 0,review,sentiment,review_clean_basic,tokens,lemmas,sentiment_label
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[reviewers, mentioned, watching, oz, episode, ...","[reviewer, mention, watch, oz, episode, ll, ho...",1
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...,"[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn...",1
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su...","[think, wonderful, way, spend, time, hot, summ...",1
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, s, family, little, boy, jake, thin...","[basically, s, family, little, boy, jake, thin...",0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,"[petter, mattei, s, love, time, money, visuall...","[petter, mattei, s, love, time, money, visuall...",1


In [12]:
# Text Preprocessing
import re
def clean_text(text):
    # convert to lowercase
    text=text.lower()
    # remove punctuations,special characters
    text=re.sub(r'[^a-zA-Z\s]', '', text)
    # remove extra spaces 
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['review_clean']=df['review'].apply(clean_text)
df.head()

Unnamed: 0,review,sentiment,review_clean_basic,tokens,lemmas,sentiment_label,review_clean
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[reviewers, mentioned, watching, oz, episode, ...","[reviewer, mention, watch, oz, episode, ll, ho...",1,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...,"[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn...",1,a wonderful little production br br the filmin...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su...","[think, wonderful, way, spend, time, hot, summ...",1,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,"[basically, s, family, little, boy, jake, thin...","[basically, s, family, little, boy, jake, thin...",0,basically theres a family where a little boy j...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,"[petter, mattei, s, love, time, money, visuall...","[petter, mattei, s, love, time, money, visuall...",1,petter matteis love in the time of money is a ...


In [13]:
from sklearn.model_selection import train_test_split

# dep and indep features
X=df['review_clean']
y=df['sentiment_label']

# train test split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [14]:
X_train.head()

39087    thats what i kept asking myself during the man...
30893    i did not watch the entire movie i could not w...
45278    a touching love story reminiscent of in the mo...
16398    this latterday fulci schlocker is a totally ab...
13653    first of all i firmly believe that norwegian m...
Name: review_clean, dtype: str

In [15]:
y

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment_label, Length: 50000, dtype: int64

## Convert Text â†’ Numbers (TF-IDF)

Logistic Regression cannot read raw text.
We need TF-IDF vectorization:

In [16]:
# Convert everything to string
X_train = X_train.astype(str)
X_test = X_test.astype(str)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer=TfidfVectorizer(max_features=5000)
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)

In [24]:
X_test

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1147916 stored elements and shape (10000, 5000)>

In [25]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4580106 stored elements and shape (40000, 5000)>

As it is binary classification so we use Logistic Regression

## Train Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report

regression = LogisticRegression(max_iter=1000)
regression.fit(X_train,y_train)

y_pred = regression.predict(X_test)

print('Accuracy :',accuracy_score(y_test,y_pred))
print('Classification Report : ',classification_report(y_test,y_pred))

Accuracy : 0.8925
Classification Report :                precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [19]:
import joblib

joblib.dump(regression, "../models/sentiment_model.pkl")
joblib.dump(vectorizer, "../models/tfidf_vectorizer.pkl")

['../models/tfidf_vectorizer.pkl']