Upload kaggle DataSet

In [6]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Importing IMDB DataSet

In [7]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


Extract Zip DataSet

In [8]:
from zipfile import ZipFile
dataset = '/content/imdb-dataset-of-50k-movie-reviews.zip'

with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print('The dataset is extracted')

The dataset is extracted


Importing Dependencies

In [9]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
# Printing the stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Processing

In [12]:
# Loading the dataset to Pandas Dataframe
imdb_dataset = pd.read_csv('/content/IMDB Dataset.csv', encoding='ISO-8859-1')

In [13]:
# checking the Rows and Columns in the DataSet
imdb_dataset.shape

(50000, 2)

In [14]:
# Printing the first 5 rows
imdb_dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [15]:
# Counting the missing Values in the Dataset
imdb_dataset.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [16]:
# Checking the distribution of sentiment column
imdb_dataset.sentiment.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


Converting the "positive" sentiment to "1" and "Negative" sentiment to "0"

In [17]:
imdb_dataset.loc[imdb_dataset.sentiment == 'positive', 'sentiment'] = 1
imdb_dataset.loc[imdb_dataset.sentiment == 'negative', 'sentiment'] = 0

In [18]:
# Checking the distribution of sentiment column
imdb_dataset.sentiment.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,25000
0,25000


**Stemming-**
Stemming is a process of reducing a word to its root word.

In [19]:
port_stem = PorterStemmer()

In [20]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [21]:
imdb_dataset['review'] = imdb_dataset['review'].apply(stemming)

In [22]:
imdb_dataset.head()

Unnamed: 0,review,sentiment
0,one review mention watch oz episod hook right ...,1
1,wonder littl product br br film techniqu unass...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


In [23]:
print(imdb_dataset['review'])

0        one review mention watch oz episod hook right ...
1        wonder littl product br br film techniqu unass...
2        thought wonder way spend time hot summer weeke...
3        basic famili littl boy jake think zombi closet...
4        petter mattei love time money visual stun film...
                               ...                        
49995    thought movi right good job creativ origin fir...
49996    bad plot bad dialogu bad act idiot direct anno...
49997    cathol taught parochi elementari school nun ta...
49998    go disagre previou comment side maltin one sec...
49999    one expect star trek movi high art fan expect ...
Name: review, Length: 50000, dtype: object


In [24]:
print(imdb_dataset['sentiment'])

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: object


In [25]:
# Seperating the review and sentiment column
rev = imdb_dataset['review'].values
senti = imdb_dataset['sentiment'].values

In [26]:
print(rev)

['one review mention watch oz episod hook right exactli happen br br first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word br br call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away br br would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch d

In [27]:
print(senti)

[1 1 1 ... 0 0 0]


Splitting the data to Training Data and Test Data

In [28]:
rev_train, rev_test, senti_train, senti_test = train_test_split(rev, senti, test_size=0.2, stratify=senti, random_state=2)

In [29]:
print(rev.shape, rev_train.shape, rev_test.shape)

(50000,) (40000,) (10000,)


In [30]:
# Converting Textual data to numerical Data
vectorizer = TfidfVectorizer()
rev_train = vectorizer.fit_transform(rev_train)
rev_test = vectorizer.transform(rev_test)

In [31]:
print(rev_train)

  (0, 3640)	0.03369748581843012
  (0, 37794)	0.03083884095997656
  (0, 23417)	0.04330166343639411
  (0, 25476)	0.042552168732521496
  (0, 18686)	0.047277710206494614
  (0, 16974)	0.027348303334255874
  (0, 56012)	0.052363400207014466
  (0, 19596)	0.05950464045530937
  (0, 52747)	0.056487226805711735
  (0, 26032)	0.040766971865625515
  (0, 42881)	0.06145111662065541
  (0, 20792)	0.06145111662065541
  (0, 18088)	0.042021614137502016
  (0, 728)	0.06392855390164114
  (0, 52074)	0.035773458754654174
  (0, 54476)	0.05496598602654393
  (0, 26330)	0.0998989012416776
  (0, 52130)	0.09734562377585103
  (0, 22801)	0.05576915489324283
  (0, 23266)	0.08059261006839524
  (0, 17657)	0.03572860021220714
  (0, 45184)	0.04941248754709795
  (0, 423)	0.030658245414730823
  (0, 19803)	0.05182623004500427
  (0, 14202)	0.04570262647024532
  :	:
  (39999, 8037)	0.09350315298732075
  (39999, 55194)	0.10976277115457879
  (39999, 6618)	0.2286782931717139
  (39999, 3695)	0.0606538300970193
  (39999, 60138)	0.0994

In [32]:
print(rev_test)

  (0, 61432)	0.09141811373826982
  (0, 61114)	0.1174132492083025
  (0, 59344)	0.10700845344586717
  (0, 58783)	0.12363485820831004
  (0, 56499)	0.18110738431454657
  (0, 55822)	0.10561543140341557
  (0, 54399)	0.3145561958738276
  (0, 53560)	0.2375648353882312
  (0, 52781)	0.13742298480810636
  (0, 51115)	0.12129018296972087
  (0, 48768)	0.2058067727170534
  (0, 48461)	0.11424719075050617
  (0, 48020)	0.11315434122721028
  (0, 43031)	0.2742291272890136
  (0, 42615)	0.12919581710290015
  (0, 42589)	0.10897697675230389
  (0, 42083)	0.11935293034502802
  (0, 40466)	0.13127424030091048
  (0, 39234)	0.044673568627083486
  (0, 38467)	0.12467261061038308
  (0, 36569)	0.08269786505890776
  (0, 36293)	0.13981386734455453
  (0, 35829)	0.2683008477893497
  (0, 34112)	0.0936710873257877
  (0, 32247)	0.11509065713608657
  :	:
  (9999, 19866)	0.10699363512163701
  (9999, 19048)	0.06460733270722972
  (9999, 18264)	0.07490611805058896
  (9999, 17647)	0.061526361534641055
  (9999, 17234)	0.091701987193

Training the Machine Learning Model

Logistic Regerssion

In [33]:
senti_train = senti_train.astype(int)
senti_test = senti_test.astype(int)

In [34]:
model=LogisticRegression(max_iter=1000)

In [35]:
model.fit(rev_train, senti_train)

Model Evaluation

Accuracy Score

In [36]:
# Accuracy Score on the Training Data
rev_train_pred = model.predict(rev_train)
train_data_accuracy = accuracy_score(rev_train_pred, senti_train)
f1_train_data_accuracy = f1_score(rev_train_pred, senti_train)

In [37]:
print('Accuracy Score on Training Data: ', train_data_accuracy)
print('F1 Score on Training Data: ', f1_train_data_accuracy)

Accuracy Score on Training Data:  0.926125
F1 Score on Training Data:  0.9268111455108359


In [38]:
# Accuracy Score on the Test Data
rev_test_pred = model.predict(rev_test)
test_data_accuracy = accuracy_score(rev_test_pred, senti_test)
f1_test_data_accuracy = f1_score(rev_test_pred, senti_test)

In [39]:
print('Accuracy Score on Test Data: ', test_data_accuracy)
print('F1 Score on Test Data: ', f1_test_data_accuracy)

Accuracy Score on Test Data:  0.8885
F1 Score on Test Data:  0.8907826427661867


Model Accuracy = 88.85 %

In [40]:
import pickle

In [41]:
filename = 'IMDB_trained_model1.pkl'
pickle.dump(model, open(filename, 'wb'))

Using the Saved model for Future Predictions

In [None]:
# Loading the model
loaded_model = pickle.load(open('IMDB_trained_model.sav', 'rb'))

In [None]:
newData = rev_test[100]
print(senti_test[100])

prediction = loaded_model.predict(newData)
print(prediction)

if (prediction[0]==0):
  print('The review is Negative')

else:
  print('The review is Positive')

1
[1]
The review is Positive
