In [115]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import joblib

In [47]:
df = pd.read_csv('IMDB Dataset.csv')

In [48]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [49]:
df.shape

(50000, 2)

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [51]:
# As can be seen, the text contain HTML tags, we need to remove them

df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [52]:
def remove_html(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [53]:
def remove_newline(text):
    new = re.compile('\n')
    return new.sub(r'', text)

In [54]:
df['review'] = df['review'].apply(remove_newline)

In [55]:
remove_html(df['review'][0])

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.I would say the main appeal of the show is due to the fact that it goes where other shows wo

In [56]:
df['review'] = df['review'].apply(remove_html)

In [57]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [58]:
# Now, let's lowercase it

df['review'] = df['review'].apply(lambda x: x.lower())

In [59]:
df['review'][2]

'i thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. the plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). while some may be disappointed when they realize this is not match point 2: risk addiction, i thought it was proof that woody allen is still fully in control of the style many of us have grown to love.this was the most i\'d laughed at one of woody\'s comedies in years (dare i say a decade?). while i\'ve never been impressed with scarlet johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.this may not be the crown jewel of his career, but it was wittier than "devil wears prada" and more interesting than "superman" a great comedy to go see with friends.'

In [60]:
# Let's remove the punctuation now
punc = string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('','',punc))

In [61]:
df['review'] = df['review'].apply(remove_punc)

In [62]:
df['review'][0]

'one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with methe first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the wordit is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awayi would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forget pretty pictur

In [63]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [64]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [65]:
df['sentiment'] = df['sentiment'].map({'positive':1, 'negative':0})

In [66]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [67]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/adii/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [68]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/adii/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/adii/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [69]:
stop_words = set(stopwords.words('english'))

words_to_keep = [
    'no', 
    'not', 
    'nor', 
    'neither', 
    'never', 
    'none', 
    'don',   
    "don't",
    'ain',   
    'aren',  
    "aren't",
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'isn',
    "isn't",
    'mightn',
    "mightn't",
    'mustn',
    "mustn't",
    'needn',
    "needn't",
    'shan',
    "shan't",
    'shouldn',
    "shouldn't",
    'wasn',
    "wasn't",
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't"
]


for word in words_to_keep:
    stop_words.discard(word)

In [70]:
def remove_stopwords(text):
    new_text = []
    for word in word_tokenize(text):
        if word not in stop_words:
            new_text.append(word)
        
    return ' '.join(new_text)

In [71]:
remove_stopwords(df['review'][0])

'one reviewers mentioned watching 1 oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scenes violence set right word go trust not show faint hearted timid show pulls no punches regards drugs sex violence hardcore classic use wordit called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy not high agenda em city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far awayi would say main appeal show due fact goes shows wouldnt dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high levels graphic violence not violence injustice crooked guards wholl sold nickel inmates wholl kill order get away well m

In [72]:
df['review'] = df['review'].apply(remove_stopwords)

In [73]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1


In [74]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /home/adii/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/adii/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /home/adii/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/adii/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [75]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN



In [76]:
def lemmatize_with_pos(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    
    clean_tokens = []
    
    for word, tag in pos_tags:
        wn_tag = get_wordnet_pos(tag) 
        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        clean_tokens.append(lemma)
        
    return " ".join(clean_tokens)

In [77]:
df['review'] = df['review'].apply(lemmatize_with_pos)

In [78]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewer mention watch 1 oz episode youll ...,1
1,wonderful little production film technique una...,1
2,think wonderful way spend time hot summer week...,1
3,basically there family little boy jake think t...,0
4,petter matteis love time money visually stunni...,1


In [79]:
X = df['review']
y = df['sentiment']

X,y

(0        one reviewer mention watch 1 oz episode youll ...
 1        wonderful little production film technique una...
 2        think wonderful way spend time hot summer week...
 3        basically there family little boy jake think t...
 4        petter matteis love time money visually stunni...
                                ...                        
 49995    think movie right good job wasnt creative orig...
 49996    bad plot bad dialogue bad act idiotic direct a...
 49997    catholic teach parochial elementary school nun...
 49998    im go disagree previous comment side maltin on...
 49999    no one expect star trek movie high art fan exp...
 Name: review, Length: 50000, dtype: object,
 0        1
 1        1
 2        1
 3        0
 4        1
         ..
 49995    1
 49996    0
 49997    0
 49998    0
 49999    0
 Name: sentiment, Length: 50000, dtype: int64)

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [81]:
# Vectorization

tfidf = TfidfVectorizer(max_features=5000)

X_train_vect = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

In [82]:
# naive bayes Model 

naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_vect, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [83]:
naive_bayes_predict = naive_bayes.predict(X_test_vec)

In [84]:
# accuracy 
accuracy_nb = accuracy_score(y_test, naive_bayes_predict)
accuracy_nb

0.8493

In [109]:
# Logistic regression model
logistic_reg = LogisticRegression(max_iter=1000)
logistic_reg.fit(X_train_vect, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [110]:
logistic_reg_predict = logistic_reg.predict(X_test_vec)

In [111]:
accuracy_lr = accuracy_score(y_test, logistic_reg_predict)
accuracy_lr

0.8861

In [112]:
# svm

svm_model = LinearSVC(max_iter=1000)
svm_model.fit(X_train_vect,y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [113]:
svm_predict = svm_model.predict(X_test_vec)

In [114]:
# accuracy

accuracy_svm = accuracy_score(y_test, svm_predict)
accuracy_svm

0.882

### Accuracy

- Logistic Regression ~ 88.61%
- SVM ~ 88.2%
- Naive bayes ~ 84.93%

In [116]:
# dumping model

joblib.dump(logistic_reg, 'lr_model.pkl')
joblib.dump(tfidf, 'tf_vectorizer.pkl')

['tf_vectorizer.pkl']