In [226]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [232]:
df = pd.read_csv("imdbDataset.csv").iloc[:10000]

In [233]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [234]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [235]:
df.duplicated().sum()

17

In [236]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [237]:
df.drop_duplicates(inplace=True)

In [238]:
def removeHtml(text):
    return re.sub(r"<.*?/>", '', text)
    return text

df.review = df.review.apply(removeHtml)

df.review = df.review.str.lower()

In [239]:
def removePunctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df.review = df.review.apply(removePunctuation)

In [240]:
def removeUrl(text):
    pattern = re.compile(r"https://\S+|http:\S+|www:\S+")
    text = pattern.sub(r'', text)
    return text

df.review = df.review.apply(removeUrl)

In [241]:
def removeStopwords(text):
    stops = stopwords.words('english')
    textIntolist = text.split(' ')
    words = [word for word in textIntolist if word not in stops]
    text = ' '.join(words)
    return text

df.review = df.review.apply(removeStopwords)

In [242]:
df.review[0]

'one reviewers mentioned watching 1 oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use wordit called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far awayi would say main appeal show due fact goes shows wouldnt dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards wholl sold nickel inmates wholl kill order get away well mannered middle 

In [243]:
lemmatizer = WordNetLemmatizer()
def lemmatization(text):
    text = text.split(' ')
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

df.review = df.review.apply(lemmatization)

In [244]:
df.review[0]

'one reviewer mentioned watching 1 oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use wordit called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home manyaryans muslim gangsta latino christian italian irish moreso scuffle death stare dodgy dealing shady agreement never far awayi would say main appeal show due fact go show wouldnt dare forget pretty picture painted mainstream audience forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard wholl sold nickel inmate wholl kill order get away well mannered middle class inmate turned prison b

In [245]:
df

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode y...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there family little boy jake think t...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
9995,fun entertaining movie wwii german spy julie a...,positive
9996,give break anyone say good hockey movie know m...,negative
9997,movie bad movie watching endless series bad ho...,negative
9998,movie probably made entertain middle school ea...,negative


In [246]:
from sklearn.preprocessing import LabelEncoder

In [247]:
encoder = LabelEncoder()

In [248]:
y = encoder.fit_transform(df.sentiment)

In [250]:
df.sentiment = y.copy()

In [251]:
df

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode y...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically there family little boy jake think t...,0
4,petter matteis love time money visually stunni...,1
...,...,...
9995,fun entertaining movie wwii german spy julie a...,1
9996,give break anyone say good hockey movie know m...,0
9997,movie bad movie watching endless series bad ho...,0
9998,movie probably made entertain middle school ea...,0


In [259]:
from sklearn.model_selection import train_test_split

X_train, x_test, Y_train, y_test = train_test_split(df[['review']], y, test_size=0.2, random_state=42)

In [260]:
X_train.shape

(7986, 1)

In [255]:
from sklearn.feature_extraction.text import CountVectorizer

In [256]:
cv = CountVectorizer()

In [277]:
X_train_bow = cv.fit_transform(X_train['review']).toarray()
x_test_bow = cv.transform(x_test['review']).toarray()

In [265]:
X_train

Unnamed: 0,review
6940,back 1993 sega released dull lackluster video ...
1919,10 viewing 20 year think crazy gang best effor...
718,hard praise film much cgi dragon well done lac...
8599,rented horrible movie worst think ever seen be...
7950,spoilersone worst film ive seen since last ye...
...,...
5737,cowboy james stewart walter brennan take herd ...
5194,movie look feel put together matter dayskind l...
5393,total crapi kind excited see film version seen...
860,production quite surprise absolutely love obsc...


In [267]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

In [271]:
gnb.fit(X_train_bow, Y_train)

In [272]:
cv.vocabulary_

{'back': 5241,
 '1993': 535,
 'sega': 51883,
 'released': 48450,
 'dull': 17770,
 'lackluster': 33064,
 'video': 63049,
 'game': 23784,
 'one': 41762,
 'biggest': 6820,
 'film': 21624,
 'time': 59459,
 'quickly': 47105,
 'realizing': 47786,
 'mistake': 38003,
 'hashed': 26644,
 'different': 16030,
 'version': 62922,
 'claiming': 11222,
 'would': 65563,
 'bigger': 6817,
 'tougher': 60167,
 'betterneither': 6674,
 'slow': 54019,
 'boring': 7766,
 'gamesyou': 23820,
 'choose': 10899,
 'either': 18449,
 'dr': 17355,
 'alan': 2503,
 'grant': 25252,
 'os': 42272,
 'raptor': 47524,
 'problem': 46161,
 'go': 24716,
 'around': 4267,
 'killing': 32512,
 'army': 4250,
 'guy': 25900,
 'weirdobviously': 64229,
 'learning': 33601,
 'first': 22032,
 'really': 47791,
 'dropped': 17611,
 'ball': 5512,
 'original': 42205,
 'release': 48449,
 'socalled': 54382,
 'rampage': 47413,
 'edition': 18268,
 'slowest': 54028,
 'sluggish': 54049,
 'dullest': 17777,
 'platformers': 44811,
 'ever': 19797,
 'played':

In [273]:
X_train_bow.shape

(7986, 66484)

In [274]:
X_train_bow[0]

array([0, 0, 0, ..., 0, 0, 0])

In [278]:
pred = gnb.predict(x_test_bow)

In [282]:
from sklearn.metrics import classification_report, confusion_matrix

In [281]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.61      0.71      0.66       985
           1       0.66      0.56      0.61      1012

    accuracy                           0.63      1997
   macro avg       0.64      0.63      0.63      1997
weighted avg       0.64      0.63      0.63      1997



In [285]:
confusion_matrix(y_test, pred)

array([[696, 289],
       [443, 569]])

In [298]:
cv = CountVectorizer(ngram_range=(1,2), max_features=5000)

In [299]:
X_train_bow = cv.fit_transform(X_train['review']).toarray()
x_test_bow = cv.transform(x_test['review']).toarray()

In [300]:
from sklearn.ensemble import RandomForestClassifier

In [301]:
rf = RandomForestClassifier()

In [302]:
rf.fit(X_train_bow, Y_train)
pred = rf.predict(x_test_bow)

In [303]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.83      0.84      0.84       985
           1       0.85      0.83      0.84      1012

    accuracy                           0.84      1997
   macro avg       0.84      0.84      0.84      1997
weighted avg       0.84      0.84      0.84      1997



In [304]:
cv.vocabulary_

{'back': 343,
 'released': 3659,
 'dull': 1294,
 'video': 4714,
 'game': 1823,
 'one': 3148,
 'biggest': 458,
 'film': 1606,
 'time': 4494,
 'quickly': 3538,
 'mistake': 2825,
 'different': 1163,
 'version': 4705,
 'would': 4925,
 'bigger': 457,
 'slow': 4071,
 'boring': 514,
 'choose': 749,
 'either': 1338,
 'dr': 1258,
 'alan': 156,
 'grant': 1934,
 'problem': 3470,
 'go': 1883,
 'around': 271,
 'killing': 2405,
 'army': 269,
 'guy': 1981,
 'learning': 2487,
 'first': 1696,
 'really': 3598,
 'dropped': 1283,
 'ball': 366,
 'original': 3200,
 'release': 3658,
 'socalled': 4087,
 'edition': 1324,
 'ever': 1438,
 'played': 3356,
 'video game': 4715,
 'film time': 1665,
 '10': 0,
 'viewing': 4721,
 '20': 26,
 'year': 4965,
 'think': 4453,
 'crazy': 982,
 'gang': 1825,
 'best': 434,
 'effort': 1332,
 'plot': 3367,
 'next': 3058,
 'button': 597,
 'indeed': 2220,
 'trio': 4579,
 'double': 1252,
 'act': 87,
 'thrown': 4483,
 'together': 4516,
 'mainly': 2663,
 'stage': 4170,
 'sometimes': 41

## TF-IDF

In [305]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [306]:
tfidf = TfidfVectorizer()

In [309]:
X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
x_test_tfidf = tfidf.transform(x_test['review']).toarray()


rf = RandomForestClassifier()

rf.fit(X_train_tfidf, Y_train)

pred = rf.predict(x_test_tfidf)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       985
           1       0.86      0.85      0.85      1012

    accuracy                           0.85      1997
   macro avg       0.85      0.85      0.85      1997
weighted avg       0.85      0.85      0.85      1997



In [310]:
confusion_matrix(y_test, pred)

array([[843, 142],
       [156, 856]])

In [429]:
import gensim
from gensim.utils import simple_preprocess
from nltk.tokenize import sent_tokenize, word_tokenize

In [430]:
story = []
for doc in df['review']:
    raw_sent = word_tokenize(doc)
    story.append(raw_sent)

In [478]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2, # a word needs to be present this many times in order to be included in the vocabulary.
    vector_size=300
)

In [479]:
model.build_vocab(story)

In [480]:
model.train(story, 
            total_examples=model.corpus_count, 
            epochs=model.epochs)

(5490921, 5986325)

In [481]:
len(model.wv.index_to_key)

31904

In [482]:
def Avg_word2Vec(doc):
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)

    

In [483]:
from tqdm import tqdm

In [484]:
X = []

In [485]:
for doc in tqdm(df.review.values):
    X.append(Avg_word2Vec(doc))

100%|██████████████████████████████████████| 9983/9983 [00:45<00:00, 218.03it/s]


In [486]:
X = np.array(X)

In [487]:
X.shape

(9983, 300)

In [488]:
from sklearn.preprocessing import LabelEncoder

In [489]:
encoder = LabelEncoder()

y = encoder.fit_transform(df['sentiment'])

In [490]:
y 

array([1, 1, 1, ..., 0, 0, 1])

In [491]:
X_train, x_test, Y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [492]:
rf = RandomForestClassifier()

In [493]:
rf.fit(X_train, Y_train)

pred = rf.predict(x_test)


print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.80      0.79      0.79       985
           1       0.80      0.80      0.80      1012

    accuracy                           0.80      1997
   macro avg       0.80      0.80      0.80      1997
weighted avg       0.80      0.80      0.80      1997

