In [19]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd


#nltk
import nltk

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for part-of-speech tagging
from nltk import pos_tag

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# BeautifulSoup libraray
from bs4 import BeautifulSoup

import re # regex

#model_selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn.metrics import accuracy_score,roc_auc_score 
from sklearn.metrics import classification_report

#preprocessing scikit

#classifiaction.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
 
#stop-words
stop_words=set(nltk.corpus.stopwords.words('english'))

#keras
# import keras
from tensorflow import keras
from tensorflow.keras.preprocessing.text import one_hot,Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Flatten ,Embedding,Input,LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import text_to_word_sequence

#gensim w2v
#word2vec
from gensim.models import Word2Vec

In [20]:
rev_frame=pd.read_csv(r'./imdb.csv')
df=rev_frame.copy()


In [21]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [22]:
df= df[["review","sentiment"]]
# df["review"]=df["review"]
df["rating"]=df["sentiment"].apply(lambda x: 1 if x=="positive" else 0)
df.drop("sentiment",axis=1,inplace=True)
df.head()

Unnamed: 0,review,rating
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [23]:
print(df.shape)
df.head()

(50000, 2)


Unnamed: 0,review,rating
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [24]:
df["rating"].isnull().sum()
df.drop_duplicates(subset=["rating","review"],keep="first",inplace=True)
print(df.shape)
df.head()


(49582, 2)


Unnamed: 0,review,rating
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [25]:
for review in df['review'][:5]:
    print(review+'\n'+'\n')

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

In [26]:
df.head()

Unnamed: 0,review,rating
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [27]:
df["sentiment"]=df["rating"]

In [28]:
df.drop("rating",axis=1,inplace=True)

In [29]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [30]:
df["sentiment"].value_counts()  

1    24884
0    24698
Name: sentiment, dtype: int64

In [31]:
def clean_reviews(review):  
    
    # 1. Removing html tags
    review_text = BeautifulSoup(review,"lxml").get_text()
    
    # 2. Retaining only alphabets.
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    
    # 3. Converting to lower case and splitting
    word_tokens= review_text.lower().split()
    
    # 4. Remove stopwords
    le=WordNetLemmatizer()
    stop_words= set(stopwords.words("english"))     
    word_tokens= [le.lemmatize(w) for w in word_tokens if not w in stop_words]
    
    cleaned_review=" ".join(word_tokens)
    return cleaned_review

In [32]:
pos_df=df.loc[df.sentiment==1,:][:50000]
neg_df=df.loc[df.sentiment==0,:][:50000]

In [33]:
pos_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1


In [34]:
neg_df.head()

Unnamed: 0,review,sentiment
3,Basically there's a family where a little boy ...,0
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
10,Phil the Alien is one of those quirky films wh...,0
11,I saw this movie when I was about 12 when it c...,0


In [35]:
df=pd.concat([pos_df,neg_df],ignore_index=True)

In [36]:
print(df.shape)
df.head()

(49582, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,"Petter Mattei's ""Love in the Time of Money"" is...",1
4,"Probably my all-time favorite movie, a story o...",1


In [37]:
df = df.sample(frac=1).reset_index(drop=True)
print(df.shape) 
df.head()

(49582, 2)


Unnamed: 0,review,sentiment
0,I tried. God knows I tried to like this Swiss ...,0
1,A truly masterful piece of filmmaking. It mana...,0
2,"I loved the first ""American Graffiti"" with all...",0
3,"I'm sorry, but ""Star Wars Episode 1"" did not d...",1
4,...at least during its first half. If it had s...,0


In [38]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences=[]
sum=0
for review in df['review']:
    sents=tokenizer.tokenize(review.strip())
    sum+=len(sents)
    for sent in sents:
        cleaned_sent=clean_reviews(sent)
        sentences.append(cleaned_sent.split()) # can use word_tokenize also.
print(sum)
print(len(sentences)) 

532586
532586


In [39]:
for te in sentences[:5]:
    print(te,"\n")

['tried'] 

['god', 'know', 'tried', 'like', 'swiss', 'cheese', 'movie', 'story', 'full', 'hole', 'big', 'enough', 'drive', 'horse', 'drawn', 'carriage'] 

['acting', 'overall', 'even', 'character', 'endearing', 'enough', 'regretted', 'died', 'like', 'recently', 'sprayed', 'roach', 'scattering', 'die', 'gruesome', 'death'] 

['overall', 'however', 'really', 'scary'] 

['afterall', 'seen', 'spooky', 'quickly', 'moving', 'figure', 'background', 'since', 'brood', 'back', 'scary', 'briefly'] 



In [40]:
import gensim
w2v_model=gensim.models.Word2Vec(sentences=sentences,vector_size=300,window=10,min_count=1)

In [41]:
w2v_model.train(sentences,epochs=10,total_examples=len(sentences))

(55753861, 58626910)

In [42]:
w2v_model.wv.get_vector('like')

array([-1.37441111e+00,  3.08489472e-01, -8.67421508e-01,  3.66287649e-01,
        8.83665204e-01,  1.41539723e-01, -4.90213782e-01, -1.93543985e-01,
       -1.40334070e+00, -1.18361628e+00,  1.98303491e-01, -3.53452086e-01,
        4.69485074e-02,  6.41956508e-01, -7.04664946e-01,  2.26181120e-01,
        5.97196102e-01, -4.30685341e-01,  8.01428437e-01, -2.80584306e-01,
       -5.25208235e-01,  1.02479804e+00,  2.95345008e-01, -5.61985672e-02,
       -7.47274235e-02, -6.97148621e-01,  1.03884051e-02,  1.74897456e+00,
        8.10952485e-01, -1.88209936e-01, -2.48128384e-01,  2.86445826e-01,
       -7.94592857e-01,  1.65374860e-01, -1.51221845e-02, -2.05292739e-02,
       -5.61785400e-01,  9.42707121e-01, -6.10031486e-01,  3.21156591e-01,
        2.57688975e+00,  1.17260027e+00, -1.07648909e+00, -6.89576864e-01,
        4.52195019e-01,  5.45259286e-03, -1.65366983e+00,  4.57904279e-01,
        4.31134105e-01,  4.38673496e-01, -7.16490686e-01,  1.45987082e+00,
        3.65763932e-01,  

In [43]:
w2v_model.wv.similarity('good','like')

0.22066134

In [44]:
vocab=list(w2v_model.wv.key_to_index.keys())
print("The total number of words are : ",len(vocab))

The total number of words are :  91731


In [45]:
w2v_model.wv.most_similar('like')

[('alike', 0.4151833653450012),
 ('likethe', 0.40006572008132935),
 ('liqueur', 0.3776463568210602),
 ('reminded', 0.35796523094177246),
 ('fullframe', 0.34814512729644775),
 ('hey', 0.3428112268447876),
 ('wwwwwwwaaaaaaaaaaaayyyyyyyyyyy', 0.33833152055740356),
 ('cupidor', 0.33484983444213867),
 ('movieanyways', 0.33254683017730713),
 ('ulma', 0.3197789192199707)]

In [46]:
w2v_model.wv.similarity('good','like')

0.22066134

In [47]:
print("The no of words :",len(vocab))

The no of words : 91731


In [48]:
word_vec_dict={}
for word in vocab:
  word_vec_dict[word]=w2v_model.wv.get_vector(word)
print("The no of key-value pairs : ",len(word_vec_dict))


The no of key-value pairs :  91731


In [49]:

for word in vocab[:5]:
  print(word_vec_dict[word])

[-0.25376943  0.20972693 -0.18709087 -0.15608032 -0.14511885  0.14971085
  0.12208346 -0.5003075  -0.7488885   0.46729234  0.2679968   1.1100959
 -0.17435041 -0.26999182 -0.19655494 -0.5030448   1.9317528  -0.26780167
 -0.49663347 -1.1961558  -0.07004051  0.0118979  -0.45718047  0.4884793
  0.09873531  0.48907503  1.1510031   0.19667779 -0.73295015 -0.51429826
 -0.30763856  0.02015868  0.27912217 -0.47253174 -0.02807384  0.2922777
  1.8012608   0.25209442 -0.23698469 -0.15904188  1.139602    0.14612818
 -0.10012694 -0.02799961  0.42243105  1.3335336   0.3228245   0.9075558
 -0.99512845  1.2488867  -0.17912629 -0.13617982  0.08084754  0.7221096
  1.0414921  -0.24974643 -1.5298917  -0.10139339  0.42977327 -1.1003965
 -0.2610665  -0.24094835  0.22026208 -0.88645923 -0.62157494  0.18178257
 -0.7307668   0.55484295  1.5732284   0.1666967   1.1594497   0.1934297
 -1.0521734   0.01447807 -0.6823334  -0.8333639   0.14069392  0.5507962
 -0.19398601 -1.4705725   0.6528619  -0.14050595  0.1209913

In [50]:
df['clean_review']=df['review'].apply(clean_reviews)

In [51]:
maxi=-1
for i,rev in enumerate(df['clean_review']):
    tokens=rev.split()
    if(len(tokens)>maxi):
        maxi=len(tokens)
print(maxi)

1416


In [52]:
tok = Tokenizer()
tok.fit_on_texts(df['clean_review'])
vocab_size = len(tok.word_index) + 1
encd_rev = tok.texts_to_sequences(df['clean_review'])

In [53]:
max_rev_len=1565  # max lenght of a review
vocab_size = len(tok.word_index) + 1  # total no of words
embed_dim=300 # embedding dimension as choosen in word2vec constructor

In [54]:
pad_rev= pad_sequences(encd_rev, maxlen=max_rev_len, padding='post')
pad_rev.shape 

(49582, 1565)

In [55]:
embed_matrix=np.zeros(shape=(vocab_size,embed_dim))
for word,i in tok.word_index.items():
  embed_vector=word_vec_dict.get(word)
  if embed_vector is not None:  # word is in the vocabulary learned by the w2v model
    embed_matrix[i]=embed_vector

In [56]:
print(embed_matrix[14])

[-5.60551405e-01  1.08146620e+00 -2.47498894e+00  1.94485709e-01
 -6.96085513e-01 -7.61532605e-01 -5.96140862e-01 -2.55405843e-01
 -1.53241873e+00 -8.27321291e-01  1.17890753e-01  2.05288097e-01
 -7.03116536e-01  1.69238180e-01  8.45117867e-01 -6.23604715e-01
  4.45675641e-01 -1.44507974e-01  2.04112232e-01 -7.61811674e-01
  2.88676560e-01  2.30818462e+00 -9.49603140e-01  1.11701973e-01
 -1.54446708e-02 -1.22321999e+00 -1.02290046e+00  8.40136707e-01
  5.09345531e-01 -4.44202900e-01  7.62804151e-01 -1.10190190e-01
 -3.88903767e-01  9.06815827e-01 -3.75469893e-01  2.67408818e-01
 -1.14457250e-01  4.18266535e-01  3.67718428e-01 -7.28634357e-01
  1.23401976e+00  7.14513421e-01 -8.81322145e-01  7.30834246e-01
  1.91647753e-01  4.06617612e-01  7.13097095e-01 -9.98618174e-03
  9.47039604e-01 -3.31332326e-01 -5.61643064e-01  1.07307076e+00
  9.30146813e-01  3.13503981e-01  8.34563971e-01  2.23973137e-03
 -2.23661378e-01 -1.87813890e+00 -2.33411565e-01 -1.33717704e+00
  7.34487057e-01  7.95362

In [57]:
# prepare train and val sets first
Y=keras.utils.to_categorical(df['sentiment'])  # one hot target as required by NN.
x_train,x_test,y_train,y_test=train_test_split(pad_rev,Y,test_size=0.20,random_state=42)


In [58]:
from keras.initializers import Constant
from keras.layers import ReLU
from keras.layers import Dropout
model=Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=max_rev_len,embeddings_initializer=Constant(embed_matrix)))
# model.add(CuDNNLSTM(64,return_sequences=False)) # loss stucks at about 
model.add(Flatten())
model.add(Dense(16,activation='relu'))
model.add(Dropout(0.50))
# model.add(Dense(16,activation='relu'))
model.add(Dropout(0.20))
model.add(Dense(2,activation='sigmoid'))

In [59]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1565, 300)         27519600  
                                                                 
 flatten (Flatten)           (None, 469500)            0         
                                                                 
 dense (Dense)               (None, 16)                7512016   
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 2)                 34        
                                                                 
Total params: 35031650 (133.64 MB)
Trainable params: 350

In [60]:
from keras.optimizers import RMSprop
model.compile(optimizer="RMSprop",loss='binary_crossentropy',metrics=['accuracy'])


In [61]:
epochs=5
batch_size=64

In [62]:
model.fit(x_train,y_train,epochs=epochs,batch_size=batch_size,validation_data=(x_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fabbbba43a0>

In [63]:
model.evaluate(x_test,y_test)



[0.3870149850845337, 0.8504588007926941]

In [67]:
test_sent="The movie was very good and I loved it"
# test_sent="The movie was very bad and I hated it"
test_sent=clean_reviews(test_sent)
prediction = model.predict(pad_sequences(tok.texts_to_sequences([test_sent]),maxlen=max_rev_len,padding='post'))

label = ["NEgative", "Positive"]
print(label[np.argmax(prediction[0])])


Positive


: 