# New Section

In [None]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import sequential
from keras.layers import Embedding,LSTM,Dense,Dropout
from sklearn.preprocessing import LabelEncoder
from keras.layers import Dense, SimpleRNN
import tensorflow as tf


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Mount Google Drive at /content/drive so the CSV stored there can be read.
from google.colab import drive
drive.mount('/content/drive')
# Load the IMDB review dataset into the DataFrame `dt`.
dt=pd.read_csv('/content/drive/MyDrive/Dataset/IMDB Dataset.csv')

Mounted at /content/drive


In [None]:
# Preview the first 10 rows of the loaded dataset.
dt.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [None]:
# Summary statistics (count / unique / top / freq) for the dataset columns.
dt.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [None]:
# Count the reviews per sentiment label to check the class balance.
dt['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [None]:
# Word tokenizer used by the text-cleaning helpers further down.
tokenizer = ToktokTokenizer()
# English stop words from the NLTK corpus, as a list.
stopword_list = stopwords.words('english')

In [None]:
# Look at the first 10 reviews before any cleaning is applied.
dt['review'].head(10)

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
5    Probably my all-time favorite movie, a story o...
6    I sure would like to see a resurrection of a u...
7    This show was an amazing, fresh & innovative i...
8    Encouraged by the positive comments about this...
9    If you like original gut wrenching laughter yo...
Name: review, dtype: object

In [None]:
#Removing the html strips
def strip_html(text):
    """Return `text` with HTML markup removed, keeping only its text content.

    The string is parsed with BeautifulSoup's "html.parser" backend and the
    result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span, brackets included, from `text`.

    A span starts at a ``[`` and runs up to the first following ``]``; it may
    contain newlines. An opening bracket with no closing bracket is left
    untouched.

    Args:
        text: Input string.

    Returns:
        `text` with all bracketed spans removed.
    """
    # Raw string literal: the previous non-raw '\[' / '\]' were invalid string
    # escape sequences, which Python reports as a DeprecationWarning
    # (SyntaxWarning from 3.12 on). The compiled regex is unchanged.
    return re.sub(r'\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    """Clean one review: strip HTML first, then drop ``[...]`` spans.

    Returns the cleaned string.
    """
    return remove_between_square_brackets(strip_html(text))
#Apply function on review column
# Note: this overwrites dt['review'] in place, so the raw text is no longer
# available afterwards without reloading the CSV.
dt['review']=dt['review'].apply(denoise_text)

In [None]:
# Check the first 10 reviews after HTML / bracket removal.
dt['review'].head(10)

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. The filming tec...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
5    Probably my all-time favorite movie, a story o...
6    I sure would like to see a resurrection of a u...
7    This show was an amazing, fresh & innovative i...
8    Encouraged by the positive comments about this...
9    If you like original gut wrenching laughter yo...
Name: review, dtype: object

In [None]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    """Strip every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: Input string.
        remove_digits: Accepted for backward compatibility but ignored, exactly
            as before: digits are always kept.
            NOTE(review): decide whether this flag should be honoured; doing so
            with the current default would change the preprocessing output.

    Returns:
        `text` with all other characters (punctuation, symbols, non-ASCII
        letters) removed.
    """
    # 'A-Z', not 'A-z': the range A-z also covers the ASCII characters that sit
    # between 'Z' and 'a' ([ \ ] ^ _ `), so those were never stripped.
    pattern=r'[^a-zA-Z0-9\s]'
    return re.sub(pattern,'',text)
#Apply function on review column
# Overwrites dt['review'] in place with the special-character-free text.
dt['review']=dt['review'].apply(remove_special_characters)

In [None]:
# Check the first 10 reviews after special-character removal.
dt['review'].head(10)

0    One of the other reviewers has mentioned that ...
1    A wonderful little production The filming tech...
2    I thought this was a wonderful way to spend ti...
3    Basically theres a family where a little boy J...
4    Petter Matteis Love in the Time of Money is a ...
5    Probably my alltime favorite movie a story of ...
6    I sure would like to see a resurrection of a u...
7    This show was an amazing fresh  innovative ide...
8    Encouraged by the positive comments about this...
9    If you like original gut wrenching laughter yo...
Name: review, dtype: object

In [None]:
#set stopwords to english
# Kept as a set so the per-token membership test below is a constant-time lookup.
stop=set(stopwords.words('english'))

# Dedicated word tokenizer for stop-word removal. The notebook later rebinds
# the global name `tokenizer` to a Keras Tokenizer, which has no .tokenize()
# method, so depending on that global made this cell fail when re-run later.
stopword_tokenizer=ToktokTokenizer()


#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    """Remove English stop words from `text`.

    Args:
        text: Input string; it is split into tokens with a ToktokTokenizer.
        is_lower_case: If True, tokens are compared against the stop-word set
            as they are. If False (default), each token is lower-cased for the
            comparison only; surviving tokens keep their original casing.

    Returns:
        The remaining tokens joined by single spaces.
    """
    tokens = [token.strip() for token in stopword_tokenizer.tokenize(text)]
    if is_lower_case:
        kept_tokens = [token for token in tokens if token not in stop]
    else:
        kept_tokens = [token for token in tokens if token.lower() not in stop]
    return ' '.join(kept_tokens)
#Apply function on review column
# Overwrites dt['review'] in place with the stop-word-free text.
dt['review']=dt['review'].apply(remove_stopwords)

In [None]:
# Check the first 10 reviews after stop-word removal.
dt['review'].head(10)

0    One reviewers mentioned watching 1 Oz episode ...
1    wonderful little production filming technique ...
2    thought wonderful way spend time hot summer we...
3    Basically theres family little boy Jake thinks...
4    Petter Matteis Love Time Money visually stunni...
5    Probably alltime favorite movie story selfless...
6    sure would like see resurrection dated Seahunt...
7    show amazing fresh innovative idea 70s first a...
8    Encouraged positive comments film looking forw...
9    like original gut wrenching laughter like movi...
Name: review, dtype: object

In [None]:
# One list of lower-cased word tokens per review, in the same order as dt['review'].
corpus = [
    [word.lower() for word in word_tokenize(text)]
    for text in dt['review']
]

In [None]:
# Vocabulary size handed to the Keras Tokenizer and the Embedding layers.
# The previous `len(corpus)` was the number of *reviews*, which only worked as
# a vocabulary cap by coincidence. Count the distinct tokens instead and cap
# the result so the embedding matrices keep the size they had before.
MAX_VOCAB_SIZE = 50000
vocabulary = {word for words in corpus for word in words}
# +1 because the Keras Tokenizer starts word indices at 1 (index 0 is reserved).
num_words = min(len(vocabulary) + 1, MAX_VOCAB_SIZE)
print(num_words)

50000


In [None]:
# Sequential 90/10 split: the first 90% of the rows are used for training and
# the remaining rows are held out. No shuffling is performed.
train_size = int(len(dt) * 0.9)

x_train, x_test = dt.review[:train_size], dt.review[train_size:]
y_train, y_test = dt.sentiment[:train_size], dt.sentiment[train_size:]

In [None]:
# Build the word index from the training reviews only, limited by `num_words`.
# NOTE: this rebinds `tokenizer`, which held the ToktokTokenizer until now.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(x_train)

# Encode every training review as a sequence of word ids, then pad / truncate
# at the end so that all sequences have exactly 128 entries.
x_train = pad_sequences(
    tokenizer.texts_to_sequences(x_train),
    maxlen=128,
    padding='post',
    truncating='post',
)

In [None]:
# Inspect the first encoded training review and its padded length.
x_train[0],len(x_train[0])

(array([    3,  1817,   934,    56,   397,  3080,   288,   343,  3044,
          106,   479,   470,  7262,    19,    57,  3118,  3080,  5429,
        14167,    50,   478,   179,   106,   559,    52,  1591,    41,
         8381,  5585, 11185,    41,  2269,  5724,  5402,  1333,   276,
          478,  3294,   244,   234,   352,  3080, 11031,   242, 14815,
         6620,  2436,   946,  2465,  1261, 24159,   420,  4694,  2325,
         1056,  6510,  2827, 12355,   293, 18637,   216,  4882,  3500,
          420,   232,  7807, 37767, 14492,  4855,  7883,  2257, 15907,
          225,  8965,  7021, 12135,  8216, 37768,    34,   130, 20596,
            7,    47,   168,  1158,    41,   545,    91,   158,   154,
          438,  2942,   700,    83,  1150,  4216,  2331,   944,   700,
         1275,   700,    58,   860,    88,    19,   288,    43,   105,
         3118,  1425,  2068,   290,    47,  1413,   172,  1337,  1141,
         3080,    89,  9554,   216,  1984,  2025,   478,   478,  7591,
      

In [None]:
# Encode the held-out reviews with the tokenizer fitted on the training data
# and pad / truncate them at the end to the same length of 128.
x_test = pad_sequences(
    tokenizer.texts_to_sequences(x_test),
    maxlen=128,
    padding='post',
    truncating='post',
)

In [None]:
# Inspect the first encoded test review and its padded length.
x_test[0],len(x_test[0])

(array([  383,     2,  1155,  3145,  6016,   574,   302, 17647,   550,
          122,   539,   332,   184,    11,    61,  9276,  2630,   757,
           32,  1810,   205,  3007, 14087,   923,  1486,  3145,   149,
          638,    35,   114,   157,    73,  7940,    23,  4950,  2348,
          166,    15,  6453,   282,  4144,  2429, 15124,   331,    35,
         8916,  1411, 23297,   274, 45764,  8752, 46269,   174,     5,
         3145,  1453,  5771,     3,  3145,  8270,   184,  3951,     3,
           22,     7,     4,   649,   268,    43,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [None]:
# Sanity check: features and labels of each split should have the same row count.
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(45000, 128) (45000,)
(5000, 128) (5000,)


In [None]:
# Turn the sentiment strings into integer class ids. The encoder is fitted on
# the training labels only and then reused unchanged for the test labels.
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [None]:
# A stray `model.summary()` used to live here, but no variable named `model`
# is defined in the visible notebook (the networks are lstm_model,
# simple_model, ...), so it raised NameError on a fresh Restart & Run All.
# Show the label -> class-id mapping the models are trained on instead.
dict(zip(le.classes_, le.transform(le.classes_)))

In [None]:
# Size of the learned word vectors. This variable used to be set to 300 but was
# never read, while the Embedding layer hard-coded 100; it now holds the value
# the model actually uses and is passed to the layer.
embedding_dim = 100
lstm_model = tf.keras.models.Sequential([
    # Trainable embedding over a vocabulary of `num_words` ids; inputs are
    # sequences of length 128.
    tf.keras.layers.Embedding(input_dim=num_words,output_dim=embedding_dim
                     ,input_length=128,trainable=True),
    # First LSTM returns the whole sequence so that the second LSTM can consume it.
    tf.keras.layers.LSTM(units=64, activation='tanh', return_sequences=True),
    tf.keras.layers.LSTM(units=32, activation='tanh'),

    # Single sigmoid unit for the binary sentiment label.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

lstm_model.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

# NOTE(review): the test split is also used as validation data here, so the
# reported validation accuracy is not an independent test estimate.
lstm_model_history = lstm_model.fit(x_train,y_train,
                                    validation_data=(x_test,y_test),
                                    epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Print the layer-by-layer architecture and parameter counts of the LSTM model.
lstm_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 128, 100)          5000000   
                                                                 
 lstm (LSTM)                 (None, 128, 64)           42240     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 5,054,689
Trainable params: 5,054,689
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Size of the learned word vectors. This variable used to be set to 300 but was
# never read, while the Embedding layer hard-coded 100; it now holds the value
# the model actually uses and is passed to the layer.
embedding_dim = 100
simple_model = tf.keras.models.Sequential([
    # Trainable embedding over a vocabulary of `num_words` ids; inputs are
    # sequences of length 128.
    tf.keras.layers.Embedding(input_dim=num_words,output_dim=embedding_dim
                     ,input_length=128,trainable=True),
    #tf.keras.layers.Flatten(),
    # Average the embeddings over the sequence axis: one vector per review.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),

    # Single sigmoid unit for the binary sentiment label.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

simple_model.compile(loss='binary_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])

# NOTE(review): the test split is also used as validation data here, so the
# reported validation accuracy is not an independent test estimate.
simple_model_history = simple_model.fit(x_train,y_train,
                                        validation_data=(x_test,y_test),
                                        epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Print the layer-by-layer architecture and parameter counts of the dense model.
simple_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 128, 100)          5000000   
                                                                 
 global_average_pooling1d (G  (None, 100)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_1 (Dense)             (None, 256)               25856     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                      

In [None]:
# Size of the learned word vectors. This variable used to be set to 300 but was
# never read, while the Embedding layer hard-coded 100; it now holds the value
# the model actually uses and is passed to the layer.
embedding_dim = 100
simple_rnn_model = tf.keras.models.Sequential([
    # Trainable embedding over a vocabulary of `num_words` ids; inputs are
    # sequences of length 128.
    tf.keras.layers.Embedding(input_dim=num_words,output_dim=embedding_dim
                     ,input_length=128,trainable=True),
    # First recurrent layer returns the whole sequence so that the second one
    # can consume it.
    tf.keras.layers.SimpleRNN(units=64, activation='tanh', return_sequences=True),
    tf.keras.layers.SimpleRNN(units=32, activation='tanh'),

    # Single sigmoid unit for the binary sentiment label.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

simple_rnn_model.compile(loss='binary_crossentropy',
                         optimizer='adam',
                         metrics=['accuracy'])

# NOTE(review): the test split is also used as validation data here, so the
# reported validation accuracy is not an independent test estimate.
simple_rnn_model_history = simple_rnn_model.fit(x_train,y_train,
                                                validation_data=(x_test,y_test),
                                                epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Print the layer-by-layer architecture and parameter counts of the SimpleRNN model.
simple_rnn_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 128, 100)          5000000   
                                                                 
 simple_rnn (SimpleRNN)      (None, 128, 64)           10560     
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                3104      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 5,013,697
Trainable params: 5,013,697
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Size of the learned word vectors. This variable used to be set to 300 but was
# never read, while the Embedding layer hard-coded 100; it now holds the value
# the model actually uses and is passed to the layer.
embedding_dim = 100
gru_model = tf.keras.models.Sequential([
    # Trainable embedding over a vocabulary of `num_words` ids; inputs are
    # sequences of length 128.
    tf.keras.layers.Embedding(input_dim=num_words,output_dim=embedding_dim
                     ,input_length=128,trainable=True),
    # First GRU returns the whole sequence so that the second GRU can consume it.
    tf.keras.layers.GRU(units=64, activation='tanh', return_sequences=True),
    tf.keras.layers.GRU(units=32, activation='tanh'),

    # Single sigmoid unit for the binary sentiment label.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

gru_model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

# NOTE(review): the test split is also used as validation data here, so the
# reported validation accuracy is not an independent test estimate.
gru_model_history = gru_model.fit(x_train,y_train,
                                  validation_data=(x_test,y_test),
                                  epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Size of the learned word vectors. This variable used to be set to 300 but was
# never read, while the Embedding layer hard-coded 100; it now holds the value
# the model actually uses and is passed to the layer.
embedding_dim = 100
conv1d_model = tf.keras.models.Sequential([
    # Trainable embedding over a vocabulary of `num_words` ids; inputs are
    # sequences of length 128.
    tf.keras.layers.Embedding(input_dim=num_words,output_dim=embedding_dim
                     ,input_length=128,trainable=True),

    # Two 1-D convolutions (64 then 32 filters, kernel size 7) with a
    # pool-size-5 max pooling in between; the global max pool reduces the
    # remaining sequence to one vector per review.
    tf.keras.layers.Conv1D(64, 7, activation='relu'),
    tf.keras.layers.MaxPool1D(5),
    tf.keras.layers.Conv1D(32, 7, activation='relu'),
    tf.keras.layers.GlobalMaxPool1D(),

    # Single sigmoid unit for the binary sentiment label.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

conv1d_model.compile(loss='binary_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])

# NOTE(review): the test split is also used as validation data here, so the
# reported validation accuracy is not an independent test estimate.
conv1d_model_history = conv1d_model.fit(x_train,y_train,
                                        validation_data=(x_test,y_test),
                                        epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
