## **Problem-3 Sentiment Analysis using IMDb Dataset**

#### **Importing the IMDb dataset**

In [None]:
#Importing the dataset
import os
import nltk 
import urllib.request as req
import tarfile
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('stopwords') 
from nltk.corpus import stopwords 

from nltk.stem.porter import PorterStemmer 

# Download the IMDb review archive (once) and unpack it next to the notebook.
imdb_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

aclImdb_file = "aclImdb_v1.tar.gz"
if not os.path.exists(aclImdb_file):
    # Download under a temporary name and rename only on success; otherwise an
    # interrupted download would leave a truncated archive that the existence
    # check above would treat as complete on the next run.
    partial_download = aclImdb_file + ".part"
    req.urlretrieve(imdb_url, partial_download)
    os.replace(partial_download, aclImdb_file)

unzip_folder = "aclImdb"
if not os.path.exists(unzip_folder):
    with tarfile.open(aclImdb_file) as tar:
        # The archive is fetched over plain HTTP, so treat it as untrusted:
        # where the interpreter supports extraction filters, refuse members
        # that would escape the target directory or create special files.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(filter="data")
        else:
            tar.extractall()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def get_reviews(data_folder="/train"):
    """Load, clean, tokenise and stem every review of one dataset split.

    Files are read from ``unzip_folder + data_folder + "/neg/"`` and
    ``... + "/pos/"`` (in that order, file names sorted within each folder).
    Each review is lower-cased, ``<br />`` tags and every character other than
    ``a-z`` / space are replaced by a space, English stop words are removed and
    the remaining words are stemmed with the Porter stemmer.

    Args:
        data_folder: Sub-folder of the extracted dataset, with a leading slash
            (e.g. ``"/train"`` or ``"/test"``).

    Returns:
        A tuple ``(reviews, labels)`` where ``reviews`` is a list with one list
        of stemmed tokens per file and ``labels`` is a NumPy array with one
        one-hot row per file: ``[1, 0]`` for ``neg`` and ``[0, 1]`` for ``pos``.
    """
    reviews = []
    labels = []
    ps = PorterStemmer()
    # Build the stop-word set once; rebuilding it for every word of every
    # review dominates the running time of this function.
    stop_words = set(stopwords.words('english'))
    for index, sentiment in enumerate(["/neg/", "/pos/"]):
        path = unzip_folder + data_folder + sentiment
        for filename in sorted(os.listdir(path)):
            # Explicit encoding so the result does not depend on the platform's
            # default locale (assumes the corpus files are UTF-8 -- TODO confirm).
            with open(path + filename, 'r', encoding='utf-8') as f:
                review = f.read()
            review = review.lower()
            review = review.replace("<br />", " ")
            review = re.sub(r"[^a-z ]", " ", review)
            # split() with no argument collapses runs of spaces and, unlike
            # split(" "), never yields empty-string tokens for leading or
            # trailing spaces.
            tokens = [ps.stem(word) for word in review.split() if word not in stop_words]
            reviews.append(tokens)

            # One-hot label: position 0 = negative, position 1 = positive.
            label = [0, 0]
            label[index] = 1
            labels.append(label)
    return reviews, np.array(labels)

def get_test_reviews(data_folder="/test"):
    """Load and preprocess the reviews of the test split.

    The test split must be preprocessed exactly like the training split, so
    this simply delegates to ``get_reviews`` with a different default folder
    instead of keeping a second copy of the same pipeline.

    Args:
        data_folder: Sub-folder of the extracted dataset, with a leading slash.

    Returns:
        The ``(reviews, labels)`` tuple produced by ``get_reviews`` for
        ``data_folder``.
    """
    return get_reviews(data_folder)

In [None]:

# Preprocess the training split, then sanity-check it: the number of reviews
# and labels, followed by the first two reviews and the first two labels.
train_reviews, train_labels = get_reviews()
print(len(train_reviews), len(train_labels), sep="\n")
print(train_reviews[0], train_reviews[1], train_labels[0], train_labels[1], sep="\n")

25000
25000
['stori', 'man', 'unnatur', 'feel', 'pig', 'start', 'open', 'scene', 'terrif', 'exampl', 'absurd', 'comedi', 'formal', 'orchestra', 'audienc', 'turn', 'insan', 'violent', 'mob', 'crazi', 'chant', 'singer', 'unfortun', 'stay', 'absurd', 'whole', 'time', 'gener', 'narr', 'eventu', 'make', 'put', 'even', 'era', 'turn', 'cryptic', 'dialogu', 'would', 'make', 'shakespear', 'seem', 'easi', 'third', 'grader', 'technic', 'level', 'better', 'might', 'think', 'good', 'cinematographi', 'futur', 'great', 'vilmo', 'zsigmond', 'futur', 'star', 'salli', 'kirkland', 'freder', 'forrest', 'seen', 'briefli', '']
['airport', 'start', 'brand', 'new', 'luxuri', 'plane', 'load', 'valuabl', 'paint', 'belong', 'rich', 'businessman', 'philip', 'steven', 'jame', 'stewart', 'fli', 'bunch', 'vip', 'estat', 'prepar', 'open', 'public', 'museum', 'also', 'board', 'steven', 'daughter', 'juli', 'kathleen', 'quinlan', 'son', 'luxuri', 'jetlin', 'take', 'plan', 'mid', 'air', 'plane', 'hi', 'jack', 'co', 'pilo

In [None]:
# Preprocess the test split and report how many reviews and labels were loaded.
test_reviews, test_labels = get_test_reviews()
print(len(test_reviews), len(test_labels), sep="\n")

25000
25000


In [None]:
from keras.preprocessing.text import Tokenizer
NB_WORDS = 10000  # Number of most frequent words kept in the tokenizer's dictionary
# The filter string is written with an escaped backslash ('\\]') because '\]'
# is an invalid escape sequence, and with '{|}' rather than '{"}' so that it
# matches the punctuation set Keras documents as the Tokenizer default.
tk = Tokenizer(
    num_words=NB_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
)
# Build the word index from the training reviews only, then map every training
# review to its sequence of word ids.
tk.fit_on_texts(train_reviews)
X_train_seq = tk.texts_to_sequences(train_reviews)



In [None]:
# Show the id sequences of the first two training reviews, one per line.
print(X_train_seq[0], X_train_seq[1], sep="\n")

[13, 55, 5318, 62, 2748, 86, 246, 18, 1143, 355, 1266, 104, 6805, 5694, 177, 94, 1461, 958, 2502, 796, 6189, 1411, 352, 435, 1266, 143, 6, 256, 1157, 705, 8, 139, 14, 861, 94, 9493, 335, 15, 8, 1606, 39, 687, 711, 6371, 1025, 448, 58, 155, 30, 7, 565, 615, 26, 615, 76, 2867, 8582, 5380, 47, 2868, 4]
[3653, 86, 2706, 80, 4775, 1327, 1543, 3613, 974, 1663, 869, 4037, 2378, 1472, 502, 1103, 802, 675, 2855, 1371, 246, 826, 3458, 27, 1482, 1472, 418, 1966, 5566, 374, 4775, 48, 655, 1520, 661, 1327, 4921, 583, 850, 1392, 5079, 491, 42, 7858, 9494, 2169, 402, 1557, 3704, 918, 945, 2105, 655, 897, 3613, 8791, 911, 1327, 2070, 2513, 901, 8, 3791, 5079, 138, 378, 2674, 6908, 3107, 668, 797, 1327, 1177, 1383, 1639, 2287, 1180, 112, 2644, 570, 4513, 661, 257, 3044, 790, 7024, 1318, 188, 202, 4514, 2308, 3285, 162, 6, 622, 173, 27, 496, 988, 148, 3653, 206, 525, 2911, 378, 1400, 521, 3653, 96, 1320, 7025, 5, 3441, 38, 3653, 326, 1408, 237, 161, 186, 2365, 112, 133, 209, 3653, 2, 47, 150, 68, 5, 3, 

In [None]:
VAL_SIZE = 1000  # Size of the validation set
NB_START_EPOCHS = 10  # Number of epochs we usually start to train with
BATCH_SIZE = 512  # Size of the batches used in the mini-batch gradient descent
MAX_LEN = 100  # Maximum number of words in a sequence
GLOVE_DIM = 100  # Number of dimensions of the GloVe word embeddings

# The embedding layer's vocabulary size has to agree with the tokenizer's
# num_words, and its input length with the padded sequence length, so derive
# both from the constants that drive the tokenizer / padding instead of
# repeating the literals.
max_features = NB_WORDS
maxlen = MAX_LEN
batch_size = 64
embedding_dims = 16
filters = 128  # Number of filters of the first convolution
kernel_size = 3
epochs = 3
#Equal length of sequences:
from keras.preprocessing.sequence import pad_sequences

# Pad / truncate every training sequence to exactly MAX_LEN ids.
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=MAX_LEN)
print(X_train_seq_trunc[0])
print(X_train_seq_trunc[10])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0   13   55 5318
   62 2748   86  246   18 1143  355 1266  104 6805 5694  177   94 1461
  958 2502  796 6189 1411  352  435 1266  143    6  256 1157  705    8
  139   14  861   94 9493  335   15    8 1606   39  687  711 6371 1025
  448   58  155   30    7  565  615   26  615   76 2867 8582 5380   47
 2868    4]
[  59 3615   92 7148 1502 4039  172  786    1 1190   33  274 1198  147
 1520 2440  786 3288  877  539   24  116  339 5080  824 1125 2277   99
   71   49  885  477  331  946    3  952  147    7  133   73  199  260
 3418   30   13  757   96  287   40   60  175    1   24  158   22 2322
  167    2  329  209  375  348 2408 4822 3821 8203  913 1822  209  367
    8    1 3123   99  209  220 2277 1301   16   24    2    3   36   56
    7  118 1118 1939  119  834    5  209  348   12   67  260    1

In [None]:
from keras import models
from keras import layers
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPool1D, Flatten, MaxPool1D, Dropout
# 1-D convolutional sentiment classifier: embedding -> four stacked
# convolutions -> global max pooling -> dropout -> two dense layers.
epochs = 5
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1))
# `border_mode` is the Keras 1 spelling of this argument; current Keras only
# accepts `padding`. No activation is passed to these three layers.
model.add(Conv1D(64, 3, padding='same'))
model.add(Conv1D(32, 3, padding='same'))
model.add(Conv1D(16, 3, padding='same'))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.2))
model.add(Dense(128, activation='sigmoid'))
# Two output units, one per column of the one-hot labels.
model.add(Dense(2, activation='sigmoid'))





In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Save the compiled model to disk. The return value of save() must not be
# assigned back to `model`: save() returns None, which would discard the model.
model.save('/content/20868337_NLP_model')
print(X_train_seq_trunc.shape)
print(train_labels)

(25000, 100)
[[1 0]
 [1 0]
 [1 0]
 ...
 [0 1]
 [0 1]
 [0 1]]


In [None]:
from tensorflow import keras
saved_model = keras.models.load_model('/content/20868337_NLP_model')
history = saved_model.fit(X_train_seq_trunc, train_labels, batch_size=batch_size, epochs=epochs)
# Write the trained weights back to the same path; otherwise anything that
# later reloads the model from disk gets the weights as they were before fit().
saved_model.save('/content/20868337_NLP_model')
# Keep the History object as the cell's displayed value.
history


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f1450358128>

In [None]:
from keras.preprocessing.text import Tokenizer
NB_WORDS = 10000  # Parameter indicating the number of words we'll put in the dictionary i.e ->most frequently used words
# Encode the test reviews with the tokenizer `tk` that was fitted on the
# training reviews. Fitting a second tokenizer on the test set would assign
# different ids to the same words, so the ids fed to the model would no longer
# match the ids its embedding layer was trained on.
X_test_seq = tk.texts_to_sequences(test_reviews)

print(X_test_seq[0])
print(X_test_seq[1])

VAL_SIZE = 1000  # Size of the validation set
NB_START_EPOCHS = 10  # Number of epochs we usually start to train with
BATCH_SIZE = 512  # Size of the batches used in the mini-batch gradient descent
MAX_LEN = 100  # Maximum number of words in a sequence
GLOVE_DIM = 100  # Number of dimensions of the GloVe word embeddings

max_features = 10000
maxlen = 100
batch_size = 64
embedding_dims = 16
filters = 128
kernel_size = 3
epochs = 3
#Equal length of sequences:
from keras.preprocessing.sequence import pad_sequences

# Pad / truncate the test sequences to the same length as the training ones.
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=MAX_LEN)
print(X_test_seq_trunc[0])
print(X_test_seq_trunc[10])


from tensorflow import keras
# Evaluates whatever model is currently stored at this path.
reconstructed_NLP_model = keras.models.load_model('/content/20868337_NLP_model')
score = reconstructed_NLP_model.evaluate(X_test_seq_trunc, test_labels, batch_size=32, verbose=1)

print(reconstructed_NLP_model.metrics_names)
print(score)
# '{:.2f}' (two decimals) instead of '{:2f}', which only sets a minimum width.
print("NLP model - Testing loss = {:.2f}".format(score[0]))
print("NLP model - Testing accuracy = {:.2f}".format(score[1]*100))

[313, 4522, 946, 1, 141, 1009, 1627, 1034, 1221, 1185, 1365, 349, 243, 9, 104, 1073, 3246, 4522, 9, 438, 352, 1372, 21, 245, 6, 243, 9, 16, 243, 6863, 6266, 5406, 213, 47, 132, 31, 57, 163, 257, 107, 19, 1133, 8813, 3246, 5691, 190, 623, 4522, 137, 18, 437, 216, 33, 90, 2797, 4522, 140, 104, 5406, 1073, 490, 5406, 1866, 52, 2213, 5771, 923, 37, 184, 95, 234, 4]
[364, 450, 108, 2, 250, 185, 16, 81, 195, 11, 134, 229, 825, 3394, 297, 1488, 1488, 9054, 7871, 41, 6, 1723, 29, 29, 18, 705, 3, 27, 10, 93, 567, 431, 11, 93, 878, 4033, 1488, 1328, 1962, 1946, 395, 1975, 1488, 9054, 10, 72, 389, 1488, 346, 470, 327, 585, 8, 2, 3372, 2315, 60, 111, 1811, 695, 4997, 2, 1811, 136, 30, 2136, 9, 542, 8585, 84, 2, 44, 695, 4997, 30, 2136, 1347, 9, 145, 542, 9, 237, 35, 630, 373, 221, 217, 108, 998, 3276, 57, 2, 10, 16, 46, 10, 3, 11, 5259, 212, 1253, 7550, 620, 57, 29, 57, 126, 34, 35, 195, 11, 462, 241, 269, 565, 133, 2137, 47, 325, 8, 346, 2, 100, 4]
[   0    0    0    0    0    0    0    0    0   

In [None]:
import tensorflow as tf

from keras.datasets import imdb

# Load the IMDB dataset through tf.keras.datasets, spelling out every loader
# option, and unpack it into train / test inputs and labels.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(
    path='imdb.npz',
    num_words=None,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3,
)
# One shape per line: train inputs, train labels, test inputs, test labels.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

Using TensorFlow backend.


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
(25000,)
(25000,)
(25000,)
(25000,)


In [None]:
import tensorflow as tf

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPool1D
from keras.datasets import imdb

# Baseline CNN trained on the IMDB dataset that ships with Keras.
max_features = 5000  # Vocabulary size requested from the loader and used by the embedding
maxlen = 100
batch_size = 64
embedding_dims = 16
filters = 128
kernel_size = 3
epochs = 3

# Only the training split is used here; the test split is discarded.
(x_train, y_train), (_, _) = imdb.load_data(num_words=max_features)

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)

# Use the `Sequential` imported from `keras.models` so the model comes from
# the same package as the layers added to it (all imported from
# `keras.layers`); combining a tf.keras model with standalone-keras layers
# mixes two separate implementations.
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(128, activation='relu'))
# Single sigmoid unit for the binary label returned by the loader.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)




  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7f2129347550>