In [117]:
from keras.datasets import imdb
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.utils import np_utils
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv('./IMDB Dataset.csv')

data['review'] = data['review'].str.lower()


In [4]:
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", 
             "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during",
             "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", 
             "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
             "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
             "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", 
             "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
             "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
             "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
             "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've",
             "your", "yours", "yourself", "yourselves" ]

In [6]:
def remove_stopwords(data):
    data['review without stopwords'] = data['review'].apply(lambda x : ' '.join([word for word in x.split() if word not in (stopwords)]))
    return data

def remove_tags(string):
    result = re.sub('<.*?>','',string)
    return result


In [7]:
data_without_stopwords = remove_stopwords(data)
data_without_stopwords['clean_review']= data_without_stopwords['review without stopwords'].apply(lambda cw : remove_tags(cw))
data_without_stopwords['clean_review'] = data_without_stopwords['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')

In [8]:
data_without_stopwords.head()

Unnamed: 0,review,sentiment,review without stopwords,clean_review
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching just 1 oz epi...,one reviewers mentioned watching just 1 oz epi...
1,a wonderful little production. <br /><br />the...,positive,wonderful little production. <br /><br />the f...,wonderful little production the filming techn...
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,thought wonderful way spend time hot summer we...
3,basically there's a family where a little boy ...,negative,basically family little boy (jake) thinks zomb...,basically family little boy jake thinks zomb...
4,"petter mattei's ""love in the time of money"" is...",positive,"petter mattei's ""love time money"" visually stu...",petter mattei s love time money visually stu...


In [9]:

reviews = data_without_stopwords['clean_review']
reviews

0        one reviewers mentioned watching just 1 oz epi...
1        wonderful little production  the filming techn...
2        thought wonderful way spend time hot summer we...
3        basically family little boy  jake  thinks zomb...
4        petter mattei s  love time money  visually stu...
                               ...                        
49995    thought movie right good job  wasn t creative ...
49996    bad plot  bad dialogue  bad acting  idiotic di...
49997    catholic taught parochial elementary schools n...
49998    going disagree previous comment side maltin on...
49999    no one expects star trek movies high art  fans...
Name: clean_review, Length: 50000, dtype: object

In [10]:
reviews_list = []
for i in range(len(reviews)):
    reviews_list.append(reviews[i])



In [11]:
sentiment = data_without_stopwords['sentiment']

In [12]:
y = np.array(list(map(lambda x: 1 if x=="positive" else 0, sentiment)))

In [13]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [14]:
X_train, X_test,Y_train, Y_test = train_test_split(reviews_list, y, test_size=0.2, random_state = 45)

In [66]:
featFile = open('train_tweets_us_all.text', "r", encoding="utf-8");
labelFile = open('train_tweets_us_all.labels', "r", encoding="utf-8");

testFeat = open('us_test.text', "r", encoding="utf-8");
testLabel = open('us_test.labels', "r", encoding="utf-8");


X_train = featFile.readlines();
Y_train = labelFile.readlines();

X_test = testFeat.readlines();
Y_test = testLabel.readlines();

In [67]:
len(Y_train)
len(X_train)

print(Y_train[0:5])

['17\n', '0\n', '18\n', '1\n', '9\n']


In [36]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

In [40]:
words_to_index = tokenizer.word_index

In [41]:
len(words_to_index)

297422

In [42]:
def read_glove_vector(glove_vec):
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            w_line = line.split()
            curr_word = w_line[0]
            word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)


    return word_to_vec_map
    

In [43]:
word_to_vec_map = read_glove_vector('./glove.6B.50d.txt')

In [74]:
maxLen = 50


In [75]:

vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        emb_matrix[index, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)






In [132]:
def imdb_rating(input_shape):

    X_indices = Input(input_shape)

    embeddings = embedding_layer(X_indices)

    X = LSTM(128, return_sequences=True)(embeddings)

    X = Dropout(0.6)(X)

    X = LSTM(128, return_sequences=True)(X)

    X = Dropout(0.6)(X)

    X = LSTM(128)(X)

    X = Dense(20, activation='sigmoid')(X)

    model = Model(inputs=X_indices, outputs=X)

    return model

In [76]:
def conv1d_model(input_shape):

    X_indices = Input(input_shape)

    embeddings = embedding_layer(X_indices)

    X = Conv1D(512,3,activation='relu')(embeddings)

    X = MaxPooling1D(3)(X)

    X = Conv1D(256,3,activation='relu')(X)

    X = MaxPooling1D(3)(X)

    X = Conv1D(256,3,activation='relu')(X)
    X = Dropout(0.8)(X)
    X = MaxPooling1D(3)(X)

    X = GlobalMaxPooling1D()(X)

    X = Dense(256, activation='relu')(X)
    X = Dense(1, activation='sigmoid')(X)

    model = Model(inputs=X_indices, outputs=X)

    return model
                                    
                      
              
                                      




In [133]:
model = imdb_rating((maxLen,))
model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 50, 50)            14871100  
_________________________________________________________________
lstm_9 (LSTM)                (None, 50, 128)           91648     
_________________________________________________________________
dropout_10 (Dropout)         (None, 50, 128)           0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 50, 128)           131584    
_________________________________________________________________
dropout_11 (Dropout)         (None, 50, 128)           0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 128)               1315

In [124]:
X_train_indices = tokenizer.texts_to_sequences(X_train)

In [125]:
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')
X_train_indices.shape

(405987, 50)

In [126]:
Y_train_int = np.array(list(map(int, Y_train)))
print(Y_train_int[0:5])
# print(X_train_indices[0:5])

Y_train_test = np_utils.to_categorical(Y_train_int)

print(Y_train_test[0:5])


[17  0 18  1  9]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [134]:
adam = keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(X_train_indices, Y_train_test, batch_size=1024, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15

In [78]:
model_1d = conv1d_model((maxLen,))
model_1d.summary()

ValueError: Negative dimension size caused by subtracting 3 from 2 for '{{node max_pooling1d_11/MaxPool}} = MaxPool[T=DT_FLOAT, data_format="NHWC", explicit_paddings=[], ksize=[1, 3, 1, 1], padding="VALID", strides=[1, 3, 1, 1]](max_pooling1d_11/ExpandDims)' with input shapes: [?,2,1,256].

In [127]:
adam = keras.optimizers.Adam(learning_rate = 0.0001)
model_1d.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

In [128]:
model_1d.fit(X_train_indices, Y_train_test, batch_size=256, epochs=5)

Epoch 1/5


ValueError: in user code:

    C:\Users\Chris Frendo\Anaconda3\envs\keras\lib\site-packages\tensorflow\python\keras\engine\training.py:805 train_function  *
        return step_function(self, iterator)
    C:\Users\Chris Frendo\Anaconda3\envs\keras\lib\site-packages\tensorflow\python\keras\engine\training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\Chris Frendo\Anaconda3\envs\keras\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\Chris Frendo\Anaconda3\envs\keras\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\Chris Frendo\Anaconda3\envs\keras\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\Chris Frendo\Anaconda3\envs\keras\lib\site-packages\tensorflow\python\keras\engine\training.py:788 run_step  **
        outputs = model.train_step(data)
    C:\Users\Chris Frendo\Anaconda3\envs\keras\lib\site-packages\tensorflow\python\keras\engine\training.py:754 train_step
        y_pred = self(x, training=True)
    C:\Users\Chris Frendo\Anaconda3\envs\keras\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\Users\Chris Frendo\Anaconda3\envs\keras\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:274 assert_input_compatibility
        ', found shape=' + display_shape(x.shape))

    ValueError: Input 0 is incompatible with layer model_2: expected shape=(None, 150), found shape=(None, 50)


In [None]:
X_test_indices = tokenizer.texts_to_sequences(X_test)

X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

In [None]:
model.evaluate(X_test_indices, Y_test)

In [None]:
model_1d.evaluate(X_test_indices, Y_test)

In [None]:
preds = model_1d.predict(X_test_indices)

In [None]:
n = np.random.randint(0,9999)

X_test[n]


In [None]:
if preds[n] > 0.5:
    print('predicted sentiment : positive')
else: 
    print('precicted sentiment : negative')

if (Y_test[n] == 1):
    print('correct sentiment : positive')
else:
    print('correct sentiment : negative')


In [None]:
preds[n]

In [None]:
Y_test[n]

In [None]:
model_1d.save_weights('/content/drive/My Drive/imdb_weights_con1vd.hdf5')

In [None]:
reviews_list_idx = tokenizer.texts_to_sequences(reviews_list)

In [None]:
def add_score_predictions(data, reviews_list_idx):

    data['sentiment score'] = 0

    reviews_list_idx = pad_sequences(reviews_list_idx, maxlen=maxLen, padding='post')

    review_preds = model.predict(reviews_list_idx)

    data['sentiment score'] = review_preds

    pred_sentiment = np.array(list(map(lambda x : 'positive' if x > 0.5 else 'negative',review_preds)))

    data['predicted sentiment'] = 0

    data['predicted sentiment'] = pred_sentiment

    return data

  

In [None]:
data = add_score_predictions(data, reviews_list_idx)

In [None]:
data