In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing packages**

In [2]:
import re 
import nltk 
from nltk.corpus import stopwords 
from numpy import array 
from keras.preprocessing.text import one_hot 
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential,Model 
from keras.layers.core import Activation, Dropout, Dense 
from keras.layers import Flatten
from keras.layers import Conv1D 
from keras.layers import GlobalMaxPooling1D,MaxPooling1D
from keras.layers import LSTM,GRU
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split 
from keras.preprocessing.text import Tokenizer
from gensim.models import KeyedVectors


**Reading the data**

In [3]:
# Read the IMDB reviews CSV into a DataFrame, then display its (rows, columns) shape.
train = pd.read_csv(
    '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
)
train.shape

In [4]:
# Count the missing values in each column of the DataFrame.
train.isnull().sum()

In [5]:
# Preview the first five rows.
train.head(5)

In [6]:
import seaborn as sns
# Bar chart of the number of reviews per sentiment label.
ax = sns.countplot(data=train, x="sentiment")


In [7]:
# Show one raw review (row position 3) before any cleaning is applied.
train['review'].iloc[3]

In [8]:
# Matches a single HTML tag such as "<br />" or "<i>".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text, replacement=''):
    """Return ``text`` with every HTML tag substituted by ``replacement``.

    Parameters
    ----------
    text : str
        Input string that may contain HTML tags.
    replacement : str, optional
        Text inserted in place of each tag. Defaults to the empty string,
        i.e. the tag is simply dropped.

    Returns
    -------
    str
        The text without HTML tags.
    """
    return TAG_RE.sub(replacement, text)


def preprocess_text(sen):
    """Clean one raw review so that only space-separated letter runs remain.

    Steps, in order: replace HTML tags, replace every non-letter character by
    a space, drop isolated single letters, collapse whitespace runs.

    Parameters
    ----------
    sen : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    # Replace tags with a space (not '') so that words separated only by a tag,
    # e.g. "good<br />movie", are not glued together into one token.
    sentence = remove_tags(sen, replacement=' ')
    # Remove punctuation and numbers.
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Single character removal. The trailing whitespace is only looked at, not
    # consumed, so that consecutive single letters ("it s a good") are all
    # removed instead of every other one.
    sentence = re.sub(r"\s+[a-zA-Z](?=\s)", ' ', sentence)
    # Collapse multiple spaces into one.
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence


In [9]:
# Clean every raw review; X keeps the same order as train['review'].
X = [preprocess_text(review) for review in train['review']]

In [10]:
# Show the cleaned review at position 3 of X.
X[3]

In [11]:
# Binary target: 1 for a "positive" review, 0 for any other label.
y = np.array([1 if label == "positive" else 0 for label in train['sentiment']])

In [12]:
# Hold out 20% of the reviews for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [13]:
# Fit the word index on the training reviews only, then encode both splits as
# lists of integer word ids.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train, X_test = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
]

In [14]:
# Every encoded review is padded at the end ('post') to a fixed length of 100.
maxlen = 100
# One extra slot because index 0 is reserved.
vocab_size = 1 + len(tokenizer.word_index)
X_train, X_test = [
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train, X_test)
]

In [15]:
# Display the padded training matrix.
X_train

In [16]:
from numpy import array 
from numpy import asarray 
from numpy import zeros 
def embeddings(path):
    """Load whitespace-separated word vectors from a text file.

    Every non-blank line is read as a token followed by its vector components.

    Parameters
    ----------
    path : str
        Location of the embeddings text file.

    Returns
    -------
    dict
        Maps each token to a 1-D ``float32`` numpy array holding the values
        that followed it on its line.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a component cannot be converted to ``float32``.
    """
    embeddings_dictionary = {}
    # The context manager closes the file even when a line fails to parse, and
    # the explicit encoding avoids depending on the platform default.
    with open(path, encoding='utf-8') as embeddings_file:
        for line in embeddings_file:
            records = line.split()
            if not records:
                # A blank line has no token; skipping it avoids an IndexError.
                continue
            embeddings_dictionary[records[0]] = asarray(records[1:], dtype='float32')
    return embeddings_dictionary
# Load the GloVe vectors (glove.6B.100d.txt) into a {word: vector} dict.
embeddings_dictionary=embeddings("../input/glove6b100dtxt/glove.6B.100d.txt")

In [17]:
# Row i of the matrix holds the GloVe vector of the word with tokenizer index i.
embedding_matrix = zeros((vocab_size, 100))
for token, row in tokenizer.word_index.items():
    glove_vector = embeddings_dictionary.get(token)
    if glove_vector is None:
        # Words without a pre-trained vector keep their all-zero row.
        continue
    embedding_matrix[row] = glove_vector


**A-Simple model**

In [18]:
def simple_model(X_train, y_train):
    """Train the baseline: frozen embeddings -> Flatten -> one sigmoid unit.

    Reads the notebook globals ``vocab_size``, ``embedding_matrix`` and
    ``maxlen``.

    Parameters
    ----------
    X_train : training inputs handed to ``fit``.
    y_train : training labels handed to ``fit``.

    Returns
    -------
    tuple
        ``(model, history)``: the fitted model and the object returned by
        ``fit``.
    """
    frozen_embedding = Embedding(
        vocab_size,
        100,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False,
    )
    net = Sequential([
        frozen_embedding,
        Flatten(),
        Dense(1, activation='sigmoid'),
    ])
    net.compile(optimizer="adam", loss='binary_crossentropy', metrics=["acc"])
    # validation_split=0.2 keeps a fifth of the training data for validation.
    fit_history = net.fit(
        X_train,
        y_train,
        epochs=6,
        batch_size=128,
        verbose=1,
        validation_split=0.2,
    )
    return net, fit_history


In [19]:
# Fit the baseline model and keep the history object returned by training.
model,history=simple_model(X_train,y_train)

In [20]:
# Score the baseline on the held-out split. The model was compiled with one
# loss and one metric, so `score` holds exactly those two values.
score = model.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = score
print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

In [21]:
import matplotlib.pyplot as plt
def _plot_curves(history, train_key, val_key, title, ylabel):
    """Draw one training-vs-validation curve pair taken from ``history.history``."""
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    # The second curve comes from the validation split used during fit(), not
    # from the test set, so it is labelled accordingly.
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


def plotting(history):
    """Plot per-epoch accuracy and loss for a training run.

    Two figures are shown: accuracy, then loss, each with the training and
    the validation curve.

    Parameters
    ----------
    history : object with a ``history`` dict
        Must contain ``'loss'``/``'val_loss'`` and either ``'acc'``/``'val_acc'``
        or ``'accuracy'``/``'val_accuracy'``.

    Raises
    ------
    KeyError
        If none of the expected keys is present.
    """
    # Models in this notebook are compiled with either 'acc' or 'accuracy';
    # accept both so every returned history can be plotted.
    acc_key = 'acc' if 'acc' in history.history else 'accuracy'
    _plot_curves(history, acc_key, 'val_' + acc_key, 'model accuracy', 'accuracy')
    _plot_curves(history, 'loss', 'val_loss', 'model loss', 'loss')


In [22]:
# Accuracy and loss curves of the baseline model trained above.
plotting(history)

**B-CNN model**

In [23]:
def model_cnn(X_train, y_train):
    """Train a 1-D convolutional classifier on top of the embedding matrix.

    Architecture: Embedding -> Conv1D(128 filters, width 5, ReLU) ->
    GlobalMaxPooling1D -> one sigmoid unit. Unlike ``simple_model`` no
    ``trainable=False`` is passed to the embedding layer. Reads the notebook
    globals ``vocab_size``, ``embedding_matrix`` and ``maxlen``.

    NOTE(review): the notebook later assigns this function's result to the
    name ``model_cnn``, which replaces the function in the namespace.

    Parameters
    ----------
    X_train : training inputs handed to ``fit``.
    y_train : training labels handed to ``fit``.

    Returns
    -------
    tuple
        ``(model, history)``: the fitted model and the object returned by
        ``fit``.
    """
    cnn = Sequential([
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(1, activation='sigmoid'),
    ])
    cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    fit_history = cnn.fit(
        X_train,
        y_train,
        epochs=6,
        batch_size=128,
        verbose=1,
        validation_split=0.2,
    )
    return cnn, fit_history

        

In [24]:
# NOTE(review): assigning the result to `model_cnn` rebinds that name from the
# builder function to the trained model. Re-running this cell would therefore
# call the trained model instead of the builder; re-run the defining cell first.
model_cnn,history=model_cnn(X_train,y_train)

In [25]:
# Score the CNN on the held-out split; `score` is [loss, accuracy].
score = model_cnn.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = score
print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

In [26]:
# Accuracy and loss curves of the CNN training run above.
plotting(history)

**C-RNN model**

In [27]:
def model_RNN(X_train, y_train, l, embedding_matrix):
    """Train an LSTM classifier on top of the given embedding matrix.

    Architecture: Embedding -> LSTM(128) -> one sigmoid unit. Reads the
    notebook globals ``vocab_size`` and ``maxlen``.

    Parameters
    ----------
    X_train : training inputs handed to ``fit``.
    y_train : training labels handed to ``fit``.
    l : int
        Output dimension given to the embedding layer.
    embedding_matrix : array
        Initial weights of the embedding layer.

    Returns
    -------
    tuple
        ``(model, history)``: the fitted model and the object returned by
        ``fit``.
    """
    lstm_net = Sequential([
        Embedding(vocab_size, l, weights=[embedding_matrix], input_length=maxlen),
        LSTM(128),
        Dense(1, activation='sigmoid'),
    ])
    lstm_net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    fit_history = lstm_net.fit(
        X_train,
        y_train,
        epochs=6,
        batch_size=128,
        verbose=1,
        validation_split=0.2,
    )
    return lstm_net, fit_history

In [28]:
# Train the LSTM model with the 100-column matrix built from the GloVe dict.
model_rnn,history=model_RNN(X_train,y_train,100,embedding_matrix)

In [29]:
# Score the LSTM model on the held-out split; `score` is [loss, accuracy].
score = model_rnn.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = score
print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

In [30]:
# The model ends in a single sigmoid unit, so predict() yields one value per
# sample in a one-column array. argmax over axis=1 of a one-column array is
# always 0, so the class is obtained by thresholding at 0.5 instead.
predict_x = model_rnn.predict(X_test[7:8])
classes_x = (predict_x > 0.5).astype("int32").ravel()
print(classes_x)

In [31]:
# Accuracy and loss curves of the LSTM training run above.
plotting(history)

**Ranking of the 3 models**
1. rnn
2. cnn
3. simple model

**RNN with GRU**

In [32]:
def model_GRU(X_train, y_train, l, embedding_matrix):
    """Train a GRU classifier on top of the given embedding matrix.

    Architecture: Embedding -> GRU(128) -> one sigmoid unit. Reads the
    notebook globals ``vocab_size`` and ``maxlen``.

    Parameters
    ----------
    X_train : training inputs handed to ``fit``.
    y_train : training labels handed to ``fit``.
    l : int
        Output dimension given to the embedding layer.
    embedding_matrix : array
        Initial weights of the embedding layer.

    Returns
    -------
    tuple
        ``(model, history)``: the fitted model and the object returned by
        ``fit``.
    """
    gru_net = Sequential([
        Embedding(vocab_size, l, weights=[embedding_matrix], input_length=maxlen),
        GRU(128),
        Dense(1, activation='sigmoid'),
    ])
    gru_net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    fit_history = gru_net.fit(
        X_train,
        y_train,
        epochs=6,
        batch_size=128,
        verbose=1,
        validation_split=0.2,
    )
    return gru_net, fit_history

In [33]:
# Train the GRU model with the 100-column matrix built from the GloVe dict.
m,h=model_GRU(X_train,y_train,100,embedding_matrix)

In [34]:
# Score the GRU model on the held-out split; `score` is [loss, accuracy].
score = m.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = score
print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

In [35]:
# Accuracy and loss curves of the GRU training run above.
plotting(h)

**RNN+CNN**



In [36]:
def model_RNNCNN(X_train,y_train,l,embedding_matrix):
    """Train a combined convolutional + LSTM classifier.

    Architecture: Embedding -> Conv1D(128 filters, width 5, ReLU) ->
    MaxPooling1D(2) -> LSTM(128) -> one sigmoid unit. Reads the notebook
    globals ``vocab_size`` and ``maxlen``.

    Parameters
    ----------
    X_train : training inputs handed to ``fit``.
    y_train : training labels handed to ``fit``.
    l : int
        Output dimension given to the embedding layer.
    embedding_matrix : array
        Initial weights of the embedding layer.

    Returns
    -------
    tuple
        ``(model, history)``: the fitted model and the object returned by
        ``fit``.
    """
    modelx = Sequential()
    modelx.add(Embedding(vocab_size, l, weights=[embedding_matrix], input_length=maxlen))
    modelx.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    modelx.add(MaxPooling1D(pool_size=2))
    modelx.add(LSTM(128))
    modelx.add(Dense(1, activation='sigmoid'))
    # Use the same metric alias ('acc') as the other model builders so the
    # returned history exposes the 'acc'/'val_acc' keys that plotting() reads.
    modelx.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    history = modelx.fit(
        X_train,
        y_train,
        epochs=6,
        batch_size=128,
        verbose=1,
        validation_split=0.2,
    )
    return modelx, history

In [37]:
# Train the Conv1D + LSTM model with the GloVe-based matrix. Note that `h`,
# which held the GRU history, is overwritten here.
x,h=model_RNNCNN(X_train,y_train,100,embedding_matrix)

In [38]:
# Score the Conv1D + LSTM model on the held-out split; `score` is [loss, accuracy].
score = x.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = score
print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

**Word2Vec**

In [39]:
# Load the GoogleNews vectors text file with the same loader used for GloVe.
word2vec=embeddings("../input/original-embeddings/GoogleNews-vectors-negative300(first500000).txt")

In [40]:
# Row i of the matrix holds the word2vec vector of the word with tokenizer index i.
embedding_matrix1 = zeros((vocab_size, 300))
for token, row in tokenizer.word_index.items():
    w2v_vector = word2vec.get(token)
    if w2v_vector is None:
        # Words without a pre-trained vector keep their all-zero row.
        continue
    embedding_matrix1[row] = w2v_vector

In [41]:
# Display the word2vec-based embedding matrix.
embedding_matrix1

In [42]:
# Train the Conv1D + LSTM model with the 300-column word2vec matrix.
x1,his=model_RNNCNN(X_train,y_train,300,embedding_matrix1)

In [43]:
# Score the word2vec Conv1D + LSTM model on the held-out split; `score` is [loss, accuracy].
score = x1.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = score
print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

In [44]:
# Train the LSTM model with the 300-column word2vec matrix.
x2,his=model_RNN(X_train,y_train,300,embedding_matrix1)

In [45]:
# Score the word2vec LSTM model on the held-out split; `score` is [loss, accuracy].
score = x2.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = score
print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

In [46]:
# Train the GRU model with the 300-column word2vec matrix.
x3,his=model_GRU(X_train,y_train,300,embedding_matrix1)

In [47]:
# Score the word2vec GRU model on the held-out split; `score` is [loss, accuracy].
score = x3.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = score
print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

**Fasttext**

In [48]:
#FASTEXT_Embedding = KeyedVectors.load_word2vec_format('../input/fast-text-word-embeddings/wiki-news-300d-1M.vec')

In [52]:
from tqdm import tqdm

import os, re, csv, math, codecs

In [53]:
# Parse the fastText text file into a {word: float32 vector} dict.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with codecs.open('../input/fasttext/wiki.simple.vec', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.rstrip().rsplit(' ')
        if len(values) <= 2:
            # Skip blank lines and a "<count> <dim>" header line: neither is a
            # word vector. NOTE(review): .vec files normally start with such a
            # header - confirm for this file.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('found %s word vectors' % len(embeddings_index))

In [56]:
# NOTE(review): this section is titled fastText, but the lookup below reads
# `embeddings_dictionary` (the GloVe vectors), so this matrix is filled from
# GloVe, not from the fastText dict `embeddings_index`. Switching the lookup to
# `embeddings_index` also requires changing the 100 here and in the training
# calls that use this matrix to the fastText vector length (presumably 300 -
# confirm against the file).
embedding_matrix2 = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    # Use a dedicated name for the looked-up vector; the original reused
    # `embeddings_index` here and thereby destroyed the loaded fastText dict.
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix2[index] = embedding_vector


In [57]:
# Display the embedding matrix built in the previous cell.
embedding_matrix2

In [58]:
# Train the Conv1D + LSTM model with embedding_matrix2 (100 columns).
x11,his=model_RNNCNN(X_train,y_train,100,embedding_matrix2)

In [62]:
# Score the x11 (Conv1D + LSTM) model on the held-out split; `score` is [loss, accuracy].
score = x11.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = score
print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

In [59]:
# Train the LSTM model with embedding_matrix2 (100 columns).
x22,his=model_RNN(X_train,y_train,100,embedding_matrix2)

In [63]:
# Score the x22 (LSTM) model on the held-out split; `score` is [loss, accuracy].
score = x22.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = score
print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

In [61]:
# Train the GRU model with embedding_matrix2 (100 columns).
x33,his=model_GRU(X_train,y_train,100,embedding_matrix2)

In [64]:
# Score the x33 (GRU) model on the held-out split; `score` is [loss, accuracy].
score = x33.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = score
print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

In [72]:
import tensorflow as tf
# Draw the layer graph (with tensor shapes) of x33, the GRU model.
tf.keras.utils.plot_model(x33, show_shapes=True)


In [73]:
import tensorflow as tf
# Draw the layer graph (with tensor shapes) of x22, the LSTM model.
tf.keras.utils.plot_model(x22, show_shapes=True)

In [74]:
import tensorflow as tf
# Draw the layer graph (with tensor shapes) of x11, the Conv1D + LSTM model.
tf.keras.utils.plot_model(x11, show_shapes=True)