In [1]:


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.layers import Embedding,LSTM
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Flatten
from sklearn.model_selection import train_test_split

In [3]:
# Load the IMDB review CSV from the attached Kaggle input dataset.
movie_reviews = pd.read_csv(
    '../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
)
# Print True if any cell in the frame is missing, False otherwise.
print(movie_reviews.isna().to_numpy().any())
# Last expression: (rows, columns) shown as the cell output.
movie_reviews.shape

In [4]:
# Peek at one raw review (row label 4) before any cleaning.
movie_reviews.loc[4, 'review']

In [5]:
import seaborn as s
# Bar chart of how many reviews fall under each sentiment label.
s.countplot(data=movie_reviews, x='sentiment')

In [6]:
def preprocessing_text(sen):
    """Clean one raw review string before tokenization.

    Steps, in order:
      1. HTML-like tags are replaced by spaces via ``remove_tags``.
      2. Every character that is not an ASCII letter becomes a space.
      3. Single letters that have whitespace on both sides are dropped.
      4. Runs of whitespace are collapsed to a single space.

    Args:
        sen: Raw review text.

    Returns:
        The cleaned string. Letter case is preserved, and a single leading
        or trailing space may remain because the result is not stripped.
    """
    sentence = remove_tags(sen)
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Lookarounds test the neighbouring whitespace without consuming it.
    # The previous pattern r'\s+[a-zA-Z]\s+' swallowed the whitespace on both
    # sides, so in "it s a b movie" the space needed to match the next single
    # letter was already used up and "a" survived (re.sub matches never
    # overlap). As before, a single letter at the very start or end of the
    # text is left in place.
    sentence = re.sub(r'(?<=\s)[a-zA-Z](?=\s)', ' ', sentence)

    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

In [7]:
# Matches '<', one or more characters other than '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(sen):
    """Return ``sen`` with every ``<...>`` span replaced by a single space.

    Args:
        sen: Text that may contain HTML-like tags.

    Returns:
        A new string; text outside the matched spans is left untouched.
    """
    without_tags = TAG_RE.sub(' ', sen)
    return without_tags

In [8]:
# Clean every review once; `review` keeps the same order as the DataFrame rows.
sentences = list(movie_reviews['review'])
review = [preprocessing_text(sen) for sen in sentences]

In [9]:
# Spot-check the cleaning: same index (4) as the raw review displayed earlier.
review[4]

In [10]:
# Binary target: 1 where the sentiment label is 'positive', 0 for anything else.
y = np.where(movie_reviews['sentiment'] == 'positive', 1, 0)

In [11]:
# Hold out 20% of the cleaned reviews; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    review,
    y,
    test_size=0.2,
    random_state=40,
)

In [12]:
# Fit the word index on the training texts only, so the test split does not
# influence the vocabulary. num_words=5000 is the vocabulary cap handed to Keras.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Replace both splits (lists of strings) with lists of integer sequences.
# NOTE: this rebinds X_train / X_test, so the cell must not be re-run on its own.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [13]:
# Every sequence is padded or cut to exactly `maxlen` tokens.
maxlen = 100
# One row more than the number of indexed words; this assumes word indices
# start at 1 so that row 0 is left for padding (Keras convention, confirm).
vocab_size = 1 + len(tokenizer.word_index)

# padding='post' appends the zeros after the real tokens.
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = pad_sequences(X_test, maxlen=maxlen, padding='post')

In [14]:
# Parse the pretrained GloVe text file into a {word: float32 vector} lookup.
# Each line is split on whitespace: the first field is the word and the
# remaining fields are its vector components.
embeddings_dict=dict()
# The context manager closes the file even if a malformed line raises during
# parsing; the previous explicit close() was skipped on any exception.
with open('../input/glove6b100dtxt/glove.6B.100d.txt',encoding='utf8') as glove:
    for line in glove:
        records=line.split()
        word=records[0]
        vector_dim=np.asarray(records[1:],dtype='float32')
        embeddings_dict[word]=vector_dim

In [15]:
# One 100-wide row per tokenizer index; rows for words missing from GloVe
# (and any index never assigned) stay all-zero.
embedding_matrix = np.zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vec = embeddings_dict.get(word)
    if embedding_vec is None:
        # No pretrained vector for this word: keep the zero row.
        continue
    embedding_matrix[index] = embedding_vec

In [16]:
# Frozen pretrained embeddings -> LSTM -> single sigmoid unit for the binary label.
model = Sequential()
model.add(
    Embedding(
        vocab_size,
        100,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False,  # keep the pretrained vectors fixed during training
    )
)
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.summary()


In [19]:
# Binary cross-entropy pairs with the single sigmoid output; the metric is
# registered as 'acc', which is the history key the plotting cell reads.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [20]:
# Train for 10 epochs and keep the History object for the plots further down.
# NOTE(review): the test split is also passed as validation data, so the
# later evaluate() call scores data that was already monitored during training.
hist = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    verbose=1,
    validation_data=(X_test, y_test),
)

In [21]:
# Loss and 'acc' on the held-out split, displayed as the cell output.
model.evaluate(x=X_test, y=y_test)

In [22]:
# Pick one cleaned review (index 60) as a single example for a manual prediction.
# NOTE(review): the index refers to the full `review` list, so this example may
# have been part of the training split.
inst=review[60]
inst

In [23]:
# texts_to_sequences expects a list of texts. Passing the bare string made it
# iterate character by character, so the flattened result only contained
# indices of single-letter "words" instead of the review's real tokens.
# Wrapping the string in a list yields one sequence for the whole review,
# already in the [[...]] shape that pad_sequences needs.
flat_list=tokenizer.texts_to_sequences([inst])
# `inst` is rebound from the raw text to the padded array that predict()
# consumes; re-run the cell that assigns `inst=review[60]` before re-running this one.
inst=pad_sequences(flat_list,padding='post',maxlen=maxlen)

In [27]:
# Print the true label of review 60, then display the model's sigmoid output
# for the padded example as the cell result.
print(f'real value: {y[60]}\n predicted value')
model.predict(inst)

In [25]:
import matplotlib.pyplot as plt
def plot_graphs(history,string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object exposing a ``history`` mapping from metric name to a
            sequence of per-epoch values (e.g. the return value of ``model.fit``).
        string: Metric key to plot; ``'val_' + string`` must also be present.
    """
    val_key = 'val_' + string
    # Training curve first, validation second, matching the legend order.
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel('Epochs')
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

In [28]:
# One figure per tracked metric: accuracy first, then loss.
for metric in ('acc', 'loss'):
    plot_graphs(hist, metric)