In [1]:
import pandas as pd
import numpy as np
import re
import string
from tensorflow import keras

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop=set(stopwords.words('english'))

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.optimizers import Adam
from keras.initializers import Constant

from tensorflow.keras.layers import Embedding, LSTM, Dense,SpatialDropout1D

In [2]:
# Load the Yelp polarity train/test splits and stack them (train rows first)
# so the text cleaning and tokenisation below see both splits.
rest_train = pd.read_csv("./Datasets/yelp_review_polarity_csv/fixed_train.csv")
rest_test = pd.read_csv("./Datasets/yelp_review_polarity_csv/fixed_test.csv")
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the labels of both inputs are kept and every test-row label duplicates a
# train-row label, which makes label-based access on rest_df ambiguous.
rest_df = pd.concat([rest_train, rest_test], ignore_index=True)

In [3]:
def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` link removed.

    A link is matched up to the next whitespace character; surrounding
    whitespace is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    The match is non-greedy, so each tag is removed separately and the text
    between tags is kept. ``.`` does not match newlines, so a ``<`` whose
    closing ``>`` is on another line is left untouched.
    """
    return re.sub(r'<.*?>', '', text)
def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only dedicated symbol ranges are matched. A single range running from
    U+24C2 to U+1F251 would also cover CJK, Hangul, kana and many other
    scripts, deleting ordinary non-Latin text, so that span is expressed here
    as the narrow symbol ranges it was meant to cover.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U0001F170-\U0001F19A"  # enclosed alphanumeric supplement (squared letters)
        "\U0001F200-\U0001F251"  # enclosed ideographic supplement
        "\U000024C2"             # circled latin capital letter M
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U0000FE0F"             # variation selector-16 (emoji presentation)
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub('', text)
def remove_punct(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)

In [4]:
# Run the text cleaners over the review column one after another. Order is
# kept as URLs -> HTML tags -> emoji -> punctuation: the URL and tag patterns
# rely on punctuation characters that the last step strips.
for cleaner in (remove_URL, remove_html, remove_emoji, remove_punct):
    rest_df['review'] = rest_df['review'].apply(cleaner)

In [5]:
# Word -> GloVe vector lookup; filled in when the embedding file is parsed.
embedding_dict = dict()

# Every tokenised review is padded/truncated to this many tokens, and the
# Embedding layer is built with the same input length.
MAX_LEN = 50

# Training hyper-parameters: Adam step size, mini-batch size, passes over the data.
learning_rate = 1e-5
batch_size = 64
epochs = 10

In [6]:
def create_corpus(df):
    """Tokenise each review in ``df['review']`` into a list of lowercase words.

    Tokens are kept only if they are purely alphabetic and are not English
    stop words.

    Parameters
    ----------
    df : DataFrame with a ``review`` column of strings.

    Returns
    -------
    list[list[str]]
        One list of lowercase tokens per review, in row order.
    """
    corpus = []
    for review in df['review']:
        words = []
        for token in word_tokenize(review):
            lowered = token.lower()
            # Compare the lowercased form against the stop list: the NLTK
            # stop words are lowercase, so testing the raw token let
            # capitalised stop words ("The", "I") through.
            if token.isalpha() and lowered not in stop:
                words.append(lowered)
        corpus.append(words)
    return corpus
corpus=create_corpus(rest_df)

In [7]:
# Parse the GloVe text file: each line is a token followed by its
# whitespace-separated vector components. The vectors are stored in
# embedding_dict keyed by token.
# The with-block closes the file on exit, so no explicit close() is needed.
with open('./Datasets/glove.6B.100d.txt','r',encoding='utf-8') as f:
    for line in f:
        word, *coefs = line.split()
        embedding_dict[word] = np.asarray(coefs, 'float32')

In [8]:
# Fit the vocabulary on the whole (train + test) corpus and keep its
# word -> integer-id mapping for building the embedding matrix.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
word_index = tokenizer_obj.word_index

# Encode every review as a list of ids, then pad/truncate at the end
# ('post') so that each row has exactly MAX_LEN entries.
sequences = tokenizer_obj.texts_to_sequences(corpus)
rest_pad = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [9]:
# One row per vocabulary id plus one extra row, since the largest id equals
# len(word_index) (Keras Tokenizer ids start at 1; row 0 stays all-zero and
# lines up with the zero padding value).
num_words = len(word_index) + 1
# 100 columns to match the 100-dimensional GloVe vectors loaded above.
embedding_matrix = np.zeros((num_words, 100))

# Copy the pre-trained vector for every vocabulary word that GloVe knows;
# words without a vector keep their all-zero row.
# (The former `if i > num_words: continue` guard was removed: every id is
# at most len(word_index) < num_words, so it could never fire.)
for word, i in word_index.items():
    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

In [10]:
# Frozen embedding layer initialised from the GloVe matrix built above.
embedding = Embedding(
    num_words,
    100,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LEN,
    trainable=False,
)

# Embedding -> spatial dropout -> single LSTM -> sigmoid unit for the
# binary label.
model = Sequential([
    embedding,
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

optimzer = Adam(learning_rate=learning_rate)

model.compile(
    optimizer=optimzer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [11]:
# rest_pad follows the row order of the concatenated frame (train rows first,
# then test rows), so slicing at the train row count recovers the two splits.
n_train = len(rest_train)
train, test = rest_pad[:n_train], rest_pad[n_train:]

In [12]:
# Hold out 20% of the training rows; X_test / y_test are passed to fit() as
# validation data. random_state pins the shuffle so that a restart-and-run-all
# produces the same split (and comparable metrics) every time.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    rest_train['target'].values,
    test_size=0.20,
    random_state=42,
)

In [13]:
# Train the model; verbose=2 prints one summary line per epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
)

Epoch 1/10
7000/7000 - 140s - loss: 0.6076 - accuracy: 0.6608 - val_loss: 0.5024 - val_accuracy: 0.7603
Epoch 2/10
7000/7000 - 138s - loss: 0.5225 - accuracy: 0.7415 - val_loss: 0.4772 - val_accuracy: 0.7757
Epoch 3/10
7000/7000 - 145s - loss: 0.5032 - accuracy: 0.7541 - val_loss: 0.4556 - val_accuracy: 0.7869
Epoch 4/10
7000/7000 - 148s - loss: 0.4893 - accuracy: 0.7621 - val_loss: 0.4434 - val_accuracy: 0.7940
Epoch 5/10


KeyboardInterrupt: 

In [None]:
# Predict on the held-back test rows and turn the sigmoid outputs into a flat
# vector of 0/1 labels.
y_pre = model.predict(test)
y_pre = np.round(y_pre).astype(int).reshape(-1)

# Snapshot of the untouched test set. index=False keeps the row index out of
# the file; writing it and reading the file back added a spurious
# "Unnamed: 0" column to the final output.
original = pd.read_csv("./Datasets/yelp_review_polarity_csv/fixed_test.csv")
original.to_csv('./Output/Predicted_dataset.csv', index=False)

# Work on an in-memory copy instead of re-reading the file just written, and
# put the model's predictions in the 'target' column.
predicted = original.copy()
predicted['target'] = y_pre
predicted.to_csv('./Output/GloVe_Predicted_dataset.csv', index=False)