In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
def get_data(nrows=1000):
    '''returns a DataFrame with nrows from downloaded Keggle csv in raw_data folder'''
    dataset_3 = pd.read_csv("../raw_data/dataset_3.csv", nrows=nrows)
    df = dataset_3.copy()
    return df

In [3]:
df = get_data()

In [4]:
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
995,average price hotel good location 1.5 blocks u...,3
996,good customer service recently wrote dissatisf...,4
997,ignore bad press just post review reading nega...,4
998,"business trip ok hotel fine evening business, ...",3


In [5]:
def clean_data(df):
    # Remove reviews with less than 6 words (or signs)
    df.loc[:,'length'] = df['Review'].apply(lambda x: len(word_tokenize(str(x))))
    df.drop(df[df['length'] < 6].index, inplace=True)
    df.drop(columns=['length'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

In [6]:
df = clean_data(df)

In [7]:
def custom_stopwords():
    """create custom stopwords list excluding negative words"""
    negative_words = ['no',
    'nor',
    'not',
    "don't",
    'should',
    "should've",
    'aren',
    "aren't",
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'isn',
    "isn't",
    "wasn't",
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't"]

    custom_stopwords = [x for x in stopwords.words('english') if x not in negative_words]
    
    further_stopwords = ["hotel","everything","anything","thing","need","even"]
    
    custom_stopwords.extend(further_stopwords)
    
    return custom_stopwords

In [8]:
def clean_for_dl(text):
    """ preprocess review text data for nlp analysis """
    # Lower case
    text = ''.join(text)
    text = text.lower()
    # Remove numbers
    text = ''.join(word for word in text if not word.isdigit())
    # Remove punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    # Remove stopwords
    text = word_tokenize(text)
    stopwords = custom_stopwords()
    text = [w for w in text if not w in stopwords]
    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(word for word in text)
    

    return(text)

In [9]:
df['Review'] = df['Review'].apply(clean_for_dl)

In [10]:
df['Rating'].value_counts()

5    334
4    311
3    137
1    110
2    108
Name: Rating, dtype: int64

In [11]:
y = df["Review"]
X = df.drop("Rating", axis=1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [13]:
# –– Step #1 split the sentence into tokens
def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

from gensim.models import Word2Vec
word2vec = Word2Vec(sentences=X_train, size=200, min_count=1, window=5)

# –– Step #2
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)


def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        ## transforming list of vectors into one vector
        
        sum_vec = embedded_sentence.sum(axis = 0)
        
        ## put zeros when sum_vec has invalid shape
        if sum_vec.shape != (200,):
            sum_vec = np.zeros(200)
            
        embed.append(sum_vec)
        
    ## transform a list into a np-matrix
    return np.vstack(embed)


In [14]:
X_train_embed = embedding(word2vec, X_train)
X_test_embed = embedding(word2vec, X_test)

In [15]:
X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post')
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post')

In [16]:
X_train_pad.shape

(1, 200)

In [17]:
def init_model():

    model = Sequential()
    
    # Add Masking Layer
    model.add(layers.Masking(input_shape=(X_train_pad.shape)))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(layers.LSTM(20, activation='tanh'))
    model.add(layers.Dropout(0.1))
    
    # Add Hidden Layer 2 - LSTM Layer
    model.add(layers.LSTM(15, activation='relu'))
    model.add(layers.Dropout(0.1))
    
    # Add Output Layer
    model.add(layers.Dense(5, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    
    return model

model = init_model()
model.summary()

ValueError: Input 0 of layer lstm_1 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 20)

In [None]:
es = EarlyStopping(patience=5, restore_best_weights=True)

model = init_model()

model.fit(X_train, y_train, 
          batch_size = 32,
          epochs=10,
          validation_split=0.3,
          callbacks=[es]
         )

In [None]:
res = model.evaluate(X_test_pad, y_test, verbose=0)
res