In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('../raw_data/dataset_3_clean.csv').drop('Unnamed: 0', axis = 1)

In [3]:
df.columns = df.columns.str.lower()

In [4]:
df.head()

Unnamed: 0,review,rating,review_clean,class1,class2
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,1,0
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,0,0
2,nice rooms not 4* experience hotel monaco seat...,3,nice room 4 experience hotel monaco seattle go...,0,0
3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monaco ...,2,1
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...,2,1


## df drop null

In [5]:
df = df.dropna()

## remove comments les than x words

In [6]:
def word_counter(X):
    return len(X.split(' '))

In [7]:
df['word_count'] = df['review_clean'].apply(word_counter)

In [8]:
df = df[df['word_count']>9].reset_index().drop('index', axis = 1)

## split the data

In [9]:
X_train = df['review_clean'][:12000]
X_test = df['review_clean'][12000:20000]

y_train = df['rating'][:12000]
y_test = df['rating'][12000:20000]

In [10]:
X_train.shape

(12000,)

In [11]:
max([len(s) for s in X_train])

12720

## vectorizing und embedding

In [19]:
from gensim.models import Word2Vec
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences


# –– Step #1
def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

X_train_words = convert_sentences(X_train)
X_test_words = convert_sentences(X_test)


# –– Step #2
word2vec = Word2Vec(sentences=X_train, min_count=10, window=10)


# –– Step #3
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

X_train_embed = embedding(word2vec, X_train_words)
X_test_embed = embedding(word2vec, X_test_words)


# –– Step #4
X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post')
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post')

## padding

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post')
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post')

In [21]:
X_train_pad.shape

(12000, 36, 100)

## BaseModel

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, metrics


def init_model():
    model = Sequential()

    model.add(layers.LSTM(20, return_sequences=True, activation='tanh'))


    model.add(layers.Dense(40, activation='relu'))
    
    
    model.add(layers.Dense(20, activation='relu'))

    ## output layer
    model.add(layers.Dense(1, activation='linear'))


    model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['mae', metrics.RootMeanSquaredError()])
    
    return model

model = init_model()

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, metrics

def init_model_2():
    
    model = Sequential()
    model.add(layers.LSTM(20, return_sequences=True, activation='tanh'))
#     model.add(layers.LSTM(10, return_sequences=True, activation='tanh'))
    model.add(layers.LSTM(10, activation='tanh'))
    model.add(layers.Dense(5, activation='relu'))
    model.add(layers.Dense(1, activation='linear'))
    
    model.compile(loss='mse', 
                  optimizer='rmsprop', 
                  metrics=['mae'])
    
    return model

model2 = init_model_2()

In [None]:
# X_train_pad_short = X_train_pad[:500] # These two lines are just to accelerate the cell run
# y_train_short = y_train[:500]

from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=5, restore_best_weights=True)

history = model2.fit(X_train_pad, y_train, 
          batch_size = 32,
          epochs=100,
          validation_split=0.3,
          callbacks=[es]
         )

In [None]:
res = model2.evaluate(X_test_pad, y_test, verbose=0)
res

## Prediction 1

In [None]:
sentence1 = "While am generally happy with their service, there is a push to take a considerable discount for a reservation that does not allow cancellations. The standard undiscounted price is about the same as booking directly with the option of cancellations. Using Booking.com is merely a convenience. The discount for forgoing cancellation needs to be comared with the cost of cancellation insurance. Even if one can't use the reservation because of government restrictions the hotels sock it to the customer for far more than their out of pocket costs since at worst they don't have to service the rooms and at best can re-rent them. Bottom line: Don't be taken in by Booking.com's apparently cheap nonrefundable offers."
sentence2 = "Great vacation until we tried to travel home. We tried calling and waited more than 2 hours for a callback and then they were unable/unwilling to help us. Stranded for 48 hours because of this company with no help rebooking flights. We are out for hotel, food, and time off work because I was hung up on repeatedly by their customer service department. Once I was finally home, they told me there's nothing they can do for me that they were really sorry all this happened. They were unwilling to make it right, but told me that I could have requested a refund for my flight home if I would have been able to reach them at the time."

In [None]:
lst = [sentence1, sentence2]

In [None]:
## convert into tokens
tokens = convert_sentences(lst)

## convert tokens into vectors
vectors = embedding(word2vec, tokens)

# padding the vectors
vectors_padding = pad_sequences(vectors, dtype='float32', padding='post')

## predict
prediction = model2.predict(vectors_padding)

prediction

## Prediction 2

In [None]:
X_new_dataset1 = df['review'][20000:]

In [None]:
X_new_dataset1_rating = df['rating'][20000:]

In [None]:
## convert into tokens
tokens = convert_sentences(X_new_dataset1)

## convert tokens into vectors
vectors = embedding(word2vec, tokens)

# padding the vectors
vectors_padding = pad_sequences(vectors, dtype='float32', padding='post')

## predict
prediction = model2.predict(vectors_padding)

# prediction

In [None]:
pred = prediction.tolist()

In [None]:
dct_comp = {'prediction': pred, 'real_score':X_new_dataset1_rating}

In [None]:
df_comp = pd.DataFrame(dct_comp)
df_comp.head(60)