In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df = pd.read_csv('../raw_data/clean_dataset_1.csv').drop('Unnamed: 0', axis = 1)

In [4]:
df.columns = df.columns.str.lower()

## df drop null

In [5]:
df = df.dropna()

## remove comments les than x words

In [6]:
def word_counter(X):
    return len(X.split(' '))

In [7]:
df['word_count'] = df['reviews'].apply(word_counter)

In [8]:
df = df[df['word_count']>9].reset_index().drop('index', axis = 1)

In [98]:
df['reviewer_score'].value_counts()

10.0    83043
9.6     54462
9.2     45878
8.8     36744
8.3     32612
7.5     26691
7.9     25785
7.1     20052
6.7     15658
6.3     12556
5.8     10281
5.4      8058
5.0      7114
4.6      5343
4.2      4349
3.8      3468
3.3      2296
2.5      1660
2.9      1320
9.0       492
9.5       491
8.0       378
8.5       378
7.0       300
6.5       298
6.0       204
5.5       196
4.5        93
4.0        72
3.5        55
9.4        46
8.1        30
3.0        30
6.9        24
5.6        12
4.4        10
3.1         7
Name: reviewer_score, dtype: int64

## split the data

In [10]:
X_train = df['reviews'][:60000]
X_test = df['reviews'][60000:100000]

y_train = df['reviewer_score'][:60000]
y_test = df['reviewer_score'][60000:100000]

In [11]:
X_train.shape

(60000,)

In [77]:
max([len(s) for s in X_train])

2955

## vectorizing und embedding

In [78]:
# –– Step #1
def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

X_train_words = convert_sentences(X_train)
X_test_words = convert_sentences(X_test)

# –– Step #2
from gensim.models import Word2Vec
word2vec = Word2Vec(sentences=X_train, size=200, min_count=1, window=5)

# –– Step #3
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

# -- step 4
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed


X_train_embed = embedding(word2vec, X_train_words)
X_test_embed = embedding(word2vec, X_test_words)

## padding

In [79]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post')
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post')

In [60]:
X_train_pad[0].shape

(68, 200)

## BaseModel

In [80]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, metrics


def init_model():
    model = Sequential()

    model.add(layers.LSTM(20, return_sequences=True, activation='tanh'))


    model.add(layers.Dense(40, activation='relu'))
    
    
    model.add(layers.Dense(20, activation='relu'))

    ## output layer
    model.add(layers.Dense(1, activation='linear'))


    model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['mae', metrics.RootMeanSquaredError()])
    
    return model

model = init_model()

In [81]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, metrics

def init_model_2():
    
    model = Sequential()
    model.add(layers.LSTM(20, return_sequences=True, activation='tanh'))
    
    ## regularisation layer
    model.add(layers.LSTM(10, activation='tanh'))
    
    ## regularisation layer
    model.add(layers.Dense(5, activation='relu'))
    
    ## regularisation layer
    model.add(layers.Dense(1, activation='linear'))
    
    
    
    model.compile(loss='mse', 
                  optimizer='rmsprop', 
                  metrics=['mae'])
    
    return model

model2 = init_model_2()

In [82]:
# X_train_pad_short = X_train_pad[:500] # These two lines are just to accelerate the cell run
# y_train_short = y_train[:500]

from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=10, restore_best_weights=True)

history = model2.fit(X_train_pad, y_train, 
          batch_size = 32,
          epochs=100,
          validation_split=0.3,
          callbacks=[es]
         )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


In [83]:
res = model2.evaluate(X_test_pad, y_test, verbose=0)
res

[3.149195671081543, 1.5070407390594482]

## Prediction 1

In [84]:
sentence1 = "While am generally happy with their service, there is a push to take a considerable discount for a reservation that does not allow cancellations. The standard undiscounted price is about the same as booking directly with the option of cancellations. Using Booking.com is merely a convenience. The discount for forgoing cancellation needs to be comared with the cost of cancellation insurance. Even if one can't use the reservation because of government restrictions the hotels sock it to the customer for far more than their out of pocket costs since at worst they don't have to service the rooms and at best can re-rent them. Bottom line: Don't be taken in by Booking.com's apparently cheap nonrefundable offers."
sentence2 = "Great vacation until we tried to travel home. We tried calling and waited more than 2 hours for a callback and then they were unable/unwilling to help us. Stranded for 48 hours because of this company with no help rebooking flights. We are out for hotel, food, and time off work because I was hung up on repeatedly by their customer service department. Once I was finally home, they told me there's nothing they can do for me that they were really sorry all this happened. They were unwilling to make it right, but told me that I could have requested a refund for my flight home if I would have been able to reach them at the time."

In [85]:
lst = [sentence1, sentence2]

In [86]:
## convert into tokens
tokens = convert_sentences(lst)

## convert tokens into vectors
vectors = embedding(word2vec, tokens)

# padding the vectors
vectors_padding = pad_sequences(vectors, dtype='float32', padding='post')

## predict
prediction = model2.predict(vectors_padding)

prediction



array([[7.786937 ],
       [7.7911944]], dtype=float32)

## Prediction 2

In [87]:
X_new_dataset1 = df['reviews'][100000:100100]

In [88]:
X_new_dataset1_rating = df['reviewer_score'][100000:100100]

In [89]:
## convert into tokens
tokens = convert_sentences(X_new_dataset1)

## convert tokens into vectors
vectors = embedding(word2vec, tokens)

# padding the vectors
vectors_padding = pad_sequences(vectors, dtype='float32', padding='post')

## predict
prediction = model2.predict(vectors_padding)

# prediction



In [90]:
pred = prediction.tolist()

In [91]:
dct_comp = {'prediction': pred, 'real_score':X_new_dataset1_rating}

In [97]:
df_comp = pd.DataFrame(dct_comp)
df_comp.head(60)

Unnamed: 0,prediction,real_score
100000,[7.791843414306641],6.3
100001,[7.79453182220459],6.3
100002,[7.79453182220459],9.2
100003,[7.79453182220459],9.6
100004,[7.79453182220459],7.1
100005,[7.79453182220459],6.7
100006,[7.7932353019714355],8.3
100007,[7.79453182220459],8.8
100008,[7.79453182220459],8.3
100009,[7.79453182220459],10.0
