In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv('../raw_data/clean_dataset_1.csv').drop('Unnamed: 0', axis = 1)

In [None]:
df.head()

## df round to discrete numbers

In [None]:
df['review_score'] = np.round((df['Reviewer_Score']*2)/2, decimals = 0).astype(np.int8)
df = df.drop('Reviewer_Score', axis = 1)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df['class1'] = df['review_score'].map({
    1:'bad',
    2:'bad',
    3:'bad',
    4:'bad',
    5:'neutral',
    6:'neutral',
    7:'neutral',
    8:'good',
    9:'good',
    10:'good',
})


df['class2'] = df['review_score'].map({
    1:0,
    2:0,
    3:0,
    4:0,
    5:0,
    6:1,
    7:1,
    8:1,
    9:1,
    10:1,
})

In [None]:
df.head()

## df drop null

In [None]:
df = df.dropna()

## remove comments les than x words

In [None]:
def word_counter(X):
    return len(X.split(' '))

In [None]:
df['word_count'] = df['reviews'].apply(word_counter)

In [None]:
df = df[df['word_count']>9].reset_index().drop('index', axis = 1)

In [None]:
df.info()

## rebalance data

In [None]:
## create a list of unique y-values
df['review_score'].unique()

counts = df['review_score'].value_counts()

df['Counts'] = df['review_score'].map(counts)

df_more_than_xxx = df[df['Counts'] > 999]

unique_values = df_more_than_xxx['review_score'].unique()

In [None]:
## create a function to rebalance the data
def balance_df(lst, df):
    dict_dfs = {}
    
    for val in lst:
        df_val = df[df['review_score'] == val]
        
        dict_dfs[val] = df_val.sample(n=1000)
        
    return dict_dfs

In [None]:
## get the df's from the dict, merge and shuffle them

## get the the dict with the dfs inside
dct_dfs = balance_df(unique_values, df_more_than_xxx)

## concat the dfs
df_balanced_sorted = pd.concat(dct_dfs.values(), ignore_index=True)

## shuffle the rows of the df
df_balanced = df_balanced_sorted.sample(frac=1).reset_index(drop=True)

df_balanced

In [None]:
df_balanced['class2'].value_counts()

In [None]:
## show a part of the rebalanced df
plt.plot(np.arange(200), df_balanced['review_score'][:200])

In [None]:
## at this point we got 9000 rows

## one hot encode y

In [None]:
# from sklearn.preprocessing import OneHotEncoder

# encoder = OneHotEncoder()

# y_bin = encoder.fit_transform(df_balanced[['class']])

# y_bin

In [None]:
# y_bin = y_bin.toarray()

## dataset 3

In [6]:
df = pd.read_csv('../raw_data/dataset_3_clean.csv')

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,review,rating,review_clean,class1,class2
0,0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,1,0
1,1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,0,0
2,2,nice rooms not 4* experience hotel monaco seat...,3,nice room 4 experience hotel monaco seattle go...,0,0
3,3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monaco ...,2,1
4,4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...,2,1


## split the data

In [9]:
# X_train = df_balanced['reviews'][:6000]
# X_test = df_balanced['reviews'][6000:]

X_train = df['review_clean'][:17000]
X_test = df['review_clean'][17000:]

## one hot encode multiclass
# y_train = y_bin[:6000]
# y_test = y_bin[6000:]


# ## binary classification
# y_train = df_balanced['class2'][:6000]
# y_test = df_balanced['class2'][6000:]


## binary classification
y_train = df['class2'][:17000]
y_test = df['class2'][17000:]

In [10]:
df['class2'][:17000].value_counts()

0    9655
1    7345
Name: class2, dtype: int64

## vectorizing und embedding

In [11]:
# –– Step #1
def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

X_train_words = convert_sentences(X_train)
X_test_words = convert_sentences(X_test)

In [12]:
# –– Step #2
from gensim.models import Word2Vec
word2vec = Word2Vec(sentences=X_train, size=200, min_count=1, window=5)

In [13]:
# –– Step #3
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

In [14]:
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

In [15]:
X_train_embed = embedding(word2vec, X_train_words)
X_test_embed = embedding(word2vec, X_test_words)

## padding

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post')
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post')

In [17]:
X_train_pad.shape

(17000, 40, 200)

## Model

In [20]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers

def init_model():
    model = Sequential()
    model.add(layers.Masking())
    model.add(layers.LSTM(20, activation='tanh'))
    model.add(layers.Dense(15, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    
    return model

model = init_model()

In [21]:
# X_train_pad_short = X_train_pad[:500] # These two lines are just to accelerate the cell run
# y_train_short = y_train[:500]

from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=5, restore_best_weights=True)

history = model.fit(X_train_pad, y_train, 
          batch_size = 32,
          epochs=100,
          validation_split=0.3,
          callbacks=[es]
         )

Epoch 1/100
Epoch 2/100
 33/372 [=>............................] - ETA: 11s - loss: 0.6716 - accuracy: 0.5893

KeyboardInterrupt: 

In [None]:
res = model.evaluate(X_test_pad, y_test, verbose=0)
res

## Prediction

In [None]:
sentence1 = "While am generally happy with their service, there is a push to take a considerable discount for a reservation that does not allow cancellations. The standard undiscounted price is about the same as booking directly with the option of cancellations. Using Booking.com is merely a convenience. The discount for forgoing cancellation needs to be comared with the cost of cancellation insurance. Even if one can't use the reservation because of government restrictions the hotels sock it to the customer for far more than their out of pocket costs since at worst they don't have to service the rooms and at best can re-rent them. Bottom line: Don't be taken in by Booking.com's apparently cheap nonrefundable offers."
sentence2 = "Great vacation until we tried to travel home. We tried calling and waited more than 2 hours for a callback and then they were unable/unwilling to help us. Stranded for 48 hours because of this company with no help rebooking flights. We are out for hotel, food, and time off work because I was hung up on repeatedly by their customer service department. Once I was finally home, they told me there's nothing they can do for me that they were really sorry all this happened. They were unwilling to make it right, but told me that I could have requested a refund for my flight home if I would have been able to reach them at the time."

In [None]:
lst = [sentence1, sentence2]

In [None]:
## convert into tokens
tokens = convert_sentences(lst)

## convert tokens into vectors
vectors = embedding(word2vec, tokens)

## padding the vectors
vectors_padding = pad_sequences(vectors, dtype='float32', padding='post')

## predict
prediction = model.predict(vectors_padding)

prediction