In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('../raw_data/clean_dataset_1.csv').drop('Unnamed: 0', axis = 1)

In [3]:
df.head()

Unnamed: 0,Reviewer_Score,Review_Text,reviews
0,2.9,i am so angry that i made this post available...,i am so angry that i made this post available ...
1,7.5,no real complaints the hotel was great great...,no real complaint the hotel wa great great loc...
2,7.1,rooms are nice but for elderly a bit difficul...,room are nice but for elderly a bit difficult ...
3,3.8,my room was dirty and i was afraid to walk ba...,my room wa dirty and i wa afraid to walk baref...
4,6.7,you when i booked with your company on line y...,you when i booked with your company on line yo...


## df round to discrete numbers

In [4]:
df['review_score'] = np.round((df['Reviewer_Score']*2)/2, decimals = 0).astype(np.int8)
df = df.drop('Reviewer_Score', axis = 1)

In [5]:
df.head()

Unnamed: 0,Review_Text,reviews,review_score
0,i am so angry that i made this post available...,i am so angry that i made this post available ...,3
1,no real complaints the hotel was great great...,no real complaint the hotel wa great great loc...,8
2,rooms are nice but for elderly a bit difficul...,room are nice but for elderly a bit difficult ...,7
3,my room was dirty and i was afraid to walk ba...,my room wa dirty and i wa afraid to walk baref...,4
4,you when i booked with your company on line y...,you when i booked with your company on line yo...,7


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515738 entries, 0 to 515737
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Review_Text   515738 non-null  object
 1   reviews       515493 non-null  object
 2   review_score  515738 non-null  int8  
dtypes: int8(1), object(2)
memory usage: 8.4+ MB


In [7]:
df['class1'] = df['review_score'].map({
    1:'bad',
    2:'bad',
    3:'bad',
    4:'bad',
    5:'neutral',
    6:'neutral',
    7:'neutral',
    8:'good',
    9:'good',
    10:'good',
})


df['class2'] = df['review_score'].map({
    1:0,
    2:0,
    3:0,
    4:0,
    5:0,
    6:1,
    7:1,
    8:1,
    9:1,
    10:1,
})

In [8]:
df.head()

Unnamed: 0,Review_Text,reviews,review_score,class1,class2
0,i am so angry that i made this post available...,i am so angry that i made this post available ...,3,bad,0
1,no real complaints the hotel was great great...,no real complaint the hotel wa great great loc...,8,good,1
2,rooms are nice but for elderly a bit difficul...,room are nice but for elderly a bit difficult ...,7,neutral,1
3,my room was dirty and i was afraid to walk ba...,my room wa dirty and i wa afraid to walk baref...,4,bad,0
4,you when i booked with your company on line y...,you when i booked with your company on line yo...,7,neutral,1


## df drop null

In [9]:
df = df.dropna()

## remove comments les than x words

In [10]:
def word_counter(X):
    return len(X.split(' '))

In [11]:
df['word_count'] = df['reviews'].apply(word_counter)

In [12]:
df = df[df['word_count']>9].reset_index().drop('index', axis = 1)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400486 entries, 0 to 400485
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Review_Text   400486 non-null  object
 1   reviews       400486 non-null  object
 2   review_score  400486 non-null  int8  
 3   class1        400486 non-null  object
 4   class2        400486 non-null  int64 
 5   word_count    400486 non-null  int64 
dtypes: int64(2), int8(1), object(3)
memory usage: 15.7+ MB


## rebalance data

In [None]:
## create a list of unique y-values
df['review_score'].unique()

counts = df['review_score'].value_counts()

df['Counts'] = df['review_score'].map(counts)

df_more_than_xxx = df[df['Counts'] > 999]

unique_values = df_more_than_xxx['review_score'].unique()

In [None]:
## create a function to rebalance the data
def balance_df(lst, df):
    dict_dfs = {}
    
    for val in lst:
        df_val = df[df['review_score'] == val]
        
        dict_dfs[val] = df_val.sample(n=1000)
        
    return dict_dfs

In [None]:
## get the df's from the dict, merge and shuffle them

## get the the dict with the dfs inside
dct_dfs = balance_df(unique_values, df_more_than_xxx)

## concat the dfs
df_balanced_sorted = pd.concat(dct_dfs.values(), ignore_index=True)

## shuffle the rows of the df
df_balanced = df_balanced_sorted.sample(frac=1).reset_index(drop=True)

df_balanced

In [None]:
df_balanced['class2'].value_counts()

In [None]:
## show a part of the rebalanced df
plt.plot(np.arange(200), df_balanced['review_score'][:200])

In [None]:
## at this point we got 9000 rows

## one hot encode y

In [None]:
# from sklearn.preprocessing import OneHotEncoder

# encoder = OneHotEncoder()

# y_bin = encoder.fit_transform(df_balanced[['class']])

# y_bin

In [None]:
# y_bin = y_bin.toarray()

## dataset 3

In [None]:
# df = pd.read_csv('../raw_data/dataset_3_clean.csv')

In [None]:
# df.head()

## split the data

In [14]:
# X_train = df_balanced['reviews'][:6000]
# X_test = df_balanced['reviews'][6000:]

X_train = df['reviews'][:60000]
X_test = df['reviews'][60000:100000]

## one hot encode multiclass
# y_train = y_bin[:6000]
# y_test = y_bin[6000:]


# ## binary classification
# y_train = df_balanced['class2'][:6000]
# y_test = df_balanced['class2'][6000:]


## binary classification
y_train = df['review_score'][:60000]
y_test = df['review_score'][60000:100000]

In [26]:
X_train.shape

(60000,)

In [27]:
X_train[0]

'i am so angry that i made this post available via all possible site i use when planing my trip so no one will make the mistake of booking this place i made my booking via booking com we stayed for night in this hotel from to july upon arrival we were placed in a small room on the nd floor of the hotel it turned out that this wa not the room we booked i had specially reserved the level duplex room so that we would have a big window and high ceiling the room itself wa ok if you don t mind the broken window that can not be closed hello rain and a mini fridge that contained some sort of a bio weapon at least i guessed so by the smell of it i intimately asked to change the room and after explaining time that i booked a duplex btw it cost the same a a simple double but got way more volume due to the high ceiling wa offered a room but only the next day so i had to check out the next day before o clock in order to get the room i waned to not the best way to begin your holiday so we had to wai

## vectorizing und embedding

In [82]:
# –– Step #1 split the sentence into tokens
def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

X_train_words = convert_sentences(X_train)
X_test_words = convert_sentences(X_test)


# –– Step #2
from gensim.models import Word2Vec
word2vec = Word2Vec(sentences=X_train, size=200, min_count=1, window=5)


# –– Step #3
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)


def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        ## transforming list of vectors into one vector
        
        sum_vec = embedded_sentence.sum(axis = 0)
        
        ## put zeros when sum_vec has invalid shape
        if sum_vec.shape != (200,):
            sum_vec = np.zeros(200)
            
        embed.append(sum_vec)
        
    ## transform a list into a np-matrix
    return np.vstack(embed)


X_train_vstack = embedding(word2vec, X_train_words)
X_test_vstack = embedding(word2vec, X_test_words)

## padding

In [39]:
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post')
# X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post')

## Model

In [95]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, metrics


def init_model():
    model = Sequential()

    model.add(layers.Dense(32, activation='relu', input_dim = 200))
#     model.add(layers.Flatten())
    model.add(layers.Dense(50, activation='relu'))

    model.add(layers.Dense(20, activation='relu'))
    
    
    model.add(layers.Dense(10, activation='relu'))

    ## output layer
    model.add(layers.Dense(1, activation='linear'))


    model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['mae', metrics.RootMeanSquaredError()])
    
    return model

model = init_model()

In [96]:
# X_train_pad_short = X_train_pad[:500] # These two lines are just to accelerate the cell run
# y_train_short = y_train[:500]

from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=5, restore_best_weights=True)

history = model.fit(X_train_vstack, y_train, 
          batch_size = 32,
          epochs=100,
          validation_split=0.3,
          callbacks=[es]
         )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


In [97]:
res = model.evaluate(X_test_vstack, y_test, verbose=0)
res

[3.1745622158050537, 1.4594547748565674, 1.7817301750183105]

## Prediction

In [79]:
sentence1 = "While am generally happy with their service, there is a push to take a considerable discount for a reservation that does not allow cancellations. The standard undiscounted price is about the same as booking directly with the option of cancellations. Using Booking.com is merely a convenience. The discount for forgoing cancellation needs to be comared with the cost of cancellation insurance. Even if one can't use the reservation because of government restrictions the hotels sock it to the customer for far more than their out of pocket costs since at worst they don't have to service the rooms and at best can re-rent them. Bottom line: Don't be taken in by Booking.com's apparently cheap nonrefundable offers."
sentence2 = "Great vacation until we tried to travel home. We tried calling and waited more than 2 hours for a callback and then they were unable/unwilling to help us. Stranded for 48 hours because of this company with no help rebooking flights. We are out for hotel, food, and time off work because I was hung up on repeatedly by their customer service department. Once I was finally home, they told me there's nothing they can do for me that they were really sorry all this happened. They were unwilling to make it right, but told me that I could have requested a refund for my flight home if I would have been able to reach them at the time."

In [80]:
lst = [sentence1, sentence2]

In [83]:
## convert into tokens
tokens = convert_sentences(lst)

## convert tokens into vectors
vectors = embedding(word2vec, tokens)

## padding the vectors
# vectors_padding = pad_sequences(vectors, dtype='float32', padding='post')

## predict
prediction = model.predict(vectors)

prediction

array([[8.970053],
       [8.771551]], dtype=float32)