In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [51]:
df = pd.read_csv('../raw_data/dataset_3_clean.csv').drop('Unnamed: 0', axis = 1)

In [52]:
df.columns = df.columns.str.lower()

In [53]:
df.head()

Unnamed: 0,review,rating,review_clean,class1,class2
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,1,0
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,0,0
2,nice rooms not 4* experience hotel monaco seat...,3,nice room 4 experience hotel monaco seattle go...,0,0
3,"unique, great stay, wonderful time hotel monac...",5,unique great stay wonderful time hotel monaco ...,2,1
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...,2,1


In [54]:
df['class3'] = df['rating'].map({
    1:0,
    2:0,
    3:0,
    4:1,
    5:1,
})

In [55]:
df['rating'].value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: rating, dtype: int64

In [28]:
df = df[df['rating'] != 4]

In [29]:
sum(df['class3'])/df['class3'].shape[0]

0.6264618365511038

## df drop null

In [30]:
df = df.dropna()

## remove comments les than x words

In [31]:
def word_counter(X):
    return len(X.split(' '))

In [32]:
df['word_count'] = df['review'].apply(word_counter)

In [33]:
df = df[df['word_count']>9].reset_index().drop('index', axis = 1)

## split the data

In [34]:
X_train = df['review'][:5000]
X_test = df['review'][5000:7000]

y_train = df['class3'][:5000]
y_test = df['class3'][5000:7000]

In [35]:
X_train.shape

(5000,)

In [36]:
max([len(s) for s in X_train])

13501

## vectorizing und embedding

In [37]:
# –– Step #1
def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

X_train_words = convert_sentences(X_train)
X_test_words = convert_sentences(X_test)

# –– Step #2
from gensim.models import Word2Vec
word2vec = Word2Vec(sentences=X_train, size=200, min_count=1, window=15)

# –– Step #3
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

# -- step 4
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed


X_train_embed = embedding(word2vec, X_train_words)
X_test_embed = embedding(word2vec, X_test_words)

## padding

In [38]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post')
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post')

In [39]:
X_train_pad[0].shape

(20, 200)

## BaseModel

In [40]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers

def init_model():
    model = Sequential()
    model.add(layers.Masking())
    model.add(layers.LSTM(20, activation='tanh'))
    model.add(layers.Dense(15, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    
    return model

model = init_model()

## transfer learning

In [41]:
import gensim.downloader as api
print(list(api.info()['models'].keys()))


word2vec_wiki = api.load("glove-wiki-gigaword-50")

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [42]:
X_train_words_2 = convert_sentences(X_train)
X_test_words_2 = convert_sentences(X_test)


# –– Embed the sentences thanks to the new embedding
X_train_embed_2 = embedding(word2vec_wiki, X_train_words_2)
X_test_embed_2 = embedding(word2vec_wiki, X_test_words_2)


# –– Pad the sentences
X_train_pad_2 = pad_sequences(X_train_embed_2, dtype='float32', padding='post')
X_test_pad_2 = pad_sequences(X_test_embed_2, dtype='float32', padding='post')

  if word in word2vec.wv:
  embedded_sentence.append(word2vec.wv[word])


In [43]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=5, restore_best_weights=True)

model = init_model()

model.fit(X_train_pad_2, y_train, 
          batch_size = 32,
          epochs=10,
          validation_split=0.3,
          callbacks=[es]
         )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f79396b8eb0>

In [47]:
res = model.evaluate(X_test_pad_2, y_test, verbose=0)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')

The accuracy evaluated on the test set is of 82.150%


In [48]:
sum(y_train)/len(y_train)

0.5876

In [49]:
sum(y_test)/len(y_test)

0.636

In [None]:
sentence1 = "While am generally happy with their service, there is a push to take a considerable discount for a reservation that does not allow cancellations. The standard undiscounted price is about the same as booking directly with the option of cancellations. Using Booking.com is merely a convenience. The discount for forgoing cancellation needs to be comared with the cost of cancellation insurance. Even if one can't use the reservation because of government restrictions the hotels sock it to the customer for far more than their out of pocket costs since at worst they don't have to service the rooms and at best can re-rent them. Bottom line: Don't be taken in by Booking.com's apparently cheap nonrefundable offers."
sentence2 = "Great vacation until we tried to travel home. We tried calling and waited more than 2 hours for a callback and then they were unable/unwilling to help us. Stranded for 48 hours because of this company with no help rebooking flights. We are out for hotel, food, and time off work because I was hung up on repeatedly by their customer service department. Once I was finally home, they told me there's nothing they can do for me that they were really sorry all this happened. They were unwilling to make it right, but told me that I could have requested a refund for my flight home if I would have been able to reach them at the time."

In [None]:
lst = [sentence1, sentence2]

In [None]:
## convert into tokens
tokens = convert_sentences(lst)

## convert tokens into vectors
vectors = embedding(word2vec_wiki, tokens)

# padding the vectors
vectors_padding = pad_sequences(vectors, dtype='float32', padding='post')

## predict
prediction = model.predict(vectors_padding)

prediction

## Prediction 2

In [None]:
X_new_dataset1 = df['review'][20000:]

In [None]:
X_new_dataset1_rating = df['rating'][20000:]

In [None]:
## convert into tokens
tokens = convert_sentences(X_new_dataset1)

## convert tokens into vectors
vectors = embedding(word2vec_wiki, tokens)

# padding the vectors
vectors_padding = pad_sequences(vectors, dtype='float32', padding='post')

## predict
prediction = model.predict(vectors_padding)

# prediction

In [None]:
pred = prediction.tolist()

In [None]:
dct_comp = {'prediction': pred, 'real_score':X_new_dataset1_rating}

In [None]:
df_comp = pd.DataFrame(dct_comp)
df_comp.head(60)