Bima Surya Nurwahid

Dataset : https://www.kaggle.com/datasets/ramadhaniduma/tripadvisor-hotel-reviews

In [92]:
!kaggle datasets download -d ramadhaniduma/tripadvisor-hotel-reviews

Downloading tripadvisor-hotel-reviews.zip to /content
  0% 0.00/5.14M [00:00<?, ?B/s] 97% 5.00M/5.14M [00:00<00:00, 52.3MB/s]
100% 5.14M/5.14M [00:00<00:00, 53.4MB/s]


In [93]:
import zipfile

zip_ref = zipfile.ZipFile("/content/tripadvisor-hotel-reviews.zip", 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [101]:
import csv
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re
import nltk as nltk
from nltk import corpus
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM,Dense,Embedding,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [97]:
missing_values = ["n/a", "na", "--"]
df = pd.read_csv('/content/tripadvisor_hotel_reviews.csv', encoding='latin-1')
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [98]:
df.isna().sum()

Review    0
Rating    0
dtype: int64

In [99]:
def tokenize(text):
    split=re.split("\W+",text) 
    return split
df['review_text_split']=df['Review'].apply(lambda x: tokenize(x.lower()))
df.head(100)

Unnamed: 0,Review,Rating,review_text_split
0,nice hotel expensive parking got good deal sta...,4,"[nice, hotel, expensive, parking, got, good, d..."
1,ok nothing special charge diamond member hilto...,2,"[ok, nothing, special, charge, diamond, member..."
2,nice rooms not 4* experience hotel monaco seat...,3,"[nice, rooms, not, 4, experience, hotel, monac..."
3,"unique, great stay, wonderful time hotel monac...",5,"[unique, great, stay, wonderful, time, hotel, ..."
4,"great stay great stay, went seahawk game aweso...",5,"[great, stay, great, stay, went, seahawk, game..."
...,...,...,...
95,"excellent stay staff friendly helpful, nice ho...",4,"[excellent, stay, staff, friendly, helpful, ni..."
96,return going seattle booked hotel knowing budg...,4,"[return, going, seattle, booked, hotel, knowin..."
97,terrible hotel approximately 2 weeks ago april...,1,"[terrible, hotel, approximately, 2, weeks, ago..."
98,great price okay experience stayed inn queen a...,3,"[great, price, okay, experience, stayed, inn, ..."


In [102]:
nltk.download('stopwords')
stopword = nltk.corpus.stopwords.words('english')
print(stopword[:20])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [103]:
def remove_stopwords(text):
    text=[word for word in text if word not in stopword]
    return text
df['review_text_stopwords'] = df['review_text_split'].apply(lambda x: remove_stopwords(x))
df.head(1000)

Unnamed: 0,Review,Rating,review_text_split,review_text_stopwords
0,nice hotel expensive parking got good deal sta...,4,"[nice, hotel, expensive, parking, got, good, d...","[nice, hotel, expensive, parking, got, good, d..."
1,ok nothing special charge diamond member hilto...,2,"[ok, nothing, special, charge, diamond, member...","[ok, nothing, special, charge, diamond, member..."
2,nice rooms not 4* experience hotel monaco seat...,3,"[nice, rooms, not, 4, experience, hotel, monac...","[nice, rooms, 4, experience, hotel, monaco, se..."
3,"unique, great stay, wonderful time hotel monac...",5,"[unique, great, stay, wonderful, time, hotel, ...","[unique, great, stay, wonderful, time, hotel, ..."
4,"great stay great stay, went seahawk game aweso...",5,"[great, stay, great, stay, went, seahawk, game...","[great, stay, great, stay, went, seahawk, game..."
...,...,...,...,...
995,average price hotel good location 1.5 blocks u...,3,"[average, price, hotel, good, location, 1, 5, ...","[average, price, hotel, good, location, 1, 5, ..."
996,good customer service recently wrote dissatisf...,4,"[good, customer, service, recently, wrote, dis...","[good, customer, service, recently, wrote, dis..."
997,ignore bad press just post review reading nega...,4,"[ignore, bad, press, just, post, review, readi...","[ignore, bad, press, post, review, reading, ne..."
998,"business trip ok hotel fine evening business, ...",3,"[business, trip, ok, hotel, fine, evening, bus...","[business, trip, ok, hotel, fine, evening, bus..."


In [104]:
stemmer = nltk.stem.SnowballStemmer(language='english')
def stem_list(row):
    my_list = row['review_text_stopwords']
    stemmed_list = [stemmer.stem(word) for word in my_list]
    return (stemmed_list)

df['stemmed_review'] = df.apply(stem_list, axis=1)

In [106]:
def rating_group(rate):
    group = rate['Rating']
    if rate['Rating'] > 3:
        grouped_rate = 'Good'
    elif rate['Rating'] == 3:
        grouped_rate = 'Neutral'
    elif rate['Rating'] < 3:
        grouped_rate = 'Bad'
    return (grouped_rate)

df['new_rating'] = df.apply(rating_group, axis=1)

In [107]:
rating_category = pd.get_dummies(df.new_rating)
df_new = pd.concat([df, rating_category], axis=1)
df_new = df_new.drop(columns='Rating')
df_new

Unnamed: 0,Review,review_text_split,review_text_stopwords,stemmed_review,new_rating,Bad,Good,Neutral
0,nice hotel expensive parking got good deal sta...,"[nice, hotel, expensive, parking, got, good, d...","[nice, hotel, expensive, parking, got, good, d...","[nice, hotel, expens, park, got, good, deal, s...",Good,0,1,0
1,ok nothing special charge diamond member hilto...,"[ok, nothing, special, charge, diamond, member...","[ok, nothing, special, charge, diamond, member...","[ok, noth, special, charg, diamond, member, hi...",Bad,1,0,0
2,nice rooms not 4* experience hotel monaco seat...,"[nice, rooms, not, 4, experience, hotel, monac...","[nice, rooms, 4, experience, hotel, monaco, se...","[nice, room, 4, experi, hotel, monaco, seattl,...",Neutral,0,0,1
3,"unique, great stay, wonderful time hotel monac...","[unique, great, stay, wonderful, time, hotel, ...","[unique, great, stay, wonderful, time, hotel, ...","[uniqu, great, stay, wonder, time, hotel, mona...",Good,0,1,0
4,"great stay great stay, went seahawk game aweso...","[great, stay, great, stay, went, seahawk, game...","[great, stay, great, stay, went, seahawk, game...","[great, stay, great, stay, went, seahawk, game...",Good,0,1,0
...,...,...,...,...,...,...,...,...
20486,"best kept secret 3rd time staying charm, not 5...","[best, kept, secret, 3rd, time, staying, charm...","[best, kept, secret, 3rd, time, staying, charm...","[best, kept, secret, 3rd, time, stay, charm, 5...",Good,0,1,0
20487,great location price view hotel great quick pl...,"[great, location, price, view, hotel, great, q...","[great, location, price, view, hotel, great, q...","[great, locat, price, view, hotel, great, quic...",Good,0,1,0
20488,"ok just looks nice modern outside, desk staff ...","[ok, just, looks, nice, modern, outside, desk,...","[ok, looks, nice, modern, outside, desk, staff...","[ok, look, nice, modern, outsid, desk, staff, ...",Bad,1,0,0
20489,hotel theft ruined vacation hotel opened sept ...,"[hotel, theft, ruined, vacation, hotel, opened...","[hotel, theft, ruined, vacation, hotel, opened...","[hotel, theft, ruin, vacat, hotel, open, sept,...",Bad,1,0,0


In [108]:
review = df_new['stemmed_review'].values
label = df_new[['Bad','Good','Neutral']].values
label

array([[0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       ...,
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0]], dtype=uint8)

In [109]:
print(review.shape, label.shape)

(20491,) (20491, 3)


In [116]:
x_train,x_test,y_train,y_test = train_test_split(review, label, test_size = 0.2,shuffle=True)

In [117]:
tokenizer = Tokenizer(num_words=10000, oov_token='x')
tokenizer.fit_on_texts(x_train)
tokenizer.fit_on_texts(x_test)

sekuens_train = tokenizer.texts_to_sequences(x_train)
sekuens_test = tokenizer.texts_to_sequences(x_test)
 
padded_train = pad_sequences(sekuens_train)
padded_test = pad_sequences(sekuens_test)

print(padded_test.shape)

(4099, 1858)


In [120]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=50000, output_dim=16),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3, activation='softmax')
     ])

model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])
model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, None, 16)          800000    
                                                                 
 batch_normalization_2 (Batc  (None, None, 16)         64        
 hNormalization)                                                 
                                                                 
 dropout_22 (Dropout)        (None, None, 16)          0         
                                                                 
 lstm_13 (LSTM)              (None, 128)               74240     
                                                                 
 dense_33 (Dense)            (None, 128)               16512     
                                                                 
 dropout_23 (Dropout)        (None, 128)               0         
                                                     

In [125]:
class nlpCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if(logs.get('accuracy')>0.8 and logs.get('val_accuracy')>0.8):
      self.model.stop_training = True
      print("\nAkurasi train telah mencapai nilai > 90%!")

callbacks = nlpCallback()

In [127]:
epoch=100
history = model.fit(padded_train, 
                    y_train,
                    batch_size=32, 
                    epochs=epoch,
                    validation_data=(padded_test, y_test), 
                    verbose=2,
                    callbacks=[callbacks]
                    )

Epoch 1/100

Akurasi train telah mencapai nilai > 90%!
513/513 - 694s - loss: 0.2823 - accuracy: 0.8973 - val_loss: 0.7479 - val_accuracy: 0.8404 - 694s/epoch - 1s/step
