In [1]:
import os
import pandas as pd
from myutils import rule_based_classification


# Read the labelled game-review data into a DataFrame.
df = pd.read_csv(filepath_or_buffer="Game_data/train.csv")

# Preview five randomly chosen rows.
df.sample(n=5)

Unnamed: 0,review_id,title,year,user_review,user_suggestion
4196,5592,Yu-Gi-Oh! Duel Links,2018.0,"unbelievably unbalanced, some cards are game b...",0
3054,3518,War Thunder,2017.0,---{Graphics}---☐ You forget what reality is☐ ...,1
10105,15513,theHunter Classic,2015.0,The best hunting game around.Some people feel ...,1
15431,22866,Crusaders of the Lost Idols,2017.0,Unfortunately this game has no real offline pr...,0
13645,19589,Creativerse,2016.0,"Early Access ReviewThis game is lot of fun , i...",1


In [2]:
# Summarise the frame: column names, dtypes and non-null counts
# (the recorded output shows `year` is the only column with missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17494 entries, 0 to 17493
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   review_id        17494 non-null  int64  
 1   title            17494 non-null  object 
 2   year             17316 non-null  float64
 3   user_review      17494 non-null  object 
 4   user_suggestion  17494 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 683.5+ KB


In [3]:
# Count how many reviews carry each label value in `user_suggestion`.
df["user_suggestion"].value_counts()

1    9968
0    7526
Name: user_suggestion, dtype: int64

In [4]:
# CNN
from keras.models import load_model
from myutils import clean_doc
import pickle
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import numpy as np


# Sequence length handed to pad_sequences for every review.
MAX_SEQUENCE_LENGTH = 1000

# Restore the tokenizer saved for the CNN model.
# NOTE(review): pickle.load can execute arbitrary code; only load tokenizer
# files that come from a trusted source.
with open('model/tokenizer_cnn.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Inputs are the reviews passed through clean_doc; targets are the
# `user_suggestion` column of the frame loaded above.
X_test = list(map(clean_doc, df['user_review']))
y_test = df["user_suggestion"]

# Convert text to integer sequences, then pad at the end up to the fixed length.
sequences_test = tokenizer.texts_to_sequences(X_test)
X_test_word = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Collected scores; only a single evaluation run is appended below.
test_loss, test_accs = [], []

# Load the saved CNN and score it against the one-hot encoded labels.
cnn_ = load_model(f"model_cnn.h5")
score = cnn_.evaluate(X_test_word, to_categorical(y_test), verbose=1)
test_loss.append(score[0])
test_accs.append(score[1])

# Report mean and spread of the collected scores.
print(f"\nLoss / accuracy on testset: {np.mean(test_loss)} loss / {np.mean(test_accs)} acc")
print(f"Standard deviation: (+-{np.std(test_loss)}) loss / (+-{np.std(test_accs)}) acc")


Loss / accuracy on testset: 0.67690509557724 loss / 0.5961472392082214 acc
Standard deviation: (+-0.0) loss / (+-0.0) acc


In [5]:
# Run the rule-based step from myutils on the CNN predictions and print
# the value it returns.
pre_resu_cnn = cnn_.predict(X_test_word, batch_size=30)
cnn_impr_result = rule_based_classification(X_test, pre_resu_cnn)
print('Accuracy improves %.2f' % cnn_impr_result)

Accuracy improves 0.01


In [6]:
# RNN
from keras.models import load_model
from myutils import clean_doc
import pickle
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import numpy as np

# Evaluation of the saved RNN model.

# Sequence length handed to pad_sequences for every review.
MAX_SEQUENCE_LENGTH = 1000

# Restore the tokenizer saved for the RNN model.
# NOTE(review): pickle.load can execute arbitrary code; only load tokenizer
# files that come from a trusted source.
with open('model/tokenizer_rnn.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Inputs are the reviews passed through clean_doc; targets are the
# `user_suggestion` column of the frame loaded above.
X_test = list(map(clean_doc, df['user_review']))
y_test = df["user_suggestion"]

# Convert text to integer sequences, then pad at the end up to the fixed length.
sequences_test_rn = tokenizer.texts_to_sequences(X_test)
X_test_word_rn = pad_sequences(sequences_test_rn, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Collected scores; only a single evaluation run is appended below.
test_loss, test_accs = [], []

# Load the saved RNN and score it against the one-hot encoded labels.
rnn_ = load_model(f"model_rnn.h5")
score = rnn_.evaluate(X_test_word_rn, to_categorical(y_test), verbose=1)
test_loss.append(score[0])
test_accs.append(score[1])

# Report mean and spread of the collected scores.
print(f"\nLoss / accuracy on testset: {np.mean(test_loss)} loss / {np.mean(test_accs)} acc")
print(f"Standard deviation: (+-{np.std(test_loss)}) loss / (+-{np.std(test_accs)}) acc")


Loss / accuracy on testset: 0.6911532282829285 loss / 0.5691093802452087 acc
Standard deviation: (+-0.0) loss / (+-0.0) acc


In [7]:
# Run the rule-based step from myutils on the RNN predictions and print
# the value it returns.
pre_resu_rnn = rnn_.predict(X_test_word_rn, batch_size=30)
rnn_impr_result = rule_based_classification(X_test, pre_resu_rnn)
print("Accuracy improves %.2f" % rnn_impr_result)

Accuracy improves 0.05


In [11]:
from keras.models import load_model
from myutils import clean_doc, clean_doc_HAN
import pickle
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import numpy as np
from keras.preprocessing.text import text_to_word_sequence


# Evaluation of the saved HAN model.

MAX_SENTS = 15          # sentences kept per review (second axis of the input tensor)
MAX_SENT_LENGTH = 100   # word ids kept per sentence (third axis of the input tensor)
MAX_NB_WORDS = 15000    # only word ids strictly below this value are written

# Restore the tokenizer saved for the HAN model.
# NOTE(review): pickle.load can execute arbitrary code; only load tokenizer
# files that come from a trusted source.
with open('model/tokenizer_han.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

X_test, y_test = [clean_doc(x) for x in df['user_review']], df["user_suggestion"]

# Not consumed anywhere in this cell (the HAN input is built by hand below);
# kept so the name remains defined for any code that may rely on it.
sequences_test_han = tokenizer.texts_to_sequences(X_test)

# One (MAX_SENTS x MAX_SENT_LENGTH) grid of word ids per review; 0 means "empty".
data_test_han = np.zeros((len(X_test), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

# NOTE(review): clean_doc_HAN receives text that already went through
# clean_doc -- confirm that sentence boundaries survive clean_doc.
reviews = [clean_doc_HAN(x) for x in X_test]

# Hoisted so the attribute is resolved once instead of once per word.
word_index = tokenizer.word_index

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j >= MAX_SENTS:
            # Sentences beyond the cap can never be written; stop early.
            break
        k = 0
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                # The row for this sentence is full.
                break
            idx = word_index.get(word)
            # Skip words unknown to the tokenizer and ids outside the vocabulary cap.
            if idx is not None and idx < MAX_NB_WORDS:
                data_test_han[i, j, k] = idx
                k += 1

X_test_word_han = data_test_han

# Collected scores; only a single evaluation run is appended below.
test_loss = []
test_accs = []

# Load the saved HAN and score it against the one-hot encoded labels.
han_ = load_model("model_han_.h5")
score = han_.evaluate(X_test_word_han, to_categorical(y_test), verbose=1)
test_loss.append(score[0])
test_accs.append(score[1])

print(f"\nLoss / accuracy on testset: {np.mean(test_loss)} loss / {np.mean(test_accs)} acc")
print(f"Standard deviation: (+-{np.std(test_loss)}) loss / (+-{np.std(test_accs)}) acc")



Loss / accuracy on testset: 0.5670555233955383 loss / 0.7036126852035522 acc
Standard deviation: (+-0.0) loss / (+-0.0) acc


In [12]:
# Run the rule-based step from myutils on the HAN predictions and print
# the value it returns.
pre_resu_han = han_.predict(X_test_word_han, batch_size=30)
han_impr_result = rule_based_classification(X_test, pre_resu_han)
print("Accuracy improves %.2f" % han_impr_result)

Accuracy improves 0.02
