In [1]:
import pandas as pd
from sqlite3 import connect
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn import preprocessing
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn import decomposition
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.normalization import batch_normalization
from tensorflow.keras.metrics import CategoricalAccuracy, AUC
from tensorflow_addons.metrics import F1Score   
import tqdm



In [2]:
# Load the full `games` table from the local SQLite file. The connection is
# closed in a `finally` so it is released even if the query raises.
conn = connect("data/dataset_review.db")
try:
    dataset = pd.read_sql("SELECT * FROM games", conn)
finally:
    conn.close()

# Keep only rows that have both text fields and a user score different from -1
# (presumably -1 is the "no score" sentinel -- TODO confirm against the scraper).
has_text = dataset["summary"].notnull() & dataset["reviews"].notnull()
has_score = dataset["user_review"] != -1

# The stored "index" column is moved into the index and then discarded, leaving
# a clean 0..n-1 RangeIndex that later positional concatenations rely on.
dataset = (
    dataset.loc[has_text & has_score]
    .set_index("index")
    .reset_index(drop=True)
)

In [3]:
# Cluster label (0..29) of every game's summary, cached on disk because the
# k-means fit over the bag-of-words matrix is expensive.
SUMMARY_KMEANS_PATH = "data/summary_kmeans.npy"

try:
    # The cache is a plain integer array, so pickle support is not needed;
    # keeping it disabled avoids executing arbitrary code from a tampered file.
    summary_kmeans = np.load(SUMMARY_KMEANS_PATH, allow_pickle=False)
except FileNotFoundError:
    # No cache yet: rebuild the labels and store them for the next run.
    tfv = CountVectorizer(analyzer='word', stop_words='english', lowercase=True)
    summary = tfv.fit_transform(dataset["summary"])
    kmeans = KMeans(n_clusters=30, random_state=7).fit(summary)
    summary_kmeans = np.array(kmeans.labels_)
    np.save(SUMMARY_KMEANS_PATH, summary_kmeans)

In [4]:
# The cluster labels are joined to the dataset by position, so a stale cache
# with a different row count would silently introduce NaN rows. Fail loudly.
if len(summary_kmeans) != len(dataset):
    raise ValueError(
        "summary_kmeans has %d labels but dataset has %d rows; "
        "regenerate data/summary_kmeans.npy" % (len(summary_kmeans), len(dataset))
    )

# One-hot encode the 30 summary clusters and append them as feature columns.
cluster_columns = ["summary_kmean_" + str(i) for i in range(30)]
categorical_labels = to_categorical(summary_kmeans, num_classes=30)
dataset = pd.concat(
    [
        # Dropping any previous copy keeps the cell safe to re-run.
        dataset.drop(columns=cluster_columns, errors="ignore"),
        pd.DataFrame(categorical_labels, columns=cluster_columns),
    ],
    axis=1,
)

# Target: floor(user_review * 10), one-hot encoded. np.floor is applied to the
# Series directly; Series.apply has no `axis` parameter, and passing one is
# either ignored or forwarded to np.floor (TypeError) depending on the pandas
# version. dtype=float keeps the labels numeric on pandas >= 2, where
# get_dummies returns bool by default.
y = pd.get_dummies(np.floor(dataset["user_review"] * 10), prefix='label', dtype=float)
x = dataset.drop(columns=["user_review"])

In [5]:
# Bag-of-words counts over the review text (English stop words removed).
count_vec = CountVectorizer(stop_words='english', analyzer='word')
review_texts = dataset["reviews"].values
review_counts = count_vec.fit_transform(review_texts)
# Densify: later cells index and multiply this as a plain ndarray.
word_vecs = review_counts.toarray()


In [6]:
def word_seperator(data, vectorizer, word_list):
    """Return the columns of `data` that belong to the words in `word_list`.

    Each word is looked up in ``vectorizer.vocabulary_`` to obtain its column
    position; the selected columns are returned in the order of `word_list`.
    A word that is absent from the vocabulary raises ``KeyError``.
    """
    vocabulary = vectorizer.vocabulary_
    column_positions = []
    for token in word_list:
        column_positions.append(vocabulary[token])
    return data[:, column_positions]

In [7]:
# Load the GloVe vectors: one row per token, token in column 0, vector after it.
glove = pd.read_csv(
    'data/glove.840B.300d.txt',
    sep=" ",
    header=None,
    index_col=0,
    # 3 == csv.QUOTE_NONE: the vocabulary contains the literal '"' token, which
    # would otherwise open a quoted field and swallow the following lines.
    quoting=3,
    # Tokens such as "null", "nan" or "NA" are real words here, not missing values.
    na_filter=False,
)

# Keep only the tokens that also occur in the review vocabulary ...
glove = glove.filter(items=count_vec.get_feature_names_out(), axis=0)
# ... and reduce the count matrix to the same words, in the same order, so that
# column j of word_vecs corresponds to row j of glove.
word_vecs = word_seperator(data=word_vecs, vectorizer=count_vec, word_list=glove.index)

In [8]:
# Disabled experiment: scale the raw counts and reduce them with truncated SVD
# instead of projecting them onto the GloVe vectors.
# sc = preprocessing.StandardScaler(with_mean=False)
# svd = decomposition.TruncatedSVD(n_components=700)
# svd_word_vecs = svd.fit_transform(sc.fit_transform(word_vecs))

# The raw text columns are not fed to the model directly.
x = x.drop(columns=["name", "summary", "reviews"])

# Per-review embedding: word counts multiplied by the matching GloVe vectors.
review_embeddings = pd.DataFrame(word_vecs.dot(glove))

# dtype=float: on pandas >= 2 get_dummies returns bool columns, and a frame that
# mixes bool with float columns converts to an object array that Keras cannot
# turn into a tensor.
platform_dummies = pd.get_dummies(x.platform, prefix='plat', dtype=float)

x = pd.concat([x, review_embeddings, platform_dummies], axis=1)
x = x.drop(columns=["platform", "publish_year", "publish_month"])

In [9]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=7,
)
x_train, x_valid, y_train, y_valid = split_parts

In [10]:
# Feed-forward classifier: two 300-unit ReLU blocks (each followed by dropout
# and batch normalisation) and a softmax over the one-hot label columns.
n_features = x_train.shape[1]
n_classes = len(y.columns)

model = Sequential([
    Dense(300, input_dim=n_features, activation='relu'),
    Dropout(0.2),
    batch_normalization.BatchNormalization(),

    Dense(300, activation='relu'),
    Dropout(0.2),
    batch_normalization.BatchNormalization(),

    Dense(n_classes),
    Activation('softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[CategoricalAccuracy(), F1Score(n_classes)],
)

In [11]:
# Train for 20 epochs in mini-batches of 64, scoring the held-out split after
# every epoch. The returned History object is the cell's displayed value.
fit_options = {
    "batch_size": 64,
    "epochs": 20,
    "verbose": 1,
    "validation_data": (x_valid, y_valid),
}
model.fit(x_train, y=y_train, **fit_options)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1b10af9bdc0>