In [1]:
import pandas as pd
from sqlite3 import connect
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn import preprocessing
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn import decomposition
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.normalization import batch_normalization
import tensorflow
from tensorflow_addons.metrics import RSquare
from sklearn.preprocessing import StandardScaler


In [2]:
def word_seperator(data, vectorizer, word_list):
    """Select the columns of ``data`` that belong to the given words.

    Args:
        data: 2-D array indexed as ``data[:, columns]``; column positions are
            looked up in ``vectorizer.vocabulary_``.
        vectorizer: object exposing a ``vocabulary_`` mapping from word to
            column position.
        word_list: object with a ``.values`` attribute that iterates over the
            words to keep (a pandas Index at the call sites in this notebook).

    Returns:
        ``data`` restricted to the looked-up columns, in ``word_list`` order.

    Raises:
        KeyError: if a word is missing from ``vectorizer.vocabulary_``.
    """
    vocabulary = vectorizer.vocabulary_
    column_positions = []
    for token in word_list.values:
        column_positions.append(vocabulary[token])
    return data[:, column_positions]

In [3]:
# Read the whole `games` table from the SQLite review database.
conn = connect("data/dataset_review.db")
try:
    dataset = pd.read_sql("SELECT * FROM games", conn)
finally:
    # Release the database handle even if the query raises; a sqlite3
    # connection is not closed for us when an exception propagates.
    conn.close()

# Keep only rows that have both text fields and a user score other than -1
# (presumably a "no score" sentinel -- confirm against the scraper), then
# renumber the rows from 0 so later column-wise concatenations line up.
dataset = dataset.set_index("index")
has_text = dataset["summary"].notnull() & dataset["reviews"].notnull()
has_score = dataset["user_review"] != -1
dataset = dataset.loc[has_text & has_score].reset_index(drop=True)

In [4]:
# One-off preprocessing, kept commented out because its result is cached on
# disk: the pipeline below ends by writing data/summary_kmeans.npy, which is the
# file the active line at the bottom of this cell loads. Un-comment to rebuild.
#
# Step 1: bag-of-words counts over the game summaries.
# tfv = CountVectorizer(analyzer='word',stop_words = 'english', lowercase=True)
# tfv.fit(dataset["summary"])
# summary = tfv.transform(dataset["summary"]).toarray()

# Step 2: load the GloVe vectors, keep only words in the vectorizer vocabulary,
# and restrict the count matrix to those same words.
# glove = pd.read_csv('data/glove.840B.300d.txt', sep=" ", header=None)
# glove.set_index(0, inplace=True)
# glove = glove.filter(items=tfv.get_feature_names_out(), axis=0)
# summary = word_seperator(data=summary, vectorizer=tfv, word_list=glove.index)


# Step 3: cluster the count-weighted GloVe sums (summary.dot(glove)) into 30 groups.
# kmeans = KMeans(n_clusters=30, random_state=7).fit(pd.DataFrame(summary.dot(glove)))
# del glove

# Step 4: persist the per-row cluster labels.
# summary_kmeans = np.array(kmeans.labels_)
# del kmeans
# np.save("data/summary_kmeans.npy", summary_kmeans)

# Load the cached cluster labels.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects from
# the file. The commented-out code above saves a plain label array, so the flag
# is presumably unnecessary -- confirm and consider dropping it.
summary_kmeans = np.load("data/summary_kmeans.npy", allow_pickle=True)

In [5]:
# Number of summary clusters; used both for the one-hot width and the column
# names so the two cannot drift apart.
N_SUMMARY_CLUSTERS = 30

# The labels come from a cached .npy file while `dataset` is queried fresh from
# the database. Fail loudly on a stale cache: a length mismatch would otherwise
# be padded with NaN rows by the column-wise concat below.
if len(summary_kmeans) != len(dataset):
    raise ValueError(
        f"summary_kmeans has {len(summary_kmeans)} labels but dataset has "
        f"{len(dataset)} rows; regenerate data/summary_kmeans.npy"
    )

categorical_labels = to_categorical(summary_kmeans, num_classes=N_SUMMARY_CLUSTERS)
cluster_columns = ["summary_kmean_" + str(i) for i in range(N_SUMMARY_CLUSTERS)]
# Reuse dataset's own index so every one-hot row is attached to its source row.
cluster_frame = pd.DataFrame(categorical_labels, index=dataset.index, columns=cluster_columns)
# Drop any cluster columns left by a previous run of this cell so re-running it
# does not append duplicates.
dataset = pd.concat([dataset.drop(columns=cluster_columns, errors="ignore"), cluster_frame], axis=1)

# Target is the user score; everything else is a candidate feature.
y = dataset["user_review"]
x = dataset.drop(columns=["user_review"])

In [6]:
# One-off preprocessing, kept commented out because its result is cached on
# disk: the pipeline below ends by writing data/reviews_glove.npy, which is the
# file the active line at the bottom of this cell loads. Un-comment to rebuild.
#
# Bag-of-words counts over the review text, restricted to words that have a
# GloVe vector.
# count_vec = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}', stop_words = 'english')
# word_vecs = count_vec.fit_transform(dataset["reviews"].values).toarray()
# glove = pd.read_csv('data/glove.840B.300d.txt', sep=" ", header=None)
# glove.set_index(0, inplace=True)
# glove = glove.filter(items=count_vec.get_feature_names_out(), axis=0)
# word_vecs = word_seperator(data=word_vecs, vectorizer=count_vec, word_list=glove.index)

# Count-weighted sum of GloVe vectors per review, saved for reuse.
# reviews = word_vecs.dot(glove)
# np.save("data/reviews_glove.npy", reviews)

# Load the cached review features.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects from
# the file. The commented-out code above saves the result of a matrix product,
# so the flag is presumably unnecessary -- confirm and consider dropping it.
reviews = np.load("data/reviews_glove.npy", allow_pickle=True)

In [7]:
# Columns removed before modelling: the raw text columns, the platform label
# (replaced by the one-hot columns built below) and the publish date parts.
non_feature_columns = ["name", "summary", "reviews", "platform", "publish_year", "publish_month"]

# `reviews` is loaded from a cached file; make sure it still matches the rows
# of `x` instead of letting the concat pad a mismatch with NaN.
if len(reviews) != len(x):
    raise ValueError(
        f"reviews has {len(reviews)} rows but x has {len(x)} rows; "
        "regenerate data/reviews_glove.npy"
    )

# Give the review features string column names. A bare pd.DataFrame(reviews)
# has integer labels, and scikit-learn rejects frames that mix string and
# non-string column names.
review_features = pd.DataFrame(reviews, index=x.index).add_prefix("review_glove_")
platform_dummies = pd.get_dummies(x["platform"], prefix="plat")

x = pd.concat([x.drop(columns=non_feature_columns), review_features, platform_dummies], axis=1)

# with_mean=False scales each column without centering it.
# NOTE(review): the scaler and the SVD are fitted on every row before the
# train/validation split, so validation rows influence the transform. Consider
# splitting first and fitting on the training rows only.
x = StandardScaler(with_mean=False).fit_transform(x)

# Reduce the scaled features to 200 components.
svd = decomposition.TruncatedSVD(n_components=200, n_iter=7, random_state=7)
x = svd.fit_transform(x)



In [8]:
# Hold out 20% of the rows for validation; shuffled with a fixed seed so the
# split is the same on every run.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=7,
)

In [9]:
# Seed TensorFlow so weight initialisation and dropout are repeatable between
# runs; 7 matches the random_state used elsewhere in this notebook.
tensorflow.random.set_seed(7)

# Two identical hidden stages (Dense -> Dropout -> BatchNormalization) followed
# by a single-unit output. The final ReLU keeps predictions non-negative, which
# the logarithmic loss below needs.
model = Sequential([
    Dense(300, input_dim=x_train.shape[1], activation='relu'),
    Dropout(0.2),
    batch_normalization.BatchNormalization(),

    Dense(300, activation='relu'),
    Dropout(0.2),
    batch_normalization.BatchNormalization(),

    Dense(1),
    Activation('relu'),
])

# Train on mean squared logarithmic error; report RMSE and R^2 alongside it.
model.compile(
    loss="mean_squared_logarithmic_error",
    optimizer='adam',
    metrics=[
        tensorflow.keras.metrics.RootMeanSquaredError(),
        RSquare(dtype=tensorflow.float32, y_shape=(1,)),
    ],
)

In [10]:
# Train for 20 epochs in mini-batches of 64, evaluating on the held-out rows
# after every epoch. The History object is left as the cell's displayed value.
model.fit(
    x_train,
    y=y_train,
    validation_data=(x_valid, y_valid),
    epochs=20,
    batch_size=64,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1c58672a280>

In [11]:
# Pair every validation prediction with its true score for a side-by-side look.
valid_predictions = model.predict(x_valid)
tuple(zip(valid_predictions, y_valid))

((array([0.68794715], dtype=float32), 0.82),
 (array([0.7597153], dtype=float32), 0.8),
 (array([0.75089777], dtype=float32), 0.74),
 (array([0.6061145], dtype=float32), 0.7),
 (array([0.78449225], dtype=float32), 0.85),
 (array([0.74082863], dtype=float32), 0.78),
 (array([0.6687566], dtype=float32), 0.75),
 (array([0.75020576], dtype=float32), 0.7),
 (array([0.7221638], dtype=float32), 0.8099999999999999),
 (array([0.70429975], dtype=float32), 0.6599999999999999),
 (array([0.60890174], dtype=float32), 0.64),
 (array([0.5999727], dtype=float32), 0.62),
 (array([0.7467593], dtype=float32), 0.6900000000000001),
 (array([0.7750497], dtype=float32), 0.86),
 (array([0.81415206], dtype=float32), 0.75),
 (array([0.7862397], dtype=float32), 0.8),
 (array([0.8271906], dtype=float32), 0.89),
 (array([0.67150855], dtype=float32), 0.67),
 (array([0.66694707], dtype=float32), 0.7),
 (array([0.8285606], dtype=float32), 0.77),
 (array([0.6939799], dtype=float32), 0.41),
 (array([0.76684487], dtype=f