In [1]:
import pandas as pd
from sqlite3 import connect
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn import preprocessing
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split


In [2]:
# Load the whole `games` table into a DataFrame. The connection is closed in a
# `finally` clause so it is released even when the query raises.
conn = connect("data/dataset_review.db")
try:
    dataset = pd.read_sql("SELECT * FROM games", conn)
finally:
    conn.close()

# Use the stored "index" column as the row index while filtering, keep only rows
# that have both a summary and reviews and whose user_review is not -1
# (presumably a "no score" marker - confirm), then renumber the rows 0..n-1.
dataset = dataset.set_index("index")
rows_to_keep = (
    dataset["summary"].notnull()
    & dataset["reviews"].notnull()
    & (dataset["user_review"] != -1)
)
dataset = dataset.loc[rows_to_keep].reset_index(drop=True)

In [3]:
# Bag-of-words counts for the game summaries, with English stop words removed.
# CountVectorizer lowercases its input by default, so the vocabulary and the
# count matrix are built in a single fit_transform pass over the raw text.
tfv = CountVectorizer(stop_words="english", analyzer="word")
summary = tfv.fit_transform(dataset["summary"])

# Group the summaries into 10 clusters, using a fixed random_state for the
# centroid initialisation.
kmeans = KMeans(random_state=7, n_clusters=10)
kmeans.fit(summary)

# Copy the per-row cluster ids out of the fitted model and persist them to disk.
summary_kmeans = np.array(kmeans.labels_)
np.save("data/summary_kmeans.npy", summary_kmeans)

# To reuse the labels saved by a previous run instead of re-running KMeans, load them with:
# summary_kmeans = np.load("data/summary_kmeans.npy", allow_pickle=True)

In [4]:
# One-hot encode the cluster id of every row: 10 clusters -> 10 indicator columns
# named summary_kmean_0 ... summary_kmean_9.
categorical_labels = to_categorical(summary_kmeans, num_classes=10)
onehot_frame = pd.DataFrame(
    categorical_labels,
    columns=["summary_kmean_{}".format(i) for i in range(10)],
)

# Append the indicator columns side by side with the existing columns
# (rows are matched on the index).
dataset = pd.concat([dataset, onehot_frame], axis=1)

# Features are every column except the target; the target is user_review.
x = dataset.drop(columns=["user_review"])
y = dataset["user_review"]

In [6]:

# Bag-of-words features for the model input.
# Iterating a DataFrame yields its column labels, so fitting the vectorizer on a
# whole frame learns a vocabulary from the column names and returns one row per
# column. The vectorizer is therefore fitted on a single text column, which
# gives one row per dataset row (the same rows `y` is taken from).
# NOTE(review): assumes `reviews` is the free-text feature intended here, and
# that the other columns (including the summary_kmean_* indicators) are meant
# to be left out of the resulting matrix - confirm the desired feature set.
tfv = CountVectorizer(analyzer="word", stop_words="english")
x = tfv.fit_transform(dataset["reviews"])

In [None]:
# Hold out 20% of the rows for validation (shuffled, fixed random_state).
# `x` is the matrix returned by the vectorizer, which has no `.text` attribute,
# so it is handed to train_test_split directly. It must contain one row per
# entry of `y`.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=7,
    shuffle=True,
)