In [65]:
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import remove_stopwords
from database.pymysql_conn import DataBase
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from collections import Counter
from pathlib import Path
from nltk.corpus import stopwords
from scipy.stats import entropy
from matplotlib import pyplot as plt

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
np.random.seed(777)

In [3]:
db = DataBase()

In [10]:
SQL="""
SELECT 
    T1.appid,
    T1.avg_player_count,
    T1.gameName,
    T1.release_date,
    T2.publishedAt,
    T2.text,    
    DATEDIFF(T2.publishedAt, T1.release_date) as datediff
FROM
    (SELECT 
        A.appid, A.gameName, A.avg_player_count, B.release_date
    FROM
        (SELECT 
        *
    FROM
        yt.games) A
    JOIN (SELECT 
        appid, name, MAX(release_date) AS release_date
    FROM
        oasis.app_info2
    GROUP BY appid) AS B ON A.appid = B.appid) T1
        LEFT JOIN
    (SELECT 
        appid, gameName, text, publishedAt
    FROM
        steam.yt_comment
    WHERE
        filter = 0 AND language = 'en') T2 ON T1.appid = T2.appid
WHERE
    DATEDIFF(T2.publishedAt, T1.release_date) <= 300
"""

In [11]:
df = db.to_df(SQL)

In [13]:
success = ["Dota Underlords",
            "Borderlands GOTY Enhanced",
            "Anno 1800",
            "F1 2019",
            "DEAD OR ALIVE Xtreme Venus Vacation",
            "RAGE 2",
            "OCTOPATH TRAVELER",
            "Lords Mobile",
            "Pro Cycling Manager 2019",
            "Ironsight",
            "Yakuza Kiwami 2", # 10
            "Pagan Online",
            "Monster Girl Island Prologue",
            "Winning Post 9",
            "Assassins Creed III Remastered",
            "Otakus Adventure",
            "SUPER DRAGON BALL HEROES WORLD MISSION",
            "AVA Dog Tag"] # 500++

In [14]:
df_success = df[df['gameName'].isin(success)]
df_fail = df[~df['gameName'].isin(success)]

In [19]:
# 성공게임 출시전 댓글
groupA = df_success[df_success['datediff'] < 0]
# 성공게임 출시후 댓글
groupB = df_success[df_success['datediff'] >= 0]
# 보통게임 출시후 댓글
groupC = df_fail[df_fail['datediff'] >=0]

In [22]:
def clean_text(text):
    t = re.sub('[^a-zA-Z0-9]',' ', text).strip()
    t = re.sub(' +', ' ', t)
    t = t.lower()
    # Stop words
    t = remove_stopwords(t)
    return t

In [23]:
comments_A = groupA['text'].apply(clean_text).str.split().to_numpy()
comments_B = groupB['text'].apply(clean_text).str.split().to_numpy()
comments_C = groupC['text'].apply(clean_text).str.split().to_numpy()

In [49]:
gameName_A = groupA['gameName'].to_numpy()
gameName_B = groupB['gameName'].to_numpy()
gameName_C = groupC['gameName'].to_numpy()

# Count

In [69]:
corpus_A = [word for sent in comments_A for word in sent]
corpus_B = [word for sent in comments_B for word in sent]
corpus_C = [word for sent in comments_C for word in sent]

In [106]:
counter_A = Counter(corpus_A)
counter_B = Counter(corpus_B)
counter_C = Counter(corpus_C)

In [113]:
pd.DataFrame(counter_A.most_common(10), columns=['word', 'count'])

Unnamed: 0,word,count
0,game,27240
1,s,16830
2,t,14332
3,like,14136
4,looks,7189
5,m,6199
6,good,5671
7,2,5652
8,play,5500
9,games,4936


In [117]:
pd.DataFrame(counter_B.most_common(10), columns=['word', 'count'])

Unnamed: 0,word,count
0,game,12364
1,s,8150
2,t,6493
3,like,6152
4,2,3434
5,play,3331
6,good,2884
7,love,2700
8,m,2572
9,3,2499


In [116]:
pd.DataFrame(counter_C.most_common(10), columns=['word', 'count'])

Unnamed: 0,word,count
0,s,32486
1,t,24409
2,game,24254
3,like,21451
4,mark,16947
5,m,14156
6,play,12526
7,love,11821
8,video,9971
9,fnaf,9920


In [107]:
[(word, cnt) for word, cnt in counter_A.most_common() if cnt > 1000]

[('game', 27240),
 ('s', 16830),
 ('t', 14332),
 ('like', 14136),
 ('looks', 7189),
 ('m', 6199),
 ('good', 5671),
 ('2', 5652),
 ('play', 5500),
 ('games', 4936),
 ('video', 4624),
 ('rage', 4405),
 ('love', 4231),
 ('3', 3902),
 ('creed', 3691),
 ('time', 3661),
 ('better', 3426),
 ('got', 3340),
 ('know', 3134),
 ('great', 3069),
 ('think', 2946),
 ('best', 2844),
 ('want', 2779),
 ('wait', 2778),
 ('story', 2742),
 ('new', 2732),
 ('1', 2727),
 ('ve', 2553),
 ('people', 2523),
 ('hope', 2492),
 ('played', 2473),
 ('ac', 2467),
 ('doom', 2325),
 ('look', 2298),
 ('trailer', 2253),
 ('favorite', 2220),
 ('mobile', 2218),
 ('fun', 2164),
 ('need', 2122),
 ('assassin', 2118),
 ('way', 2092),
 ('playing', 2077),
 ('going', 2061),
 ('lol', 2050),
 ('actually', 2022),
 ('shit', 2009),
 ('ll', 1919),
 ('buy', 1892),
 ('gameplay', 1873),
 ('4', 1836),
 ('mad', 1824),
 ('d', 1821),
 ('switch', 1815),
 ('max', 1787),
 ('bad', 1770),
 ('bethesda', 1761),
 ('characters', 1760),
 ('remastered', 

In [85]:
# count_A
# count_B
# count_C

# Word2Vec

In [53]:
sentence_A = [[g] + s for g, s in zip(gameName_A, comments_A)]
sentence_B = [[g] + s for g, s in zip(gameName_B, comments_B)]
sentence_C = [[g] + s for g, s in zip(gameName_C, comments_C)]

In [55]:
len(comments_A), len(comments_B), len(comments_C) 

(94181, 52811, 197473)

In [56]:
len(sentence_A), len(sentence_B), len(sentence_C) 

(94181, 52811, 197473)

In [58]:
embedding_model_A = Word2Vec(sentence_A, size=100,
                           window=10, min_count=20, workers=16, iter=3, sg=1, seed=777)
embedding_model_B = Word2Vec(sentence_B, size=100,
                           window=10, min_count=20, workers=16, iter=3, sg=1, seed=777)
embedding_model_C = Word2Vec(sentence_C, size=100,
                           window=10, min_count=20, workers=16, iter=3, sg=1, seed=777)

In [64]:
len(embedding_model_A.wv.vocab.keys()), len(embedding_model_B.wv.vocab.keys()), len(embedding_model_C.wv.vocab.keys())

(4165, 2978, 5969)

In [66]:
tsne = TSNE(n_components=2, random_state=777)

In [77]:
coord_A = tsne.fit_transform(embedding_model_A.wv[embedding_model_A.wv.vocab.keys()])
coord_B = tsne.fit_transform(embedding_model_B.wv[embedding_model_B.wv.vocab.keys()])
coord_C = tsne.fit_transform(embedding_model_C.wv[embedding_model_C.wv.vocab.keys()])

In [90]:
def coord2df(coord, emb, gameName):
    all_target= [x for x in emb.wv.vocab.keys()]
    tsne_coord = pd.concat([pd.DataFrame(coord), pd.Series(all_target)], axis=1)
    tsne_coord.columns = ['x_coord', 'y_coord', 'game']
    
    label = []
    alpha = []
    names = np.unique(gameName)
    for x in emb.wv.vocab.keys():
        if x in names:
            if x in success:
                label.append("green")
                alpha.append(1)
            else:
                label.append("red")
                alpha.append(1)
        else:
            label.append("gray")
            alpha.append(0.3)
    
    tsne_coord['color'] = label
    tsne_coord['alpha'] = alpha
    
    return tsne_coord

In [87]:
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import HoverTool, ColumnDataSource, value

output_notebook()

In [95]:
plot_data_A = ColumnDataSource(coord2df(coord_A, embedding_model_A, gameName_A))
plot_data_B = ColumnDataSource(coord2df(coord_B, embedding_model_B, gameName_B))
plot_data_C = ColumnDataSource(coord2df(coord_C, embedding_model_C, gameName_C))

In [108]:
def plot_tsne(plot_data, filename):
    tsne_plot = figure(title='t-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   active_scroll='wheel_zoom'
                  )

    tsne_plot.add_tools( HoverTool(tooltips = '@game') )
    tsne_plot.circle('x_coord', 'y_coord', source=plot_data, 
                     size=10, fill_color='color', fill_alpha='alpha', line_color=None)

    # adjust visual elements of the plot
    tsne_plot.title.text_font_size = value('16pt')
    tsne_plot.xaxis.visible = False
    tsne_plot.yaxis.visible = False
    tsne_plot.grid.grid_line_color = None
    tsne_plot.outline_line_color = None

    # show time!
    output_file( filename + ".html")
    show(tsne_plot)

In [109]:
plot_tsne(plot_data_A, "group_A")
plot_tsne(plot_data_B, "group_B")
plot_tsne(plot_data_C, "group_C")