In [1]:
from database.pymysql_conn import DataBase
from util.text import clean_text

import pandas as pd
import numpy as np
import re
import nltk
import pickle

from collections import Counter
from pathlib import Path
from nltk.corpus import stopwords
from scipy.stats import entropy

from typing import Dict

In [2]:
# Project-local database handle; the query cell below calls db.to_df(...) on it.
db = DataBase()

## from DB

In [31]:
# df_success = db.to_df("""
# SELECT * FROM yt_comment where appid in (SELECT appid FROM get_new_games where avg_player_count > 1000) and filter=0 and language='en';
# """)

# df_fail = db.to_df("""
# SELECT * FROM yt_comment where appid in (SELECT appid FROM get_new_games where avg_player_count < 1000) and filter=0 and language='en';
# """)

# Fetch every `yt_comment` row, left-joined with first_date and
# avg_player_count from `get_new_games` on appid.
# Both sub-selects expose `appid`, so the joined result is expected to carry
# that column twice.
# NOTE(review): unlike the commented-out queries above, this query applies no
# `filter=0` / `language='en'` restriction — confirm that is intended.
df_all = db.to_df("""
SELECT * FROM
(SELECT * FROM yt_comment)A
LEFT JOIN	
(SELECT appid, first_date, avg_player_count FROM get_new_games)B ON A.appid = B.appid;
""")

## to PKL

In [3]:
# Relative directory that holds the pickled cache of the query result.
path = Path('cached')

In [35]:
# Persist the joined frame so later runs can skip the database round-trip.
# Create the cache directory on demand; without this a fresh checkout fails
# with FileNotFoundError before anything is written.
path.mkdir(parents=True, exist_ok=True)
# NOTE(review): "commnet" is a typo, but the load cell reads the same file
# name, so it is kept unchanged here.
# Plain 'wb' is enough: the handle is only written to, never read back.
with open(path/'df_all_yt_commnet.pkl', 'wb') as f:
    pickle.dump(df_all, f, pickle.HIGHEST_PROTOCOL)

## from PKL

In [4]:
# Restore the cached frame from disk.
# pickle can execute arbitrary code while loading, so this file must only
# ever be one that this notebook wrote itself.
cache_file = path / 'df_all_yt_commnet.pkl'
with cache_file.open('rb') as fh:
    df_all = pickle.load(fh)

## filtering

In [5]:
# Remove duplicate columns: when a column label occurs more than once, only
# its first occurrence is kept.
first_occurrence = ~df_all.columns.duplicated()
df_all = df_all.loc[:, first_occurrence]

In [6]:
# Keep only comments published before the game's first_date (pre-release).
# Rows for which the comparison is not True are discarded.
posted_before_release = df_all['publishedAt'] < df_all['first_date']
df_all = df_all[posted_before_release]

In [7]:
# Keep only comments whose likeCount is strictly greater than 1.
# NOTE(review): the original note read "1 or more", which would be `>= 1` —
# confirm which threshold is actually intended.
has_enough_likes = df_all['likeCount'].gt(1)
df_all = df_all.loc[has_enough_likes]

In [8]:
# Exclude a single outlier title from the analysis.
# 732690 : FIVE NIGHTS AT FREDDYS VR HELP WANTED
OUTLIER_APPID = 732690
# A boolean mask removes exactly the matching rows. Dropping by index label
# would also remove unrelated rows whenever index labels repeat.
df_all = df_all[df_all['appid'] != OUTLIER_APPID]

### split data

In [9]:
# Split rows by the game's average player count. Rows whose value lies in
# [500, 5000] (or is missing) end up in neither frame.
avg_players = df_all['avg_player_count']
df_success = df_all.loc[avg_players.gt(5000)]
df_fail = df_all.loc[avg_players.lt(500)]

In [10]:
# Empty placeholder dict.
# NOTE(review): not referenced by any of the cells visible here — confirm it
# is still needed.
df_games = {}

In [11]:
# Build one cleaned, space-joined text blob per game.
# A single groupby pass replaces re-filtering the whole frame once per game,
# which cost O(games x rows). sort=False keeps games in first-appearance order,
# matching the order .unique() produced.
# Rows with a missing gameName are skipped; they previously yielded a key with
# an empty string.
text_all = {
    game_name: ' '.join(clean_text(comment) for comment in comments)
    for game_name, comments in df_all.groupby('gameName', sort=False)['text']
}

In [12]:
# Tokenise each game's text and drop English stop words.
stw = stopwords.words('english')
# `stw` stays a list for any other cell that uses it. Lookups go through a
# frozenset so each membership test is O(1) instead of scanning the list.
stw_lookup = frozenset(stw)
token_all = {
    game: [tok for tok in nltk.word_tokenize(text) if tok not in stw_lookup]
    for game, text in text_all.items()
}

In [13]:
# Per-game token frequencies as plain dicts, most frequent token first.
# Tokens with equal counts keep the order in which they were first seen.
token_cnt_all = {
    game: dict(Counter(tokens).most_common())
    for game, tokens in token_all.items()
}

## tokenizing

In [14]:
# Values of the `text` column for the success group, as a plain Python list.
raw_text_success = df_success['text'].tolist()

In [15]:
# Clean every success-group comment and join them into one space-separated blob.
text_success = ' '.join(map(clean_text, raw_text_success))

In [16]:
# Values of the `text` column for the fail group, as a plain Python list.
raw_text_fail = df_fail['text'].tolist()

In [17]:
# Clean every fail-group comment and join them into one space-separated blob.
text_fail = ' '.join(map(clean_text, raw_text_fail))

In [18]:
# Tokenise the two corpora and drop English stop words.
stw = stopwords.words('english')
# `stw` stays a list for any other cell that uses it. Lookups go through a
# frozenset so each membership test is O(1) instead of scanning the list.
stw_lookup = frozenset(stw)

token_success = [w for w in nltk.word_tokenize(text_success) if w not in stw_lookup]

token_fail = [w for w in nltk.word_tokenize(text_fail) if w not in stw_lookup]

In [19]:
# Token frequencies per corpus as plain dicts, most frequent token first.
# Tokens with equal counts keep the order in which they were first seen.
token_cnt_success = dict(Counter(token_success).most_common())
token_cnt_fail = dict(Counter(token_fail).most_common())

## entropy

In [20]:
# Vocabulary seen in either corpus. Sorting gives a stable order across runs;
# plain set iteration order changes with string-hash randomisation, which made
# the tie order of later results differ from run to run.
word_list = sorted(token_cnt_success.keys() | token_cnt_fail.keys())

In [21]:
# For each sufficiently frequent word, record the base-2 entropy of its
# (success, fail) counts and the side on which it occurs more often.
# Low entropy means the word is concentrated on one side.
MIN_WORD_COUNT = 20  # a word must exceed this count on at least one side

entropy_list = {}
for word in word_list:
    n_success = token_cnt_success.get(word, 0)
    n_fail = token_cnt_fail.get(word, 0)
    if n_success <= MIN_WORD_COUNT and n_fail <= MIN_WORD_COUNT:
        continue
    # Computed once and reused for both the filter and the stored value.
    word_entropy = entropy([n_success, n_fail], base=2)
    # Entropy 0.0 means the word occurs on one side only; those are skipped.
    if word_entropy > 0.0:
        # Ties are labelled "fail", because the comparison is strict.
        side = "success" if n_success > n_fail else "fail"
        entropy_list[word] = (word_entropy, side)

In [22]:
# Show the words ordered by ascending (entropy, side), i.e. the most
# one-sided words first.
dict(sorted(entropy_list.items(), key=lambda pair: pair[1]))

{'civilization': (0.02427829445785703, 'fail'),
 'history': (0.0252118658929347, 'fail'),
 'civ': (0.03445517348707714, 'fail'),
 'ii': (0.04022215086538087, 'fail'),
 'french': (0.04367566967447157, 'fail'),
 'dark': (0.04541469233379411, 'fail'),
 'pewdiepie': (0.04762494212903133, 'fail'),
 'queen': (0.048931398305011205, 'fail'),
 'black': (0.049614073383553024, 'fail'),
 'france': (0.050436967245965525, 'fail'),
 'girl': (0.050556933576743124, 'fail'),
 'elite': (0.05230353671513271, 'fail'),
 'v': (0.054185698091702955, 'fail'),
 'peace': (0.05503793508320906, 'fail'),
 'vi': (0.05622035839027107, 'fail'),
 'star': (0.06634397526831169, 'fail'),
 'fix': (0.06941016793921241, 'fail'),
 'industrial': (0.07050175960541472, 'fail'),
 'de': (0.07050175960541472, 'fail'),
 'ps2': (0.07201647168541384, 'fail'),
 'sniper': (0.0741494835779208, 'fail'),
 'e': (0.07470433715479614, 'fail'),
 'ai': (0.07498515348038796, 'fail'),
 'anno': (0.0755537086576868, 'success'),
 'que': (0.076131690

In [23]:
def get_entropy(word: str, token_a: Dict, token_b: Dict):
    """Return the counts of `word` on each side and the entropy of that split.

    Args:
        word: Token to look up.
        token_a: Mapping of token -> count for the first corpus.
        token_b: Mapping of token -> count for the second corpus.

    Returns:
        A tuple ``(cnt_a, cnt_b, ent)``. A token missing from a mapping counts
        as 0. ``ent`` is the base-2 entropy of the two counts: 0.0 when the
        word occurs on one side only, 1.0 when it is evenly split. When the
        word occurs on neither side the entropy is undefined and ``nan`` is
        returned.
    """
    cnt_a, cnt_b = token_a.get(word, 0), token_b.get(word, 0)
    if cnt_a == 0 and cnt_b == 0:
        # scipy would normalise by a zero sum here, giving nan together with a
        # RuntimeWarning; return the same nan without the warning.
        return cnt_a, cnt_b, float("nan")
    ent = entropy([cnt_a, cnt_b], base=2)
    return cnt_a, cnt_b, ent

In [24]:
def get_count(word, token_cnt):
    """Print, for every entry of `token_cnt` that contains `word`, its key and count.

    Args:
        word: Token to look up.
        token_cnt: Mapping of name -> (mapping of token -> count).

    Returns:
        None. One line ``"<name> <count>"`` is printed per matching entry, in
        the iteration order of `token_cnt`.
    """
    matches = (
        (name, counts[word])
        for name, counts in token_cnt.items()
        if word in counts
    )
    for name, hits in matches:
        print(name, hits)

In [32]:
# Spot-check one word: yields (count in success, count in fail, entropy).
get_entropy("history", token_cnt_success, token_cnt_fail)

(1, 399, 0.0252118658929347)

In [33]:
# Per-game breakdown of the same word, to see which titles account for the
# total counts.
get_count("history", token_cnt_all)

DEAD OR ALIVE Xtreme Venus Vacation 1
Assassins Creed III Remastered 9
SUPER DRAGON BALL HEROES WORLD MISSION 1
帝国与文明 369
Anno 1800 1
Embark 2
Furry Girl 3
STAR WARS Battlefront Classic 2004 5
Splitgate Arena Warfare Beta 1
Yakuza Kiwami 2 5
Battle for the Galaxy 2
RAGE 2 8
Layers of Fear 2 4
Conan Unconquered 1
missed messages 1
Quake II RTX 2
OCTOPATH TRAVELER 5
SpaceEngine 1
Project Zero Deaths 2
Monster Girl Island Prologue 1
Fantasy Girl 5
pact with a witch 1
