In [1]:
import nltk
import spacy
from collections import Counter
from database.pymysql_conn import DataBase
from tqdm import tqdm

import pickle

tqdm.pandas()

In [2]:
db = DataBase()

In [3]:
SQL="""
SELECT 
    T1.appid,
    T1.avg_player_count,
    T1.gameName,
    T1.release_date,
    T2.publishedAt,
    T2.text,    
    DATEDIFF(T2.publishedAt, T1.release_date) as datediff
FROM
    (SELECT 
        A.appid, A.gameName, A.avg_player_count, B.release_date
    FROM
        (SELECT 
        *
    FROM
        yt.games) A
    JOIN (SELECT 
        appid, name, MAX(release_date) AS release_date
    FROM
        oasis.app_info2
    GROUP BY appid) AS B ON A.appid = B.appid) T1
        LEFT JOIN
    (SELECT 
        appid, gameName, text, publishedAt
    FROM
        steam.yt_comment
    WHERE
        filter = 0 AND language = 'en') T2 ON T1.appid = T2.appid
WHERE
    DATEDIFF(T2.publishedAt, T1.release_date) <= 300
"""

In [4]:
df = db.to_df(SQL)

In [5]:
success = ["Dota Underlords",
            "Borderlands GOTY Enhanced",
            "Anno 1800",
            "F1 2019",
            "DEAD OR ALIVE Xtreme Venus Vacation",
            "RAGE 2",
            "OCTOPATH TRAVELER",
            "Lords Mobile",
            "Pro Cycling Manager 2019",
            "Ironsight",
            "Yakuza Kiwami 2", # 10
            "Pagan Online",
            "Monster Girl Island Prologue",
            "Winning Post 9",
            "Assassins Creed III Remastered",
            "Otakus Adventure",
            "SUPER DRAGON BALL HEROES WORLD MISSION",
            "AVA Dog Tag"] # 500++

In [6]:
df_success = df[df['gameName'].isin(success)]
df_fail = df[~df['gameName'].isin(success)]

In [7]:
# 성공게임 출시전 댓글
groupA = df_success[df_success['datediff'] < 0]
# 성공게임 출시후 댓글
groupB = df_success[df_success['datediff'] >= 0]
# 보통게임 출시후 댓글
groupC = df_fail[df_fail['datediff'] >=0]

In [13]:
is_noun = lambda pos: pos[:2] == 'NN'
is_adj = lambda pos: pos[:2] == 'JJ'
is_verb = lambda pos: 'VB' in pos[:2]

def pos_tagging(text, func):
    tokenized = nltk.word_tokenize(text)
    pos = [word for (word, pos) in nltk.pos_tag(tokenized) if func(pos)] 
    return pos

In [35]:
def get_counts(df, func):
    
    pos = df['text'].progress_apply(pos_tagging, args=(func,))
    unpacked = [word for sent in pos for word in sent]
    cnt = Counter(unpacked)
    return cnt

def get_noun_count(df):
    
    nouns = df['text'].progress_apply(pos_tagging_noun)
    unpacked = [word for sent in nouns for word in sent]
    
    noun_cnt = Counter(unpacked)
    
    return noun_cnt

def get_adj_count(df):
    
    adjs = df['text'].progress_apply(pos_tagging_adj)
    unpacked = [word for sent in adjs for word in sent]
    
    adj_cnt = Counter(unpacked)
    
    return adj_cnt

In [41]:
result = {}
for group, groupName in zip((groupA, groupB, groupC), ("A", "B", "C")):
    result[groupName] = {}
    for func, tag in zip((is_noun, is_adj, is_verb), ("Noun", "Adj", "Verb")):
        pos = group['text'].progress_apply(pos_tagging, args=(func,))
        unpacked = [word for sent in pos for word in sent]
        cnt = Counter(unpacked)
        result[groupName][tag] = cnt

100%|██████████████████████████████████████████████████████████████| 94181/94181 [02:48<00:00, 557.70it/s]
100%|██████████████████████████████████████████████████████████████| 94181/94181 [02:50<00:00, 552.05it/s]
100%|██████████████████████████████████████████████████████████████| 94181/94181 [02:52<00:00, 547.55it/s]
100%|██████████████████████████████████████████████████████████████| 52811/52811 [01:34<00:00, 557.79it/s]
100%|██████████████████████████████████████████████████████████████| 52811/52811 [01:35<00:00, 555.42it/s]
100%|██████████████████████████████████████████████████████████████| 52811/52811 [01:35<00:00, 555.56it/s]
100%|████████████████████████████████████████████████████████████| 197473/197473 [05:43<00:00, 574.66it/s]
100%|████████████████████████████████████████████████████████████| 197473/197473 [05:42<00:00, 576.22it/s]
100%|████████████████████████████████████████████████████████████| 197473/197473 [05:46<00:00, 570.66it/s]


In [42]:
# with open('pos_tag_result.pickle', 'wb') as f:
#     pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)



In [51]:
with open('pos_tag_result.pickle', 'rb') as f:
    result = pickle.load(f)

In [44]:
result.keys()

dict_keys(['A', 'B', 'C'])

In [45]:
result['A'].keys()

dict_keys(['Noun', 'Adj', 'Verb'])

In [48]:
result['A']['Noun'].most_common(10)

[('game', 26111),
 ('games', 4696),
 ('i', 4350),
 ('video', 4180),
 ('time', 3423),
 ('story', 2585),
 ('people', 2297),
 ('’', 2182),
 ('trailer', 2001),
 ('way', 1961)]

In [49]:
result['B']['Noun'].most_common(10)

[('game', 11811),
 ('i', 2836),
 ('video', 2187),
 ('games', 2041),
 ('time', 1852),
 ('channel', 1736),
 ('’', 1319),
 ('way', 1164),
 ('t', 1097),
 ('series', 992)]

In [50]:
result['C']['Noun'].most_common(10)

[('game', 22391),
 ('Mark', 11497),
 ('i', 9726),
 ('video', 9050),
 ('time', 6911),
 ('’', 6539),
 ('Jack', 5729),
 ('t', 5105),
 ('Me', 5079),
 ('people', 4843)]