In [26]:
import nltk
import spacy
from collections import Counter
from database.pymysql_conn import DataBase
from tqdm import tqdm

import pickle

tqdm.pandas()

In [6]:
db = DataBase()

In [7]:
SQL="""
SELECT 
    T1.appid,
    T1.avg_player_count,
    T1.gameName,
    T1.release_date,
    T2.publishedAt,
    T2.text,    
    DATEDIFF(T2.publishedAt, T1.release_date) as datediff
FROM
    (SELECT 
        A.appid, A.gameName, A.avg_player_count, B.release_date
    FROM
        (SELECT 
        *
    FROM
        yt.games) A
    JOIN (SELECT 
        appid, name, MAX(release_date) AS release_date
    FROM
        oasis.app_info2
    GROUP BY appid) AS B ON A.appid = B.appid) T1
        LEFT JOIN
    (SELECT 
        appid, gameName, text, publishedAt
    FROM
        steam.yt_comment
    WHERE
        filter = 0 AND language = 'en') T2 ON T1.appid = T2.appid
WHERE
    DATEDIFF(T2.publishedAt, T1.release_date) <= 300
"""

In [8]:
df = db.to_df(SQL)

In [9]:
spacy_en = spacy.load("en")

def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [10]:
line = ' '.join(tokenize(df['text'][0]))

In [11]:
df['text'][0]

"Oh trust me it got boring soo fast when I first seen it. Ill never buy the 3rd game especially  The DLC. The first and second DOAX is much wayy better because not only it got soo much activities it's what Beach Vacation all about I wish the Nostoligia Bikinis come back and all the other girls in the game. I understand because they want to make a lewd waifu game which it already has been in the last two. I thought the 3rd game going to give us more characters and activities and tournaments for volleyball but got very back to the basic with zoom the titties and lolis."

In [12]:
lines = line
# function to test if something is a noun
is_noun = lambda pos: 'NN' in pos[:2]
is_adj = lambda pos: pos[:2] == 'JJ'
# do the nlp stuff
tokenized = nltk.word_tokenize(lines)
nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] 
adjectives = [word for (word, pos) in nltk.pos_tag(tokenized) if is_adj(pos)]

In [13]:
is_noun = lambda pos: pos[:2] == 'NN'
is_adj = lambda pos: pos[:2] == 'JJ'

def pos_tagging_noun(text):
    tokenized = nltk.word_tokenize(text)
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] 
    return nouns

def pos_tagging_adj(text):
    tokenized = nltk.word_tokenize(text)
    adjectives = [word for (word, pos) in nltk.pos_tag(tokenized) if is_adj(pos)]
    return adjectives

In [19]:
def get_noun_count(df):
    
    nouns = df['text'].progress_apply(pos_tagging_noun)
    unpacked = [word for sent in nouns for word in sent]
    
    noun_cnt = Counter(unpacked)
    
    return noun_cnt

def get_adj_count(df):
    
    adjs = df['text'].progress_apply(pos_tagging_adj)
    unpacked = [word for sent in adjs for word in sent]
    
    adj_cnt = Counter(unpacked)
    
    return adj_cnt

In [20]:
noun_cnt = get_noun_count(df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 436056/436056 [12:27<00:00, 583.27it/s]


In [21]:
adj_cnt = get_adj_count(df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 436056/436056 [12:27<00:00, 583.56it/s]


In [23]:
noun_cnt.most_common()

[('game', 72109),
 ('i', 20825),
 ('video', 19369),
 ('time', 14999),
 ('Mark', 14805),
 ('games', 13634),
 ('’', 12383),
 ('t', 9683),
 ('people', 9303),
 ('Jack', 8732),
 ('thing', 8053),
 ('way', 7781),
 ('Me', 7307),
 ('videos', 6638),
 ('FNAF', 6482),
 ('one', 6446),
 ('something', 6412),
 ('Freddy', 6265),
 ('man', 6042),
 ('series', 5364),
 ('mark', 5138),
 ('lot', 5118),
 ('day', 5071),
 ('VR', 5044),
 ('life', 5011),
 ('channel', 4968),
 ('story', 4867),
 ('lol', 4862),
 ('fnaf', 4739),
 ('part', 4713),
 ('please', 4706),
 ('guy', 4682),
 ('THE', 4645),
 ('s', 4438),
 ('play', 4375),
 ('Hey', 4301),
 ('someone', 4269),
 ('things', 4250),
 ('Dan', 4243),
 ('night', 4192),
 ('anyone', 4170),
 ('”', 4101),
 ('end', 4087),
 ('years', 4082),
 ('shit', 3943),
 ('world', 3845),
 ('character', 3813),
 ('fun', 3687),
 ('Bonnie', 3569),
 ('“', 3521),
 ('version', 3467),
 ('voice', 3462),
 ('gameplay', 3351),
 ('music', 3338),
 ('jack', 3322),
 ('work', 3320),
 ('Thank', 3316),
 ('name',

In [24]:
adj_cnt.most_common()

[('good', 15712),
 ('more', 13533),
 ('first', 10761),
 ('i', 9339),
 ('new', 8594),
 ('much', 8385),
 ('best', 8292),
 ('great', 7762),
 ('’', 7337),
 ('other', 6726),
 ('same', 6070),
 ('u', 5999),
 ('bad', 5522),
 ('better', 5262),
 ('s', 4723),
 ('favorite', 4708),
 ('many', 4688),
 ('last', 4423),
 ('old', 4407),
 ('real', 4080),
 ('little', 4024),
 ('next', 3883),
 ('hard', 3870),
 ('only', 3857),
 ('original', 3500),
 ('happy', 3447),
 ('scary', 3262),
 ('cool', 3210),
 ('sure', 3208),
 ('awesome', 3095),
 ('nice', 3093),
 ('m', 3076),
 ('whole', 3033),
 ('funny', 3033),
 ('different', 2876),
 ('fnaf', 2808),
 ('big', 2755),
 ('amazing', 2668),
 ('full', 2505),
 ('few', 2414),
 ('least', 2401),
 ('long', 2320),
 ('t', 2242),
 ('free', 2216),
 ('second', 2214),
 ('right', 2171),
 ('purple', 2094),
 ('wrong', 2075),
 ('black', 2000),
 ('ur', 1986),
 ('main', 1907),
 ('most', 1866),
 ('own', 1859),
 ('such', 1826),
 ('able', 1775),
 ('dead', 1673),
 ('early', 1652),
 ('Good', 1634)

In [27]:
# with open('noun.pickle', 'wb') as f:
#     pickle.dump(noun_cnt, f, pickle.HIGHEST_PROTOCOL)

In [28]:
# with open('adj.pickle', 'wb') as f:
#     pickle.dump(adj_cnt, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# with open('noun.pickle', 'rb') as f:
#     data = pickle.load(f)