In [27]:
import nltk
import spacy
from collections import Counter
from database.pymysql_conn import DataBase
from tqdm import tqdm
from gensim.parsing.preprocessing import remove_stopwords
from pattern.text.en import singularize
import pandas as pd

import pickle

tqdm.pandas()

In [8]:
db = DataBase()

In [9]:
SQL="""
SELECT 
    T1.appid,
    T1.avg_player_count,
    T1.gameName,
    T1.release_date,
    T2.publishedAt,
    T2.text,    
    DATEDIFF(T2.publishedAt, T1.release_date) as datediff
FROM
    (SELECT 
        A.appid, A.gameName, A.avg_player_count, B.release_date
    FROM
        (SELECT 
        *
    FROM
        yt.games) A
    JOIN (SELECT 
        appid, name, MAX(release_date) AS release_date
    FROM
        oasis.app_info2
    GROUP BY appid) AS B ON A.appid = B.appid) T1
        LEFT JOIN
    (SELECT 
        appid, gameName, text, publishedAt
    FROM
        steam.yt_comment
    WHERE
        filter = 0 AND language = 'en') T2 ON T1.appid = T2.appid
WHERE
    DATEDIFF(T2.publishedAt, T1.release_date) <= 300
"""

In [10]:
df = db.to_df(SQL)

In [11]:
success = ["Dota Underlords",
            "Borderlands GOTY Enhanced",
            "Anno 1800",
            "F1 2019",
            "DEAD OR ALIVE Xtreme Venus Vacation",
            "RAGE 2",
            "OCTOPATH TRAVELER",
            "Lords Mobile",
            "Pro Cycling Manager 2019",
            "Ironsight",
            "Yakuza Kiwami 2", # 10
            "Pagan Online",
            "Monster Girl Island Prologue",
            "Winning Post 9",
            "Assassins Creed III Remastered",
            "Otakus Adventure",
            "SUPER DRAGON BALL HEROES WORLD MISSION",
            "AVA Dog Tag"] # 500++

In [12]:
df_success = df[df['gameName'].isin(success)]
df_fail = df[~df['gameName'].isin(success)]

In [13]:
# 성공게임 출시전 댓글
groupA = df_success[df_success['datediff'] < 0]
# 성공게임 출시후 댓글
groupB = df_success[df_success['datediff'] >= 0]
# 보통게임 출시후 댓글
groupC = df_fail[df_fail['datediff'] >=0]

In [14]:
# for Pandas
def normalize_text(text):
    text = text.str.lower() # lowercase
    text = text.str.replace(r"\#","") # replaces hashtags
    text = text.str.replace(r"http\S+","URL")  # remove URL addresses
    text = text.str.replace(r"[^A-Za-z0-9()!?\'\`\"]", " ")
    text = text.str.replace("\s{2,}", " ")
    return text

In [15]:
def text_cleaning_pipeline(df):
    text = df['text']
    text = normalize_text(text)
    

In [16]:
spacy_en = spacy.load("en")

def tokenize(text):
    # text = normalize_text(text)
    # text = remove_stopwords(text)
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [17]:
def token_singularize(tokens):
    return [singularize(x) for x in tokens]

In [18]:
def remove_single_word(tokens):
    return [x for x in tokens if len(x) > 1]

In [19]:
is_noun = lambda pos: 'NN' in pos[:2]
is_adj = lambda pos: 'JJ' in pos[:2]
is_verb = lambda pos: 'VB' in pos[:2]

def pos_tagging(text, func):
    tokenized = tokenize(text)
    pos = [word for (word, pos) in nltk.pos_tag(tokenized) if func(pos)] 
    return pos

In [20]:
def pos_tagging_test(text, func):
    # tokenized = nltk.word_tokenize(text)
    tokenized = tokenize(text)
    
    pos = [(word, pos) for (word, pos) in nltk.pos_tag(tokenized) if func(pos)] 
    return pos

In [21]:
def get_counts(df, func):
    
    pos = df['text'].progress_apply(pos_tagging, args=(func,))
    unpacked = [word for sent in pos for word in sent]
    cnt = Counter(unpacked)
    return cnt

# TEST

In [22]:
test_group = groupA[:100]

In [17]:
test_input = normalize_text(test_group['text'])
pos = test_input.progress_apply(pos_tagging, args=(is_noun,))

100%|██████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 364.88it/s]


In [18]:
pos = pos.progress_apply(token_singularize)

100%|█████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 6665.24it/s]


In [19]:
pos = pos.progress_apply(remove_single_word)

100%|████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 99983.41it/s]


In [20]:
unpacked = [word for sent in pos for word in sent]

In [21]:
test_group['token'] = pos

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
test_group

Unnamed: 0,appid,avg_player_count,gameName,release_date,publishedAt,text,datediff,token
47,958260,2151.9409,DEAD OR ALIVE Xtreme Venus Vacation,2019-03-26,2019-02-10 05:30:53,"I see, you are an AI of culture as well",-44,"[ai, culture]"
48,958260,2151.9409,DEAD OR ALIVE Xtreme Venus Vacation,2019-03-26,2018-12-26 19:25:53,Perv AI-chan is the best!,-90,"[perv, ai, chan]"
49,958260,2151.9409,DEAD OR ALIVE Xtreme Venus Vacation,2019-03-26,2018-11-15 23:00:43,"8:05 Oh, she's so sweet. If I were twenty year...",-131,[year]
50,958260,2151.9409,DEAD OR ALIVE Xtreme Venus Vacation,2019-03-26,2018-11-10 17:58:37,Why are the Japanese so obsessed with boobs? D...,-136,"[japanese, boob, as, way]"
51,958260,2151.9409,DEAD OR ALIVE Xtreme Venus Vacation,2019-03-26,2018-10-25 17:13:17,Why a cute girl plays other sexy girls looks c...,-152,"[cute, girl, girl, cuter, man, man, unfair]"
52,958260,2151.9409,DEAD OR ALIVE Xtreme Venus Vacation,2019-03-26,2018-10-18 16:58:47,Marie is just the cutest.,-159,"[marie, cutest]"
53,958260,2151.9409,DEAD OR ALIVE Xtreme Venus Vacation,2019-03-26,2018-10-15 22:07:08,6:30 Ai-chan’s predator face,-162,"[ai, chan, predator, face]"
54,958260,2151.9409,DEAD OR ALIVE Xtreme Venus Vacation,2019-03-26,2018-10-12 12:38:57,I'm having a hard time reading the subtitles w...,-165,"[time, subtitle, eye, tiddy]"
55,958260,2151.9409,DEAD OR ALIVE Xtreme Venus Vacation,2019-03-26,2018-10-10 12:55:07,Might as well play an illusion ero game Its t...,-167,"[illusion, ero, game, stuff, lewd]"
56,958260,2151.9409,DEAD OR ALIVE Xtreme Venus Vacation,2019-03-26,2018-10-08 03:47:56,"Boops tiddy. ""Don't sue me."" I love this channel.",-169,"[boop, channel]"


In [49]:
for group, groupName in zip((groupA[:10], groupB[:10], groupC[:10]), ("A", "B", "C")):
    print(groupName, "--------------------", flush=True)
    tokens=[]
    noun_tok = []
    adj_tok =[]
    verb_tok =[]
    for func, tag in zip((is_noun, is_adj, is_verb), ("Noun", "Adj", "Verb")):
        # 토큰화
        pos = group['text'].progress_apply(pos_tagging, args=(func,))
        # 단수화
        pos = pos.progress_apply(token_singularize)
        # 잘못된(1글자) 단어 제거
        pos = pos.progress_apply(remove_single_word)
        
        token = pos.to_list()
        
        # print(token)
        
        if tag == "Noun":
            noun_tok = token
        
        if tag == "Adj":
            adj_tok = token
        
        if tag == "Verb":
            verb_tok = token
    
    for a, b, c in zip(noun_tok, adj_tok, verb_tok):
        s = a + b + c
        tokens.append(s)
    
    print(len(tokens))
    
    group['token'] = tokens

A --------------------


100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 526.39it/s]
100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 3337.82it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 526.20it/s]
100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9995.96it/s]
100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9998.34it/s]
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 499.88it/s]
100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 4997.38it/s]
100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9998.34it/s]

10
B --------------------



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 208.10it/s]
100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 1999.67it/s]
100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9998.34it/s]
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 302.96it/s]
100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 4999.17it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 302.97it/s]
100%

10
C --------------------


100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 293.39it/s]
100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 3333.05it/s]
100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9998.34it/s]
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 454.43it/s]
100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 4997.98it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 454.45it/s]
100%|███████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 4999.17it/s]
100%|██████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 10000.72it/s]


10


# df

In [51]:
for group, groupName in zip((groupA, groupB, groupC), ("A", "B", "C")):
    print(groupName, "--------------------", flush=True)
    tokens=[]
    noun_tok = []
    adj_tok =[]
    verb_tok =[]
    for func, tag in zip((is_noun, is_adj, is_verb), ("Noun", "Adj", "Verb")):
        # 토큰화
        pos = group['text'].progress_apply(pos_tagging, args=(func,))
        # 단수화
        pos = pos.progress_apply(token_singularize)
        # 잘못된(1글자) 단어 제거
        pos = pos.progress_apply(remove_single_word)
        
        token = pos.to_list()
        
        # print(token)
        
        if tag == "Noun":
            noun_tok = token
        
        if tag == "Adj":
            adj_tok = token
        
        if tag == "Verb":
            verb_tok = token
    
    for a, b, c in zip(noun_tok, adj_tok, verb_tok):
        s = a + b + c
        tokens.append(s)
    
    print(len(tokens))
    
    group['token'] = tokens

A --------------------


100%|██████████████████████████████████████████████████████████████| 94181/94181 [03:00<00:00, 522.99it/s]
100%|█████████████████████████████████████████████████████████████| 94181/94181 [00:21<00:00, 4458.23it/s]
100%|███████████████████████████████████████████████████████████| 94181/94181 [00:00<00:00, 433560.32it/s]
100%|██████████████████████████████████████████████████████████████| 94181/94181 [03:00<00:00, 520.62it/s]
100%|████████████████████████████████████████████████████████████| 94181/94181 [00:06<00:00, 15345.35it/s]
100%|███████████████████████████████████████████████████████████| 94181/94181 [00:00<00:00, 297290.64it/s]
100%|██████████████████████████████████████████████████████████████| 94181/94181 [03:01<00:00, 519.63it/s]
100%|█████████████████████████████████████████████████████████████| 94181/94181 [00:12<00:00, 7332.36it/s]
100%|███████████████████████████████████████████████████████████| 94181/94181 [00:00<00:00, 490415.43it/s]


94181
B --------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████████████████████████████████████████████████████████| 52811/52811 [01:45<00:00, 500.44it/s]
100%|█████████████████████████████████████████████████████████████| 52811/52811 [00:11<00:00, 4564.00it/s]
100%|███████████████████████████████████████████████████████████| 52811/52811 [00:00<00:00, 382601.78it/s]
100%|██████████████████████████████████████████████████████████████| 52811/52811 [01:45<00:00, 498.88it/s]
100%|████████████████████████████████████████████████████████████| 52811/52811 [00:03<00:00, 14919.20it/s]
100%|███████████████████████████████████████████████████████████| 52811/52811 [00:00<00:00, 502852.20it/s]
100%|██████████████████████████████████████████████████████████████| 52811/52811 [01:45<00:00, 499.49it/s]
100%|

52811
C --------------------


100%|████████████████████████████████████████████████████████████| 197473/197473 [06:36<00:00, 497.85it/s]
100%|███████████████████████████████████████████████████████████| 197473/197473 [00:42<00:00, 4694.34it/s]
100%|█████████████████████████████████████████████████████████| 197473/197473 [00:00<00:00, 280836.78it/s]
100%|████████████████████████████████████████████████████████████| 197473/197473 [06:36<00:00, 498.24it/s]
100%|██████████████████████████████████████████████████████████| 197473/197473 [00:11<00:00, 17818.26it/s]
100%|█████████████████████████████████████████████████████████| 197473/197473 [00:00<00:00, 565698.11it/s]
100%|████████████████████████████████████████████████████████████| 197473/197473 [06:34<00:00, 500.42it/s]
100%|███████████████████████████████████████████████████████████| 197473/197473 [00:25<00:00, 7839.12it/s]
100%|█████████████████████████████████████████████████████████| 197473/197473 [00:00<00:00, 287376.77it/s]


197473


In [None]:
def word_contained(tokens, word):
    
    for token in tokens:
        if token == word:
            return True
    return False

In [None]:
def get_full_text_with_word(group, word):
    return group[group["token"].progress_apply(word_contained, args=(word,))]

In [52]:
with open('groupA.pickle', 'wb') as f:
    pickle.dump(groupA, f, pickle.HIGHEST_PROTOCOL)

with open('groupB.pickle', 'wb') as f:
    pickle.dump(groupB, f, pickle.HIGHEST_PROTOCOL)

with open('groupC.pickle', 'wb') as f:
    pickle.dump(groupC, f, pickle.HIGHEST_PROTOCOL)

In [2]:
# with open('groupA.pickle', 'rb') as f:
#     groupA = pickle.load(f)
    
# with open('groupB.pickle', 'rb') as f:
#     groupB = pickle.load(f)
    
# with open('groupC.pickle', 'rb') as f:
#     groupC = pickle.load(f)

In [7]:
get_full_text_with_word(groupB, "channel")

100%|███████████████████████████████████████████████████████████| 52811/52811 [00:00<00:00, 584797.20it/s]


Unnamed: 0,appid,avg_player_count,gameName,release_date,publishedAt,text,datediff,token
2806,911400,694.1605,Assassins Creed III Remastered,2019-03-30,2019-03-31 06:01:45,"Wow, huge throwback to when I found you channe...",1,"[found, channel, remember, talking, having, ge..."
3484,911400,694.1605,Assassins Creed III Remastered,2019-03-30,2019-03-30 12:46:04,Assassins Creed origins because I love ancient...,0,"[Creed, love, found, channel]"
3930,911400,694.1605,Assassins Creed III Remastered,2019-03-30,2019-03-30 00:45:50,Oh man the nostalgia ... I remember i found yo...,0,"[remember, found, channel, were, playing]"
84195,548570,2020.519,RAGE 2,2019-05-14,2019-05-19 20:23:25,i found you channel in 2012 i lovve your vids ...,5,"[found, channel, lovve, want]"
84275,548570,2020.519,RAGE 2,2019-05-14,2019-05-18 17:50:03,I found you channel when you did a play throug...,4,"[found, channel, did, made, fall]"
84331,548570,2020.519,RAGE 2,2019-05-14,2019-05-18 08:11:26,Hi Brad! I found you channel in the end of 201...,4,"[found, channel, love, get]"
84386,548570,2020.519,RAGE 2,2019-05-14,2019-05-17 17:52:07,Found ya channel long before your voice got de...,3,"[channel, got, Send]"
84525,548570,2020.519,RAGE 2,2019-05-14,2019-05-16 12:06:53,"Literally only just found ya channel haha, go...",2,"[found, channel]"
84616,548570,2020.519,RAGE 2,2019-05-14,2019-05-15 22:33:44,I’m pretty sure the first vid I ever watched o...,1,"[’m, watched, channel, wa, ’ve, been, watching]"
84640,548570,2020.519,RAGE 2,2019-05-14,2019-05-15 20:06:33,Found channel a year ago,1,[channel]


# Counter

In [82]:
result = {}
for group, groupName in zip((groupA, groupB, groupC), ("A", "B", "C")):
    print(groupName, "--------------------")
    result[groupName] = {}
    for func, tag in zip((is_noun, is_adj, is_verb), ("Noun", "Adj", "Verb")):
        # 토큰화
        pos = group['text'].progress_apply(pos_tagging, args=(func,))
        # 단수화
        pos = pos.progress_apply(token_singularize)
        # 잘못된(1글자) 단어 제거
        pos = pos.progress_apply(remove_single_word)
        
        unpacked = [word for sent in pos for word in sent]
        cnt = Counter(unpacked)
        result[groupName][tag] = cnt

  0%|                                                                 | 63/94181 [00:00<02:30, 626.25it/s]

A --------------------


100%|██████████████████████████████████████████████████████████████| 94181/94181 [03:02<00:00, 515.89it/s]
100%|█████████████████████████████████████████████████████████████| 94181/94181 [00:20<00:00, 4546.69it/s]
100%|███████████████████████████████████████████████████████████| 94181/94181 [00:00<00:00, 398983.25it/s]
100%|██████████████████████████████████████████████████████████████| 94181/94181 [03:00<00:00, 522.32it/s]
100%|████████████████████████████████████████████████████████████| 94181/94181 [00:06<00:00, 14505.10it/s]
100%|███████████████████████████████████████████████████████████| 94181/94181 [00:00<00:00, 495579.25it/s]
100%|██████████████████████████████████████████████████████████████| 94181/94181 [03:00<00:00, 522.54it/s]
100%|█████████████████████████████████████████████████████████████| 94181/94181 [00:12<00:00, 7542.41it/s]
100%|███████████████████████████████████████████████████████████| 94181/94181 [00:00<00:00, 142666.62it/s]
  0%|                                

B --------------------


100%|██████████████████████████████████████████████████████████████| 52811/52811 [01:38<00:00, 534.80it/s]
100%|█████████████████████████████████████████████████████████████| 52811/52811 [00:11<00:00, 4686.25it/s]
100%|███████████████████████████████████████████████████████████| 52811/52811 [00:00<00:00, 343224.45it/s]
100%|██████████████████████████████████████████████████████████████| 52811/52811 [01:39<00:00, 532.56it/s]
100%|████████████████████████████████████████████████████████████| 52811/52811 [00:03<00:00, 16131.69it/s]
100%|███████████████████████████████████████████████████████████| 52811/52811 [00:00<00:00, 507683.57it/s]
100%|██████████████████████████████████████████████████████████████| 52811/52811 [01:40<00:00, 526.90it/s]
100%|█████████████████████████████████████████████████████████████| 52811/52811 [00:07<00:00, 7255.07it/s]
100%|███████████████████████████████████████████████████████████| 52811/52811 [00:00<00:00, 415742.24it/s]
  0%|                                

C --------------------


100%|████████████████████████████████████████████████████████████| 197473/197473 [06:12<00:00, 530.28it/s]
100%|███████████████████████████████████████████████████████████| 197473/197473 [00:41<00:00, 4784.83it/s]
100%|█████████████████████████████████████████████████████████| 197473/197473 [00:00<00:00, 411310.13it/s]
100%|████████████████████████████████████████████████████████████| 197473/197473 [06:13<00:00, 528.41it/s]
100%|██████████████████████████████████████████████████████████| 197473/197473 [00:10<00:00, 18023.41it/s]
100%|█████████████████████████████████████████████████████████| 197473/197473 [00:00<00:00, 523683.69it/s]
100%|████████████████████████████████████████████████████████████| 197473/197473 [06:13<00:00, 528.18it/s]
100%|███████████████████████████████████████████████████████████| 197473/197473 [00:24<00:00, 7978.25it/s]
100%|█████████████████████████████████████████████████████████| 197473/197473 [00:00<00:00, 445663.60it/s]


In [83]:
# with open('pos_tag_result.pickle', 'wb') as f:
#     pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)

In [13]:
with open('pos_tag_result.pickle', 'rb') as f:
    result = pickle.load(f)

In [3]:
result.keys()

dict_keys(['A', 'B', 'C'])

In [4]:
result['A'].keys()

dict_keys(['Noun', 'Adj', 'Verb'])

# Noun

In [5]:
for word, count in result['A']['Noun'].most_common(10):
    print(word, count)

game 31005
video 5024
time 4121
story 3416
character 3269
person 3039
thing 2325
trailer 2265
way 2094
ad 2045


In [6]:
len(result['A']['Noun'].keys())

34914

In [7]:
total_noun_cnt = sum(result['A']['Noun'].values())

In [8]:
total_top_100 =0
for _, count in result['A']['Noun'].most_common(100):
    total_top_100 += count

In [9]:
total_top_100

150514

In [10]:
def get_top_100_percent(group, pos, verbose=True):
    total_top_100 =0
    for word, count in result[group][pos].most_common(100):
        total_top_100 += count
    
    for word, count in result[group][pos].most_common(100):
        if verbose:
            print(word, "%.2f" % (count/total_top_100*100), count)
    
    return total_top_100

# test

In [11]:
shared_pos = ( set([word for word, _ in result["A"]["Noun"].most_common(100)]) 
    & set([word for word, _ in result["B"]["Noun"].most_common(100)])
    & set([word for word, _ in result["C"]["Noun"].most_common(100)]))

In [12]:
a = {}
for group in ["A", "B", "C"]:
    for word, count in result[group]["Noun"].most_common(100):
        if word in shared_pos:
            if word not in a:
                a[word] = [count]
            else:
                a[word].append(count)

In [13]:
def shared_word_percent(pos):
    
    shared_pos = ( set([word for word, _ in result["A"][pos].most_common(100)]) 
    & set([word for word, _ in result["B"][pos].most_common(100)])
    & set([word for word, _ in result["C"][pos].most_common(100)]))
    
    a = {}
    for group in ["A", "B", "C"]:
        total_top_100 =0
        for word, count in result[group][pos].most_common(100):
            total_top_100 += count
        
        for word, count in result[group][pos].most_common(100):
            if word in shared_pos:
                percent = count/total_top_100
                if word not in a:
                    a[word] = [percent*100]
                else:
                    a[word].append(percent*100)
    return a

## Entropy

In [12]:
from scipy.stats import entropy

In [17]:
def word_entropy(pos):
    a = [x for x,y in result["A"][pos].most_common()]
    b = [x for x,y in result["B"][pos].most_common()]
    c = [x for x,y in result["C"][pos].most_common()]
    
    all_word = set(a+b+c)
    
    for word in all_word:
    
        a_cnt = result["A"][pos][word]
        b_cnt = result["B"][pos][word]
        c_cnt = result["C"][pos][word]

        ent = entropy([a_cnt, b_cnt, c_cnt], base=3)

        if (0.01 < ent < 0.5) and sum([a_cnt, b_cnt, c_cnt]) > 100:
            print(word, a_cnt, b_cnt, c_cnt, ent)

In [20]:
word_entropy("Verb")

fear 12 8 114 0.47501919498779804
Sean 0 1 282 0.02136867968413015
mark 7 1 361 0.1025626174562346
tracing 3 7 233 0.1790690149425795
helped 70 93 957 0.4681456036127052
relate 20 9 382 0.27196768103350544
dating 5 9 118 0.3707628987871908
trending 4 4 278 0.13381354751534774
EAT 1 0 229 0.025470481200608422
glitch 10 9 122 0.44468546333254455
freak 10 2 169 0.24924932160528948
Wanted 10 4 133 0.3381267997534391
depressed 6 6 251 0.1975762811821326
shine 11 4 115 0.3864333698448698
ARE 17 12 155 0.4938729476233683
laughing 46 32 581 0.40393843121464523
flip 11 6 204 0.2923065375275199
dancing 16 3 215 0.2886341689847257
controlled 12 7 117 0.451816403669867
Eat 3 1 328 0.06552765382137757
feed 19 3 109 0.4728640857431476
pressed 7 5 165 0.2675637348925419
flash 5 3 205 0.16835220278208732
escape 16 6 364 0.2293926427215555
AM 38 7 215 0.4874695255930076
grind 233 33 6 0.4301914270752025
Quake 6 3 129 0.2572337024555733
jack 2 1 258 0.06378635314905241
screaming 43 14 530 0.339341445188

# shared Noun

In [14]:
for k in shared_word_percent("Noun").keys():
    print(k, *shared_word_percent("Noun")[k])

game 20.59941267921921 18.35313935278403 9.610546478551111
video 3.3378954781615 4.12702172740074 4.5756679102877476
time 2.7379512869234754 3.0807901351546976 3.099403919550581
story 2.2695563203422937 1.2081013857633542 0.4463456123047015
character 2.171890986884941 1.2107334149262374 0.6596909530139943
person 2.0190812814754775 1.7634595391316936 2.5794416570178824
thing 1.544706804682621 1.57526945398555 2.3485855597009593
way 1.391232709249638 1.602905760195823 1.3090183970153093
one 1.2689849449220671 1.0396515193388343 1.3225981674457166
year 1.1248123098183558 1.6568623580349269 0.9995425761539232
lot 1.0331264865726777 1.076499927619198 0.8758951927612677
day 1.005886495608382 1.329174727255978 1.348328258787541
guy 1.002564545490785 1.015963256872886 1.2050259445087697
world 0.9520709037033099 0.8593575216813402 0.5378303815200767
.. 0.9248309127390143 1.1291405108768604 0.7057907000014294
series 0.9228377426684561 1.310750523115796 0.666480838229198
something 0.8836387312808

In [15]:
for k in shared_word_percent("Adj").keys():
    print(k, *shared_word_percent("Adj")[k])

good 6.835231259107346 6.548781741544156 4.684744268077601
first 5.040746937233526 3.924642746992906 3.172059422059422
more 3.6388903880403696 4.083993009149789 5.307115723782391
great 3.2907874143234928 3.3360748432199037 2.222391805725139
new 3.2462626153597065 3.6265035468284155 2.681115181115181
best 3.047924874521021 3.3129433535519692 2.8608736942070276
much 2.6620432835015384 2.8554538912305953 3.3552096052096054
better 2.633709320524583 2.5624550221034235 1.3668430335097002
same 2.587835285228561 2.5675953531407423 1.9213810880477546
other 2.5581520859193696 2.4467975737637504 2.368233618233618
favorite 2.4744994333207404 2.5239025393235326 1.0641364808031475
bad 2.278860165146527 2.290017477125527 1.8510039343372677
original 1.6838469426304712 2.457078235838388 0.6774860941527608
many 1.542177127745696 1.657756759535314 1.8815289648622981
old 1.5030492741108534 1.4778451732291558 1.6008682675349342
only 1.3883641858707971 1.2208286213632158 1.4490910324243658
awesome 1.3559825