In [1]:
import pandas as pd
import numpy as np
from collections import Counter,OrderedDict
import tensorflow as tf
import datetime
from gensim.models import Word2Vec, KeyedVectors
from konlpy.tag import Okt
import random
import copy
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [None]:
def basic_preprocessing(string):
    if string != string or type(string) == type(1) or string == ' ':
        return ''
    else:
        return string
    
DF['교정된 리뷰'] = DF['교정된 리뷰'].apply(basic_preprocessing)

def twitter(string):
    tagged_list = okt.pos(string,norm=True,stem=True) 
    
    if random.uniform(0,1) < 1e-3 and len(tagged_list) >= 5:
        print('--------------------')
        print('교정전 리뷰\n',string)
        print('\n교정후 리뷰\n','/'.join([x for x,y in tagged_list if y == 'Noun']))
        
    return [x for x,y in tagged_list if y == 'Noun']

tagged_review = DF['교정된 리뷰'].apply(twitter)

tagged_review.apply(lambda x : '/'.join(x)).to_csv('명사_형태소분석.csv')

In [2]:
DF = pd.read_csv('labeled_df.csv')

review_str_noun = pd.read_csv('명사_형태소분석.csv')

In [3]:
review_str_noun=review_str_noun.drop('0',axis=1)

In [4]:
review_str_noun.head(10)

Unnamed: 0,Unnamed: 1
0,
1,
2,점/만점/점
3,우파/루크/로스/개/보석
4,
5,
6,계속/다운/리소스/파일/접속/다시/설치/해도
7,
8,호루/근대/또/루루/좀
9,


In [5]:
DF

Unnamed: 0.1,Unnamed: 0,평점,시간,이름,timestamp,교정된 리뷰,length,is_good
0,181637,5,2018-11-25,우파루마운틴 for Kakao,1543071600,,1,False
1,181638,4,2018-11-25,우파루마운틴 for Kakao,1543071600,좋음,2,False
2,181639,4,2018-11-25,우파루마운틴 for Kakao,1543071600,재미있어요,5,False
3,181640,5,2018-11-25,우파루마운틴 for Kakao,1543071600,Num 점만점에 Num 점,10,False
4,181641,5,2018-11-25,우파루마운틴 for Kakao,1543071600,"우파루크로스를개말고많이해주셨음 좋겠구요 , 보석도 많이주셨으면 좋겠어요",39,False
5,181642,5,2018-11-25,우파루마운틴 for Kakao,1543071600,재밌네요,4,False
6,181643,5,2018-11-25,우파루마운틴 for Kakao,1543071600,좋아오,3,False
7,181644,3,2018-11-25,우파루마운틴 for Kakao,1543071600,계속 다운받은 리소스 파일이 잘못되었다면서 접속이 안되네요 다시 설치 해도 안되는데...,56,False
8,181645,5,2018-11-25,우파루마운틴 for Kakao,1543071600,Num,5,False
9,181646,5,2018-11-25,우파루마운틴 for Kakao,1543071600,호루 !! 근대 또 루루 어떻게 만드는지좀 Elip,24,False


In [6]:
def str_to_list(string):
    
    if string != string or len(string) == 0:
        return []
    else:
        return string.split('/')

tagged_review = review_str_noun[review_str_noun.columns[0]].apply(str_to_list)

In [7]:
tagged_review

0                                                        []
1                                                        []
2                                                [점, 만점, 점]
3                                       [우파, 루크, 로스, 개, 보석]
4                                                        []
5                                                        []
6                         [계속, 다운, 리소스, 파일, 접속, 다시, 설치, 해도]
7                                                        []
8                                        [호루, 근대, 또, 루루, 좀]
9                                                        []
10                                           [캐릭터, 소환, 좋아욘]
11                                                       []
12                                    [처음, 헤이데, 예전, 게임, 정말]
13                       [리소스, 파일, 문제, 계속, 튕기네, 덕분, 게임, 시작]
14                                            [인내심, 도움, 짱짱]
15                                                  [무조, 코]
16                                      

In [8]:
review_model = Word2Vec(tagged_review, size=200, window = 5, workers=6, iter=10, sg=1)

review_vector = review_model.wv
review_vocab = review_vector.vocab

# 카운팅 객체(나중에 쓸거임)
w2c_review = dict()
for item in sorted(review_vocab,key=review_vocab.get,reverse=True):
    w2c_review[item]=review_vocab[item].count  

In [9]:
noun_freq = pd.DataFrame.from_dict(w2c_review,orient='index')

noun_freq.to_excel('명사빈도순위.xlsx')

In [10]:
noun_freq

# 이걸 기반으로 분석할 명사들을 선정함

with open('game_nouns','r') as fileobj:
    string = fileobj.read()

In [11]:
keywords = string.split('\n')[:-1]

In [12]:
keywords_freq = noun_freq.loc[keywords]

In [13]:
keywords_freq

Unnamed: 0,0
게임,80629
겜,17301
시간,14314
현질,12369
업데이트,12184
재미,12061
유저,10627
사람,7294
접속,10445
돈,8629


In [14]:
# 이걸로 idf를 만들면됨.
keywords_freq.head(10)
# idf를 만들기 위해서 total count를 계산하자.
total_count = keywords_freq.sum()[0]
print("total count\n",total_count)
# idf 만들기
idf_dict = np.log(total_count/keywords_freq)[0].to_dict()
idf_dict

total count
 344183


{'게임': 1.4513151063797365,
 '겜': 2.990409189366734,
 '시간': 3.1799354133517133,
 '현질': 3.3259801502752464,
 '업데이트': 3.3410498770752586,
 '재미': 3.3511963860670133,
 '유저': 3.477775560277561,
 '사람': 3.8541214002816306,
 '접속': 3.49505009765959,
 '돈': 3.6860448691419485,
 '계정': 3.721670245414437,
 '과금': 3.8532991445137736,
 '캐릭터': 3.8975516741308183,
 '이벤트': 3.979887960791454,
 '렉': 3.9930336900039567,
 '오류': 4.009712656588512,
 '버그': 4.49336011836651,
 '플레이': 4.108103019788744,
 '그래픽': 4.1569992341200015,
 '보상': 4.178194813305989,
 '서버': 4.185424677370769,
 '레벨': 4.208019053616703,
 '광고': 4.211148946625631,
 '삭제': 4.223172349573531,
 '화면': 4.253367880361132,
 '폰': 4.264258771939579,
 '로딩': 4.301728952054554,
 '유도': 4.316204736860469,
 '설치': 4.392373926196831,
 '해결': 4.409905765905497,
 '문제': 4.418065158425512,
 '로그인': 4.425563077214176,
 '다운': 4.432628522613409,
 '결제': 4.48225032834436,
 '스토리': 4.501708718905027,
 '실행': 4.5122434489378,
 '아이템': 4.550564381682636,
 '복구': 4.55194084439136,
 '

In [15]:
keyword_index = {y:x for x,y in enumerate(idf_dict.keys())}
inv_dict = list(idf_dict.keys())

In [16]:
def extract(words):
    
    return [w for w in words if w in keywords]

extract_nouns = tagged_review.apply(extract)

In [17]:
extract_nouns.head(10)

0              []
1              []
2              []
3              []
4              []
5              []
6    [다운, 접속, 설치]
7              []
8              []
9              []
Name: Unnamed: 1, dtype: object

In [18]:
extract_nouns_str = extract_nouns.apply(lambda x :
                                       '/'.join([str(keyword_index[w]) for w in x]))

DF['문자열'] = extract_nouns_str

In [19]:
COPY = DF[['문자열','이름','is_good']]

GROUP_BY_GAME = COPY.groupby(['이름'])

keys = list(GROUP_BY_GAME.groups.keys())
indices = list(GROUP_BY_GAME.groups.values())

DFs = [COPY.loc[indices[x]] for x in range(len(indices))]

In [20]:
DFs[0]

Unnamed: 0,문자열,이름,is_good
196019,38,2018갓오브하이스쿨 with NAVER WEBTOON,False
196020,10,2018갓오브하이스쿨 with NAVER WEBTOON,False
196021,,2018갓오브하이스쿨 with NAVER WEBTOON,False
196022,0,2018갓오브하이스쿨 with NAVER WEBTOON,False
196023,,2018갓오브하이스쿨 with NAVER WEBTOON,False
196024,,2018갓오브하이스쿨 with NAVER WEBTOON,False
196025,1,2018갓오브하이스쿨 with NAVER WEBTOON,False
196026,13,2018갓오브하이스쿨 with NAVER WEBTOON,False
196027,,2018갓오브하이스쿨 with NAVER WEBTOON,False
196028,,2018갓오브하이스쿨 with NAVER WEBTOON,False


In [21]:
import operator

def basic_preprocessing(string):
    if string != string or type(string) == type(1) or string == ' ':
        return ''
    else:
        return string

df_list = []
for df in DFs:
    
    label = df['is_good'].values[0]
    game_name = df['이름'].values[0]
    print(game_name,':',label)
    
    df['문자열'] = df['문자열'].apply(basic_preprocessing)
    tokens = '/'.join(df['문자열']).split('/')
    Count_review = Counter([x for x in tokens if len(x) != 0])
    tf_idf = {int(x):idf_dict[inv_dict[int(x)]]*np.log(y+1) for x,y in Count_review.items()}
    sorted_tfidf = sorted(tf_idf.items(), key=operator.itemgetter(0))
    sorted_tfidf = [(inv_dict[x],y) for x,y in sorted_tfidf] + [('is_good',label)]
    df_ = pd.DataFrame(sorted_tfidf,columns=[0,game_name])
    df_ = df_.set_index(0).T
    df_list.append(df_)
    
full_vector = pd.concat(df_list)

2018갓오브하이스쿨 with NAVER WEBTOON : False
FINAL FANTASY BRAVE EXVIUS : False
Idle Heroes - 아이들 히어로즈 : False
Mobile Strike : False
Sdorica - sunset - (스도리카 -선셋-) : False
War Machines: 탱크 게임 - 무료 : False
Yu-Gi-Oh! Duel Links : False
 Jurassic World™: The Game : False
검과마법 for Kakao : False
권력:THERULERS : False
낚시의 신 : False
냥코 대전쟁 : False
넷마블 포커 - 바카라, 7포커, 로우바둑이, 뉴포커 : True
노블레스 with NAVER WEBTOON : False
놀러와 마이홈 : False
다크어벤저3 : False
더 킹 오브 파이터즈98 UM온라인 for kakao : False
도미네이션즈 : False
라스트 쉘터 (Last Shelter: Survival) : True
라스트 엠파이어 워 Z : False
레알팜 : False
레이븐: SIGN : False
로드 모바일: 제국의 전쟁 - MMORPG : True
리니지2 레볼루션 : True
리니지M : True
마스터 오브 이터니티 : False
마피아 시티 : True
매직 러쉬:히어로즈(Magic Rush: Heroes) : False
메이플스토리M : False
모두의마블 for kakao : True
몬스터 길들이기 for kakao : False
몬스터슈퍼리그 : False
뮤오리진 : False
반지 : False
별이되어라! : False
사커스피리츠 : False
삼국블레이드 : False
삼국지 조조전 Online : False
섀도우버스(Shadowverse) : False
서머너즈 워: 천공의 아레나 : True
세븐나이츠 for Kakao : True
소녀전선 : False
소드 아트 온라인　-메모리 디프래그- : False

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [22]:
labels, vector = full_vector['is_good'], full_vector.drop('is_good',axis=1)
vector = vector.fillna(4)

In [23]:
labels = labels.astype('int')

In [24]:
from sklearn import tree

In [58]:
clf = tree.DecisionTreeClassifier(criterion='entropy',max_depth=5,max_leaf_nodes=8)
clf = clf.fit(vector, labels.values)

In [59]:
from sklearn.metrics import accuracy_score

y_pred_tr = clf.predict(vector)
print('Accuracy: %.2f' % accuracy_score(labels.values, y_pred_tr))

Accuracy: 0.97


In [60]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.datasets import load_wine
from IPython.display import SVG
from graphviz import Source
from IPython.display import display

graph = tree.export_graphviz(clf, out_file='tree.dot'
   , feature_names=full_vector.columns[1:], class_names=['망겜','흥겜'] 
   , filled = True, rounded = True, proportion = False)

In [57]:
import pydotplus
graph = pydotplus.graph_from_dot_data(graph)

TypeError: object of type 'NoneType' has no len()

In [46]:
from IPython.display import Image
Image(graph.create_png())

InvocationException: GraphViz's executables not found