# Kakao Arena 2회 대회 : 브런치 사용자를 위한 글 추천 대회

## 데이터 EDA ipython notebook

In [1]:
from collections import Counter
from datetime import timedelta, datetime
import glob
from itertools import chain
import json
import os
import re

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import register_matplotlib_converters
import seaborn as sns

In [3]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

In [4]:
font_path = 'NanumGothic.ttf'
font_name = fm.FontProperties(fname=font_path, size=10).get_name()
plt.rc('font', family=font_name, size=12)
plt.rcParams["figure.figsize"] = (20, 10)
register_matplotlib_converters()

In [5]:
directory = 'data/'

## 1. Data Read

### a. Magazine.json

In [6]:
magazine = pd.read_json(directory + 'magazine.json',lines = True)

In [7]:
magazine.shape

(27967, 2)

In [8]:
magazine.head()

Unnamed: 0,id,magazine_tag_list
0,38842,"[브런치북, 육아일기, 대화법, 들려주고픈이야기]"
1,11540,"[tea, food]"
2,11541,[food]
3,11546,"[브런치북, 일상, 시, 사람]"
4,11544,"[감성에세이, 노래, 음악에세이]"


In [9]:
magazine.columns = ['magazine_id','magazine_tag_list']

In [10]:
magazine.head()

Unnamed: 0,magazine_id,magazine_tag_list
0,38842,"[브런치북, 육아일기, 대화법, 들려주고픈이야기]"
1,11540,"[tea, food]"
2,11541,[food]
3,11546,"[브런치북, 일상, 시, 사람]"
4,11544,"[감성에세이, 노래, 음악에세이]"


### b. Metadata.json

In [11]:
metadata = pd.read_json(directory + 'metadata.json', lines = True)

In [12]:
metadata.shape

(643104, 9)

In [13]:
metadata.head()

Unnamed: 0,article_id,display_url,id,keyword_list,magazine_id,reg_ts,sub_title,title,user_id
0,782,https://brunch.co.kr/@bookdb/782,@bookdb_782,"[여행, 호주, 국립공원]",8982,1474944427000,세상 어디에도 없는 호주 Top 10,"사진으로 옮기기에도 아까운, 리치필드 국립공원",@bookdb
1,81,https://brunch.co.kr/@kohwang56/81,@kohwang56_81,"[목련꽃, 아지랑이, 동행]",12081,1463092749000,,[시] 서러운 봄,@kohwang56
2,4,https://brunch.co.kr/@hannahajink/4,@hannahajink_4,[],0,1447997287000,무엇 때문에,무엇을 위해,@hannahajink
3,88,https://brunch.co.kr/@bryceandjuli/88,@bryceandjuli_88,"[감정, 마음, 위로]",16315,1491055161000,,싫다,@bryceandjuli
4,34,https://brunch.co.kr/@mijeongpark/34,@mijeongpark_34,"[유럽여행, 더블린, 아일랜드]",29363,1523292942000,#7. 내 친구의 집은 어디인가,Dubliner#7,@mijeongpark


In [14]:
metadata2 = metadata.copy()

In [15]:
metadata2['reg_datetime'] = metadata2['reg_ts'].apply(lambda x : datetime.fromtimestamp(x/1000.0))
metadata2.loc[metadata2['reg_datetime'] == metadata2['reg_datetime'].min(), 'reg_datetime'] = datetime(2090, 12, 31)
metadata2['reg_dt'] = metadata2['reg_datetime'].dt.date
metadata2['type'] = metadata2['magazine_id'].apply(lambda x : '개인' if x == 0.0 else '매거진')

In [16]:
metadata2.columns = ['id', 'display_url', 'article_id', 'keyword_list', 'magazine_id', 'reg_ts', 'sub_title', 'title', 'author_id', 'reg_datetime', 'reg_dt', 'type']

In [17]:
metadata2.head()

Unnamed: 0,id,display_url,article_id,keyword_list,magazine_id,reg_ts,sub_title,title,author_id,reg_datetime,reg_dt,type
0,782,https://brunch.co.kr/@bookdb/782,@bookdb_782,"[여행, 호주, 국립공원]",8982,1474944427000,세상 어디에도 없는 호주 Top 10,"사진으로 옮기기에도 아까운, 리치필드 국립공원",@bookdb,2016-09-27 11:47:07,2016-09-27,매거진
1,81,https://brunch.co.kr/@kohwang56/81,@kohwang56_81,"[목련꽃, 아지랑이, 동행]",12081,1463092749000,,[시] 서러운 봄,@kohwang56,2016-05-13 07:39:09,2016-05-13,매거진
2,4,https://brunch.co.kr/@hannahajink/4,@hannahajink_4,[],0,1447997287000,무엇 때문에,무엇을 위해,@hannahajink,2015-11-20 14:28:07,2015-11-20,개인
3,88,https://brunch.co.kr/@bryceandjuli/88,@bryceandjuli_88,"[감정, 마음, 위로]",16315,1491055161000,,싫다,@bryceandjuli,2017-04-01 22:59:21,2017-04-01,매거진
4,34,https://brunch.co.kr/@mijeongpark/34,@mijeongpark_34,"[유럽여행, 더블린, 아일랜드]",29363,1523292942000,#7. 내 친구의 집은 어디인가,Dubliner#7,@mijeongpark,2018-04-10 01:55:42,2018-04-10,매거진


#### metadata + magazine

In [18]:
meta_magazine = pd.merge(metadata2,magazine,how='left',left_on='magazine_id',right_on='magazine_id')

In [19]:
meta_magazine.head()

Unnamed: 0,id,display_url,article_id,keyword_list,magazine_id,reg_ts,sub_title,title,author_id,reg_datetime,reg_dt,type,magazine_tag_list
0,782,https://brunch.co.kr/@bookdb/782,@bookdb_782,"[여행, 호주, 국립공원]",8982,1474944427000,세상 어디에도 없는 호주 Top 10,"사진으로 옮기기에도 아까운, 리치필드 국립공원",@bookdb,2016-09-27 11:47:07,2016-09-27,매거진,"[책, 독서, 독서에세이]"
1,81,https://brunch.co.kr/@kohwang56/81,@kohwang56_81,"[목련꽃, 아지랑이, 동행]",12081,1463092749000,,[시] 서러운 봄,@kohwang56,2016-05-13 07:39:09,2016-05-13,매거진,"[사랑, 행복, 이별]"
2,4,https://brunch.co.kr/@hannahajink/4,@hannahajink_4,[],0,1447997287000,무엇 때문에,무엇을 위해,@hannahajink,2015-11-20 14:28:07,2015-11-20,개인,[]
3,88,https://brunch.co.kr/@bryceandjuli/88,@bryceandjuli_88,"[감정, 마음, 위로]",16315,1491055161000,,싫다,@bryceandjuli,2017-04-01 22:59:21,2017-04-01,매거진,"[생각, 일기, 이야기]"
4,34,https://brunch.co.kr/@mijeongpark/34,@mijeongpark_34,"[유럽여행, 더블린, 아일랜드]",29363,1523292942000,#7. 내 친구의 집은 어디인가,Dubliner#7,@mijeongpark,2018-04-10 01:55:42,2018-04-10,매거진,"[브런치북, 여행, 에세이, 시]"


#### meta_magazine 쓸모없는 변수 제거

In [20]:
meta_magazine2 = meta_magazine[['article_id','keyword_list','magazine_id','author_id','reg_dt','type','magazine_tag_list']]

In [21]:
meta_magazine2.head()

Unnamed: 0,article_id,keyword_list,magazine_id,author_id,reg_dt,type,magazine_tag_list
0,@bookdb_782,"[여행, 호주, 국립공원]",8982,@bookdb,2016-09-27,매거진,"[책, 독서, 독서에세이]"
1,@kohwang56_81,"[목련꽃, 아지랑이, 동행]",12081,@kohwang56,2016-05-13,매거진,"[사랑, 행복, 이별]"
2,@hannahajink_4,[],0,@hannahajink,2015-11-20,개인,[]
3,@bryceandjuli_88,"[감정, 마음, 위로]",16315,@bryceandjuli,2017-04-01,매거진,"[생각, 일기, 이야기]"
4,@mijeongpark_34,"[유럽여행, 더블린, 아일랜드]",29363,@mijeongpark,2018-04-10,매거진,"[브런치북, 여행, 에세이, 시]"


In [23]:
meta_magazine2['tag_list'] = meta_magazine2['keyword_list'] + meta_magazine2['magazine_tag_list']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [24]:
meta_magazine2.head()

Unnamed: 0,article_id,keyword_list,magazine_id,author_id,reg_dt,type,magazine_tag_list,tag_list
0,@bookdb_782,"[여행, 호주, 국립공원]",8982,@bookdb,2016-09-27,매거진,"[책, 독서, 독서에세이]","[여행, 호주, 국립공원, 책, 독서, 독서에세이]"
1,@kohwang56_81,"[목련꽃, 아지랑이, 동행]",12081,@kohwang56,2016-05-13,매거진,"[사랑, 행복, 이별]","[목련꽃, 아지랑이, 동행, 사랑, 행복, 이별]"
2,@hannahajink_4,[],0,@hannahajink,2015-11-20,개인,[],[]
3,@bryceandjuli_88,"[감정, 마음, 위로]",16315,@bryceandjuli,2017-04-01,매거진,"[생각, 일기, 이야기]","[감정, 마음, 위로, 생각, 일기, 이야기]"
4,@mijeongpark_34,"[유럽여행, 더블린, 아일랜드]",29363,@mijeongpark,2018-04-10,매거진,"[브런치북, 여행, 에세이, 시]","[유럽여행, 더블린, 아일랜드, 브런치북, 여행, 에세이, 시]"


In [29]:
meta_maga = meta_magazine2[['article_id','author_id','reg_dt','type','tag_list']]

In [30]:
meta_maga.head()

Unnamed: 0,article_id,author_id,reg_dt,type,tag_list
0,@bookdb_782,@bookdb,2016-09-27,매거진,"[여행, 호주, 국립공원, 책, 독서, 독서에세이]"
1,@kohwang56_81,@kohwang56,2016-05-13,매거진,"[목련꽃, 아지랑이, 동행, 사랑, 행복, 이별]"
2,@hannahajink_4,@hannahajink,2015-11-20,개인,[]
3,@bryceandjuli_88,@bryceandjuli,2017-04-01,매거진,"[감정, 마음, 위로, 생각, 일기, 이야기]"
4,@mijeongpark_34,@mijeongpark,2018-04-10,매거진,"[유럽여행, 더블린, 아일랜드, 브런치북, 여행, 에세이, 시]"


In [31]:
meta_maga.tail()

Unnamed: 0,article_id,author_id,reg_dt,type,tag_list
643099,@uxstar_24,@uxstar,2019-03-25,매거진,"[3D, UI, 제스처, IT, UX, UI]"
643100,@reading15m_575,@reading15m,2018-10-31,매거진,"[독서모임, 경험수집, 글쓰기, 경험, 기회, 잡화점]"
643101,@hje3884_118,@hje3884,2017-11-06,매거진,"[생각, 에세이, 괴로움, 브런치북, 사랑, 에세이, 감성에세이]"
643102,@julieleekgep_12,@julieleekgep,2018-10-31,매거진,"[여행, 유럽여행, 리스본, 유럽여행, 포르투갈, 포르투갈여행]"
643103,@julieleekgep_13,@julieleekgep,2018-11-01,매거진,"[리스본, 여행, 유럽, 유럽여행, 포르투갈, 포르투갈여행]"


In [43]:
import datetime

In [49]:
meta_maga['reg_dt'][0]

datetime.date(2016, 9, 27)

In [50]:
dt1 = datetime.date(2090,12,31)
dt1

datetime.date(2090, 12, 31)

In [51]:
sum(meta_maga['reg_dt'] == dt1)

129

### c. Users.json

In [15]:
users = pd.read_json(directory + 'users.json', lines = True)

In [16]:
users.shape

(310758, 3)

In [75]:
users.head(10)

Unnamed: 0,following_list,user_id,keyword_list2
0,"[@perytail, @brunch]",#901985d8bc4c481805c4a4f911814c4a,[]
1,"[@holidaymemories, @wadiz, @sciforus, @dailydu...",#1fd89e9dcfa64b45020d9eaca54e0eed,[]
2,"[@commerceguy, @sunsutu, @kakao-it, @joohoonja...",#1d94baaea71a831e1f33e1c6bd126ed5,[]
3,"[@amberjeon48, @forsy20, @nemotokki, @hawann, ...",#04641c01892b12dc018b1410e4928c0d,[]
4,"[@dwcha7342, @iammento, @kakao-it, @dkam, @ant...",#65bcaff862aadff877e461f54187ab62,[]
5,"[@jumi710, @hana8277, @katarun, @brunch3woz, @...",#1a2b23b6332137193be79d297409befb,[]
6,"[@gabrieljmh, @megaonic, @cleancode, @simu-loo...",#d07b31cfd62b7097837eee6b8328e077,"[{'cnt': 1, 'keyword': '200일 이벤트'}, {'cnt': 2,..."
7,"[@potatohands, @ggpodori, @chae-pulib, @roysda...",#a6f7a5ff90a19ec4d583f0db1836844d,[]
8,"[@rmk011, @unitasbrand, @libraryman, @thewater...",#13b3009a8698e9d5e892534d9dcdac62,[]
9,"[@megustastu, @hongmilmil, @keeuyo, @21mission...",#ba94e11de7e31b606d7a1c8051dba02e,[]


In [46]:
users.columns = ['following_list','user_id','keyword_list2']

In [47]:
users.head()

Unnamed: 0,following_list,user_id,keyword_list2
0,"[@perytail, @brunch]",#901985d8bc4c481805c4a4f911814c4a,[]
1,"[@holidaymemories, @wadiz, @sciforus, @dailydu...",#1fd89e9dcfa64b45020d9eaca54e0eed,[]
2,"[@commerceguy, @sunsutu, @kakao-it, @joohoonja...",#1d94baaea71a831e1f33e1c6bd126ed5,[]
3,"[@amberjeon48, @forsy20, @nemotokki, @hawann, ...",#04641c01892b12dc018b1410e4928c0d,[]
4,"[@dwcha7342, @iammento, @kakao-it, @dkam, @ant...",#65bcaff862aadff877e461f54187ab62,[]


In [77]:
len(users['user_id'].unique())

310758

### d. Read Files

In [52]:
read_file_lst = glob.glob('data/read/*')

In [54]:
exclude_file_lst = ['read.tar']

In [133]:
read_df_lst = []
for f in read_file_lst:
    file_name = os.path.basename(f)
    if file_name in exclude_file_lst:
        print(file_name)
    else:
        df_temp = pd.read_csv(f, header=None, names=['raw'])
        first = pd.DataFrame(df_temp['raw'].str.split(' #').str[0])
        second = pd.DataFrame('#' + df_temp['raw'].str.split(' #').str[1][df_temp['raw'].str.split(' #').str[1].notnull()])
        df_temp = pd.DataFrame()
        df_temp = df_temp.append(first)
        df_temp = df_temp.append(second)
        df_temp['dt'] = file_name[:8]
        df_temp['hr'] = file_name[8:10]
        df_temp['user_id'] = df_temp['raw'].str.split(' ').str[0]
        df_temp['article_id'] = df_temp['raw'].str.split(' ').str[1:].str.join(' ').str.strip()
        read_df_lst.append(df_temp)

In [134]:
read = pd.concat(read_df_lst)

In [135]:
read.shape

(3548447, 5)

In [136]:
read.head()

Unnamed: 0,raw,dt,hr,user_id,article_id
0,#e208be4ffea19b1ceb5cea2e3c4dc32c @kty0613_91,20181001,0,#e208be4ffea19b1ceb5cea2e3c4dc32c,@kty0613_91
1,#0a3d493f3b2318be80f391eaa00bfd1c @miamiyoung_...,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@miamiyoung_31 @banksalad_49 @rlfrjsdn_95 @rea...
2,#b90d3ee7ed0d7d827aae168e159749f1 @joeunha_4 @...,20181001,0,#b90d3ee7ed0d7d827aae168e159749f1,@joeunha_4 @yoonvi_3
3,#b8b9d09fe2961fd62edc94912bf75a90 @hyejinchoi_...,20181001,0,#b8b9d09fe2961fd62edc94912bf75a90,@hyejinchoi_122 @hyejinchoi_86 @hyejinchoi_42 ...
4,#072f742eda9359cdac03ad080193c11d @doyeonsunim...,20181001,0,#072f742eda9359cdac03ad080193c11d,@doyeonsunim_240 @k52524_297 @bule13_33 @kwong...


In [137]:
read.tail()

Unnamed: 0,raw,dt,hr,user_id,article_id
606,#a46c7024b3804ab1f4fd549e4d9448bb_45,20190228,23,#a46c7024b3804ab1f4fd549e4d9448bb_45,
675,#6eb349ad62c019fa367d42949288d1dc_3,20190228,23,#6eb349ad62c019fa367d42949288d1dc_3,
755,#d17aad82f76714b2c6c5938c1b795d28_155 @haksook...,20190228,23,#d17aad82f76714b2c6c5938c1b795d28_155,@haksookim_90 @haksookim_91 @gompang_121 @jint...
786,#c312c051de62857bdb28fcc981141c56_17,20190228,23,#c312c051de62857bdb28fcc981141c56_17,
815,#d6a2d0cfebc30014e9b0d9e82c21e6a9_19 @pjsprau_...,20190228,23,#d6a2d0cfebc30014e9b0d9e82c21e6a9_19,@pjsprau_116 @bookfit_4206 @dailylife_420 @hou...


#### 탐색하기 좋은 데이터 포맷으로 변경

In [142]:
def chainer(s):
    return list(chain.from_iterable(s.str.split(' ')))

In [143]:
read_cnt_by_user = read['article_id'].str.split(' ').map(len)

In [144]:
read_raw = pd.DataFrame({'dt': np.repeat(read['dt'], read_cnt_by_user),
                         'hr': np.repeat(read['hr'], read_cnt_by_user),
                         'user_id': np.repeat(read['user_id'], read_cnt_by_user),
                         'article_id': chainer(read['article_id'])})

In [145]:
read_raw.shape

(21754791, 4)

In [146]:
read_raw.head()

Unnamed: 0,dt,hr,user_id,article_id
0,20181001,0,#e208be4ffea19b1ceb5cea2e3c4dc32c,@kty0613_91
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@miamiyoung_31
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@banksalad_49
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@rlfrjsdn_95
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@readme999_140


In [32]:
#print("전체 데이터 건수:", read_raw.shape)
#print("중복 소비를 제외한 데이터 건수:", read_raw[['user_id', 'article_id']].drop_duplicates().shape)
#print("Unique 독자 수:", len(read_raw['user_id'].unique()))
#print("소비된 Unique 글 수:", len(read_raw['article_id'].unique()))

전체 데이터 건수: (22110706, 4)
중복 소비를 제외한 데이터 건수: (12597878, 2)
Unique 독자 수: 306222
소비된 Unique 글 수: 505841


In [147]:
print("전체 데이터 건수:", read_raw.shape)
print("중복 소비를 제외한 데이터 건수:", read_raw[['user_id', 'article_id']].drop_duplicates().shape)
print("Unique 독자 수:", len(read_raw['user_id'].unique()))
print("소비된 Unique 글 수:", len(read_raw['article_id'].unique()))

전체 데이터 건수: (21754791, 4)
중복 소비를 제외한 데이터 건수: (12431607, 2)
Unique 독자 수: 310497
소비된 Unique 글 수: 498438


In [148]:
read_raw.head()

Unnamed: 0,dt,hr,user_id,article_id
0,20181001,0,#e208be4ffea19b1ceb5cea2e3c4dc32c,@kty0613_91
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@miamiyoung_31
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@banksalad_49
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@rlfrjsdn_95
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@readme999_140


In [149]:
read_raw.tail()

Unnamed: 0,dt,hr,user_id,article_id
786,20190228,23,#c312c051de62857bdb28fcc981141c56_17,
815,20190228,23,#d6a2d0cfebc30014e9b0d9e82c21e6a9_19,@pjsprau_116
815,20190228,23,#d6a2d0cfebc30014e9b0d9e82c21e6a9_19,@bookfit_4206
815,20190228,23,#d6a2d0cfebc30014e9b0d9e82c21e6a9_19,@dailylife_420
815,20190228,23,#d6a2d0cfebc30014e9b0d9e82c21e6a9_19,@houseggumiki_88


In [150]:
read_raw[read_raw.article_id == ''].head()

Unnamed: 0,dt,hr,user_id,article_id
28,20181001,0,#db7acb1f50e54b8f78344803fb1eb5cf,
31,20181001,0,#7411fe2ebde59b981f7b9e22c153b3bb,
117,20181001,0,#3dd58f2ad342f93e7650ca5d159f64b5,
124,20181001,0,#e3c6b5ce0f1834cd932d395bd86a3d13,
134,20181001,0,#a575890bcdff2bc22e7ef731d6766809,


In [151]:
read_raw2 = read_raw[read_raw.article_id != '']

In [152]:
print("전체 데이터 건수:", read_raw2.shape)
print("중복 소비를 제외한 데이터 건수:", read_raw2[['user_id', 'article_id']].drop_duplicates().shape)
print("Unique 독자 수:", len(read_raw2['user_id'].unique()))
print("소비된 Unique 글 수:", len(read_raw2['article_id'].unique()))

전체 데이터 건수: (21716610, 4)
중복 소비를 제외한 데이터 건수: (12417455, 2)
Unique 독자 수: 308532
소비된 Unique 글 수: 498437


### e. Contents - data.5/data.6

In [51]:
data5 = pd.read_json(directory + 'data.5', lines = True)

In [52]:
data5.shape

(100000, 3)

In [53]:
data5.columns = ['chars','article_id','morphs']

In [54]:
data5.head()

Unnamed: 0,chars,article_id,morphs
0,"[[362/SW], [166+175+169+130+133+171/SL], [448+...",@kmug_79,"[[397/SW], [46800/SL], [16782/NNG, 117/JC], [6..."
1,"[[116+87/NNG, 335/NNP, 47/JKG], [18+393/NNG, 1...",@keentoknow_18,"[[2719/NNG, 356/NNP, 38/JKG], [734/NNG, 7/JX],..."
2,"[[1003+4+480/NNP], [87+307+202+128/NNP, 9/JX],...",@elara1020_108,"[[30798/NNP], [509511/NNP, 5/JX], [291245/NNP]..."
3,"[[122+313+312+121/SN, 316/NNB], [120/SN, 442/N...",@skumac_297,"[[2126/SN, 324/NNB], [1475/SN, 1478/NNB], [413..."
4,"[[347+371/NNG], [15/VV, 16/ETM], [17/NNG, 9/JX...",@haese-kang_96,"[[814/NNG], [9/VV, 10/ETM], [11/NNG, 5/JX], [4..."


In [34]:
data6 = pd.read_json(directory + 'data.6', lines = True)

In [35]:
data6.shape

(42190, 3)

In [36]:
data6.head()

Unnamed: 0,chars,id,morphs
0,"[[529/SS], [78+430/NP, 260+80/JX], [33+23/VV, ...",@volo_141,"[[817/SS], [816/NP, 248/JX], [1404/VV, 65/EC, ..."
1,"[[529/SS], [527+217+294/NNG, 61/XSN, 50/JKO], ...",@hansalt58_259,"[[817/SS], [16519/NNG, 48/XSN, 40/JKO], [533/V..."
2,"[[118/NP, 5/JKS], [336+27/NNG, 94+391/NNG, 62/...",@hago_63,"[[209/NP, 35/JKS], [1201/NNG, 9472/NNG, 50/JKO..."
3,"[[312+567/SN, 43/NNB], [43+188+594/NNG], [392+...",@elni99_82,"[[4141/SN, 2091/NNB], [1105/NNG], [2498/NNG, 7..."
4,"[[383/SW], [383/SW], [383/SW], [362/SW, 362+36...",@2econdlife_137,"[[437/SW], [437/SW], [437/SW], [397/SW, 398/SW..."


In [48]:
data6.columns = ['chars','article_id','morphs']

In [49]:
data6.head()

Unnamed: 0,chars,article_id,morphs
0,"[[529/SS], [78+430/NP, 260+80/JX], [33+23/VV, ...",@volo_141,"[[817/SS], [816/NP, 248/JX], [1404/VV, 65/EC, ..."
1,"[[529/SS], [527+217+294/NNG, 61/XSN, 50/JKO], ...",@hansalt58_259,"[[817/SS], [16519/NNG, 48/XSN, 40/JKO], [533/V..."
2,"[[118/NP, 5/JKS], [336+27/NNG, 94+391/NNG, 62/...",@hago_63,"[[209/NP, 35/JKS], [1201/NNG, 9472/NNG, 50/JKO..."
3,"[[312+567/SN, 43/NNB], [43+188+594/NNG], [392+...",@elni99_82,"[[4141/SN, 2091/NNB], [1105/NNG], [2498/NNG, 7..."
4,"[[383/SW], [383/SW], [383/SW], [362/SW, 362+36...",@2econdlife_137,"[[437/SW], [437/SW], [437/SW], [397/SW, 398/SW..."


### f. test.users

In [91]:
test = pd.read_csv(directory + 'predict/test.users', header = None,names = ['user_id'])

In [92]:
test.head()

Unnamed: 0,user_id
0,#7ee14df8642a7925b1465ff5c89efe5b
1,#8420b9385b282028eebf1ad6b4a221c0
2,#c9b31d8b64357f5854b1ba55b32eb6d3
3,#9bb1e13b5481fa3737af20870b25c723
4,#37d5f99a7f12c9ba90c4e2ac92e54ab6


In [93]:
test.tail()

Unnamed: 0,user_id
4995,#47edae20557b7cf114b9dbcbd28f93be
4996,#c57f417d8e8e29f7f147fff6a2a0efd1
4997,#c9862af1f4299a9c76b6727880bc1277
4998,#634cea2ad0b7c810392347d8cad27ab1
4999,#66686ed88efe783fe0a6b8f69fa36a49


In [94]:
test2 = pd.merge(test,users,how='left',left_on='user_id',right_on='user_id')

In [95]:
test2.head()

Unnamed: 0,user_id,following_list,keyword_list2
0,#7ee14df8642a7925b1465ff5c89efe5b,"[@avecrhae, @gongdae-unni, @jhcharm1, @travel-...",[]
1,#8420b9385b282028eebf1ad6b4a221c0,"[@jhj3211, @ohuk2011, @heat0508, @sssfriend, @...",[]
2,#c9b31d8b64357f5854b1ba55b32eb6d3,"[@marso123, @forhappywomen, @kimpyogo, @brunch]",[]
3,#9bb1e13b5481fa3737af20870b25c723,"[@moonjakga, @aemae-human, @tamarorim, @syshin...",[]
4,#37d5f99a7f12c9ba90c4e2ac92e54ab6,"[@oms1225, @dryjshin, @roysday, @hanuuri, @cyy...",[]


In [98]:
test2.tail()

Unnamed: 0,user_id,following_list,keyword_list2
4995,#47edae20557b7cf114b9dbcbd28f93be,"[@iamlived86, @mnb4237, @brunch]",[]
4996,#c57f417d8e8e29f7f147fff6a2a0efd1,"[@ryangoon, @brunch]",[]
4997,#c9862af1f4299a9c76b6727880bc1277,"[@sehoon0311, @kakao-it, @alicemelbourne, @bud...",[]
4998,#634cea2ad0b7c810392347d8cad27ab1,"[@msra81, @honeytip, @alicemelbourne, @hotelsc...",[]
4999,#66686ed88efe783fe0a6b8f69fa36a49,"[@jade, @brunch]",[]


In [101]:
test2[test2['following_list'].isnull()]

Unnamed: 0,user_id,following_list,keyword_list2
579,#28f6ed60737aafe587e3ec4eb53b23db,,
943,#f6317132d17cbc7f365514118ab68ec5,,
1504,#4dfad4d56fa3863b75e8907ff5f8a149,,
1819,#6b3f9ae509f7ce1925b7db79cdc50bfb,,
1847,#270f067fec220d7ab9e36ba69f8b94d4,,
2449,#053e31c7b0443fc51b2dad5703fa43b8,,
3034,#95486f4120b5c8cbc1163e136d83cf5f,,
3128,#401a3c8ac295dc84a7f368d77aab03ab,,
3280,#d2d09d0a95c6929604fe9d2501e441b1,,
4426,#1355cc9f0fbee79c47b9e748d4718717,,


In [117]:
test2[test2['following_list'].isnull()]['user_id']

579     #28f6ed60737aafe587e3ec4eb53b23db
943     #f6317132d17cbc7f365514118ab68ec5
1504    #4dfad4d56fa3863b75e8907ff5f8a149
1819    #6b3f9ae509f7ce1925b7db79cdc50bfb
1847    #270f067fec220d7ab9e36ba69f8b94d4
2449    #053e31c7b0443fc51b2dad5703fa43b8
3034    #95486f4120b5c8cbc1163e136d83cf5f
3128    #401a3c8ac295dc84a7f368d77aab03ab
3280    #d2d09d0a95c6929604fe9d2501e441b1
4426    #1355cc9f0fbee79c47b9e748d4718717
4631    #97beaa40c9fe59d955a1fb4a802d1605
4689    #3347311c4059a34fe7a576e28ac10218
Name: user_id, dtype: object

In [115]:
tmp = list(users['user_id'])

In [123]:
len(tmp)

310758

In [124]:
tmp2 = 0

In [125]:
for i in range(0,310758):
    tmp2 = tmp2 + sum(test2[test2['following_list'].isnull()]['user_id'] == tmp[i])

In [127]:
tmp2

0

In [133]:
test_read = pd.merge(test,read,how='left',left_on='user_id',right_on='user_id')

In [134]:
test_read.head()

Unnamed: 0,user_id,raw,dt,hr,article_id
0,#7ee14df8642a7925b1465ff5c89efe5b,#7ee14df8642a7925b1465ff5c89efe5b @seungyae613...,20190210,22,@seungyae613_22 @namgizaa_46 @connerstoneiqvk_...
1,#7ee14df8642a7925b1465ff5c89efe5b,#7ee14df8642a7925b1465ff5c89efe5b @thinktoomuc...,20190219,18,@thinktoomuch_63
2,#7ee14df8642a7925b1465ff5c89efe5b,#7ee14df8642a7925b1465ff5c89efe5b @thinktoomuc...,20190219,19,@thinktoomuch_63
3,#7ee14df8642a7925b1465ff5c89efe5b,#7ee14df8642a7925b1465ff5c89efe5b @thinktoomuc...,20190220,12,@thinktoomuch_80 @kyungajgba_60 @tenbody_1684 ...
4,#7ee14df8642a7925b1465ff5c89efe5b,#7ee14df8642a7925b1465ff5c89efe5b @infinitolee...,20190220,17,@infinitolee_70


In [136]:
sum(test_read['article_id'].isnull())

0

### g. dev.users

In [128]:
dev = pd.read_csv(directory + 'predict/test.users', header = None,names = ['user_id'])

In [129]:
dev.head()

Unnamed: 0,user_id
0,#7ee14df8642a7925b1465ff5c89efe5b
1,#8420b9385b282028eebf1ad6b4a221c0
2,#c9b31d8b64357f5854b1ba55b32eb6d3
3,#9bb1e13b5481fa3737af20870b25c723
4,#37d5f99a7f12c9ba90c4e2ac92e54ab6


In [130]:
dev2 = pd.merge(dev,users,how='left',left_on='user_id',right_on='user_id')

In [131]:
dev2.head()

Unnamed: 0,user_id,following_list,keyword_list2
0,#7ee14df8642a7925b1465ff5c89efe5b,"[@avecrhae, @gongdae-unni, @jhcharm1, @travel-...",[]
1,#8420b9385b282028eebf1ad6b4a221c0,"[@jhj3211, @ohuk2011, @heat0508, @sssfriend, @...",[]
2,#c9b31d8b64357f5854b1ba55b32eb6d3,"[@marso123, @forhappywomen, @kimpyogo, @brunch]",[]
3,#9bb1e13b5481fa3737af20870b25c723,"[@moonjakga, @aemae-human, @tamarorim, @syshin...",[]
4,#37d5f99a7f12c9ba90c4e2ac92e54ab6,"[@oms1225, @dryjshin, @roysday, @hanuuri, @cyy...",[]


In [132]:
dev2[dev2['following_list'].isnull()]

Unnamed: 0,user_id,following_list,keyword_list2
579,#28f6ed60737aafe587e3ec4eb53b23db,,
943,#f6317132d17cbc7f365514118ab68ec5,,
1504,#4dfad4d56fa3863b75e8907ff5f8a149,,
1819,#6b3f9ae509f7ce1925b7db79cdc50bfb,,
1847,#270f067fec220d7ab9e36ba69f8b94d4,,
2449,#053e31c7b0443fc51b2dad5703fa43b8,,
3034,#95486f4120b5c8cbc1163e136d83cf5f,,
3128,#401a3c8ac295dc84a7f368d77aab03ab,,
3280,#d2d09d0a95c6929604fe9d2501e441b1,,
4426,#1355cc9f0fbee79c47b9e748d4718717,,
