In [1]:
from glob import glob
from itertools import chain
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shutil
from torchvision import transforms
from torchvision import models
import torch
from torch.autograd import Variable
import torch.nn as nn
from torch.optim import lr_scheduler
from torch import optim
from torchvision.datasets import ImageFolder
from torchvision.utils import make_grid
import time
%matplotlib inline

In [2]:
from tqdm import tqdm

In [3]:
magazine = pd.read_json('data/magazine.json',lines=True)

In [4]:
magazine.shape

(27967, 2)

In [5]:
magazine.head()

Unnamed: 0,id,magazine_tag_list
0,38842,"[브런치북, 육아일기, 대화법, 들려주고픈이야기]"
1,11540,"[tea, food]"
2,11541,[food]
3,11546,"[브런치북, 일상, 시, 사람]"
4,11544,"[감성에세이, 노래, 음악에세이]"


In [6]:
metadata = pd.read_json('data/metadata.json',lines=True)

In [7]:
metadata.shape

(643104, 9)

metadata에서 id는 글의 아이디 ==> train데이타프레임에서 article_id로 불리게됨

metadata에서 user_id가 작가의 아이디

In [8]:
metadata.head()

Unnamed: 0,article_id,display_url,id,keyword_list,magazine_id,reg_ts,sub_title,title,user_id
0,782,https://brunch.co.kr/@bookdb/782,@bookdb_782,"[여행, 호주, 국립공원]",8982,1474944427000,세상 어디에도 없는 호주 Top 10,"사진으로 옮기기에도 아까운, 리치필드 국립공원",@bookdb
1,81,https://brunch.co.kr/@kohwang56/81,@kohwang56_81,"[목련꽃, 아지랑이, 동행]",12081,1463092749000,,[시] 서러운 봄,@kohwang56
2,4,https://brunch.co.kr/@hannahajink/4,@hannahajink_4,[],0,1447997287000,무엇 때문에,무엇을 위해,@hannahajink
3,88,https://brunch.co.kr/@bryceandjuli/88,@bryceandjuli_88,"[감정, 마음, 위로]",16315,1491055161000,,싫다,@bryceandjuli
4,34,https://brunch.co.kr/@mijeongpark/34,@mijeongpark_34,"[유럽여행, 더블린, 아일랜드]",29363,1523292942000,#7. 내 친구의 집은 어디인가,Dubliner#7,@mijeongpark


In [9]:
exdf = metadata[metadata["id"] == "@brunch_151"]

In [10]:
print(exdf["user_id"])

237936    @brunch
Name: user_id, dtype: object


In [11]:
users = pd.read_json('data/users.json',lines=True)

In [12]:
users.shape

(310758, 3)

In [13]:
users.head()

Unnamed: 0,following_list,id,keyword_list
0,"[@perytail, @brunch]",#901985d8bc4c481805c4a4f911814c4a,[]
1,"[@holidaymemories, @wadiz, @sciforus, @dailydu...",#1fd89e9dcfa64b45020d9eaca54e0eed,[]
2,"[@commerceguy, @sunsutu, @kakao-it, @joohoonja...",#1d94baaea71a831e1f33e1c6bd126ed5,[]
3,"[@amberjeon48, @forsy20, @nemotokki, @hawann, ...",#04641c01892b12dc018b1410e4928c0d,[]
4,"[@dwcha7342, @iammento, @kakao-it, @dkam, @ant...",#65bcaff862aadff877e461f54187ab62,[]


In [14]:
read_file_lst = glob('data/read/*')

In [15]:
exclude_file_lst = ['read.tar']

In [16]:
read_df_lst = []
for f in read_file_lst:
    file_name = os.path.basename(f)
    if file_name in exclude_file_lst:
        print(file_name)
    else:
        df_temp = pd.read_csv(f, header=None, names=['raw'])
        df_temp['dt'] = file_name[:8]
        df_temp['hr'] = file_name[8:10]
        df_temp['user_id'] = df_temp['raw'].str.split(' ').str[0]
        df_temp['article_id'] = df_temp['raw'].str.split(' ').str[1:].str.join(' ').str.strip()
        read_df_lst.append(df_temp)

In [17]:
read = pd.concat(read_df_lst)

In [18]:
read.shape

(3507097, 5)

In [19]:
read.tail()

Unnamed: 0,raw,dt,hr,user_id,article_id
1227,#686cbcfd5dfa7e186dc3c28821170a5e @ah11_98 @ah...,20190228,23,#686cbcfd5dfa7e186dc3c28821170a5e,@ah11_98 @ah11_98 @ah11_97 @mothertive_66 @mot...
1228,#0c4c58048bfadedd650e92081b67d811 @brunch_151 ...,20190228,23,#0c4c58048bfadedd650e92081b67d811,@brunch_151 @brunch_151
1229,#3eec960b2ad12fc41ec986032effc8b2 @leewoosview...,20190228,23,#3eec960b2ad12fc41ec986032effc8b2,@leewoosview_186 @leewoosview_189
1230,#1eab0886c0f0f32156f9ab1e5d0fffab @rory_7 @ror...,20190228,23,#1eab0886c0f0f32156f9ab1e5d0fffab,@rory_7 @rory_7
1231,#005be6888ba3f083eed1806ba427cc3a @cliche-clic...,20190228,23,#005be6888ba3f083eed1806ba427cc3a,@cliche-cliche_1 @cliche-cliche_5


In [20]:
def chainer(s):
    return list(chain.from_iterable(s.str.split(' ')))

In [21]:
read_cnt_by_user = read['article_id'].str.split(' ').map(len)

In [22]:
read_raw = pd.DataFrame({'dt': np.repeat(read['dt'], read_cnt_by_user),
                         'hr': np.repeat(read['hr'], read_cnt_by_user),
                         'user_id': np.repeat(read['user_id'], read_cnt_by_user),
                         'article_id': chainer(read['article_id'])})

In [23]:
read_raw.shape

(22110706, 4)

In [24]:
read_raw.head()

Unnamed: 0,dt,hr,user_id,article_id
0,20181001,0,#e208be4ffea19b1ceb5cea2e3c4dc32c,@kty0613_91
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@miamiyoung_31
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@banksalad_49
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@rlfrjsdn_95
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@readme999_140


연속해서 같은 article을 본 경우 read_raw 데이터프레임으로 만들경우 중복되는 raw가 생긴다. 따라서 중복되는 행을 아래를 통해 지워주도록 하자

In [25]:
train_data = read_raw.drop_duplicates()

In [26]:
train_data.shape

(14341063, 4)

In [27]:
train_data.head()

Unnamed: 0,dt,hr,user_id,article_id
0,20181001,0,#e208be4ffea19b1ceb5cea2e3c4dc32c,@kty0613_91
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@miamiyoung_31
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@banksalad_49
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@rlfrjsdn_95
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@readme999_140


일단 유저가 어떤 article을 봤으면 기본점수 2을 부여

In [28]:
train_data["score"] = 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [29]:
train_data.head()

Unnamed: 0,dt,hr,user_id,article_id,score
0,20181001,0,#e208be4ffea19b1ceb5cea2e3c4dc32c,@kty0613_91,2
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@miamiyoung_31,2
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@banksalad_49,2
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@rlfrjsdn_95,2
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@readme999_140,2


In [30]:
train_data = train_data.reset_index().drop(["index"],axis=1)

In [31]:
train_data.head()

Unnamed: 0,dt,hr,user_id,article_id,score
0,20181001,0,#e208be4ffea19b1ceb5cea2e3c4dc32c,@kty0613_91,2
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@miamiyoung_31,2
2,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@banksalad_49,2
3,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@rlfrjsdn_95,2
4,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@readme999_140,2


유저가 본 article이 유저가 팔로우까지 한 작가의 글이었다면 3점을 더 부여

In [32]:
users.head()

Unnamed: 0,following_list,id,keyword_list
0,"[@perytail, @brunch]",#901985d8bc4c481805c4a4f911814c4a,[]
1,"[@holidaymemories, @wadiz, @sciforus, @dailydu...",#1fd89e9dcfa64b45020d9eaca54e0eed,[]
2,"[@commerceguy, @sunsutu, @kakao-it, @joohoonja...",#1d94baaea71a831e1f33e1c6bd126ed5,[]
3,"[@amberjeon48, @forsy20, @nemotokki, @hawann, ...",#04641c01892b12dc018b1410e4928c0d,[]
4,"[@dwcha7342, @iammento, @kakao-it, @dkam, @ant...",#65bcaff862aadff877e461f54187ab62,[]


In [32]:
# for index, raw in train_data.iterrows():
#     articel_id = raw["article_id"]
#     user_id = raw["user_id"]
#     metadata_df = metadata[metadata["id"] == articel_id]
#     if metadata_df["user_id"].tolist() == []:
#         continue
#     else : author_id = metadata_df["user_id"].tolist()[0]
#     user_df = users[users["id"] == user_id]
#     for idx, row in user_df.iterrows():
#         if author_id in row["following_list"]:
#             raw["score"] += 2
#             continue
#     if index%100000 == 0 : print(index)

위의 코드를 통해 +2점을 주려고 했으나 처리속도가 정말정말 느리다

train_data와 metadata를 join시켜서 처리 속도를 빠르게 만들어야겠다.

두 데이타의 칼럼명을 일치시켜주고 합쳐주자

user_id라는 컬럼명은 같지만 다른 의미이므로 author_id라고 바꿔서 저자의 이름을 뜻하게 만들자

In [33]:
metadata = metadata.drop(["article_id"],axis=1)

In [34]:
metadata.rename(columns={'id':'article_id','user_id':'author_id'}, inplace=True)

In [35]:
metadata_single = metadata[['article_id','author_id']]

In [36]:
metadata_single.head()

Unnamed: 0,article_id,author_id
0,@bookdb_782,@bookdb
1,@kohwang56_81,@kohwang56
2,@hannahajink_4,@hannahajink
3,@bryceandjuli_88,@bryceandjuli
4,@mijeongpark_34,@mijeongpark


In [37]:
train_data_merge = pd.merge(train_data,metadata_single,on="article_id",how='left')

In [38]:
train_data_merge.head()

Unnamed: 0,dt,hr,user_id,article_id,score,author_id
0,20181001,0,#e208be4ffea19b1ceb5cea2e3c4dc32c,@kty0613_91,2,@kty0613
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@miamiyoung_31,2,@miamiyoung
2,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@banksalad_49,2,@banksalad
3,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@rlfrjsdn_95,2,@rlfrjsdn
4,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@readme999_140,2,@readme999


merge한 데이터프레임과 users 데이터프레임까지 합쳐줘서 하나의 데이터프레임으로 만들고 score를 관리하자

users의 id칼럼의 이름을 user_id로 바꿔주고 합쳐주자

In [39]:
users.rename(columns={'id':'user_id'}, inplace=True)

In [40]:
users.head()

Unnamed: 0,following_list,user_id,keyword_list
0,"[@perytail, @brunch]",#901985d8bc4c481805c4a4f911814c4a,[]
1,"[@holidaymemories, @wadiz, @sciforus, @dailydu...",#1fd89e9dcfa64b45020d9eaca54e0eed,[]
2,"[@commerceguy, @sunsutu, @kakao-it, @joohoonja...",#1d94baaea71a831e1f33e1c6bd126ed5,[]
3,"[@amberjeon48, @forsy20, @nemotokki, @hawann, ...",#04641c01892b12dc018b1410e4928c0d,[]
4,"[@dwcha7342, @iammento, @kakao-it, @dkam, @ant...",#65bcaff862aadff877e461f54187ab62,[]


In [41]:
train_data_merge2 = pd.merge(train_data_merge,users,on="user_id",how='left')

In [42]:
train_data_merge2["author_id"] = train_data_merge2["author_id"].fillna(0)

In [43]:
train_data_merge2['following_list'] = [[] if type(x) != list else x for x in train_data_merge2['following_list']]

In [44]:
train_data_merge2.head()

Unnamed: 0,dt,hr,user_id,article_id,score,author_id,following_list,keyword_list
0,20181001,0,#e208be4ffea19b1ceb5cea2e3c4dc32c,@kty0613_91,2,@kty0613,"[@kecologist, @darunwrite, @investup, @travie,...",[]
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@miamiyoung_31,2,@miamiyoung,"[@hyunjooje, @posselavaboy, @mupasa324, @minse...","[{'cnt': 1, 'keyword': 'DDU 조건'}, {'cnt': 1, '..."
2,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@banksalad_49,2,@banksalad,"[@hyunjooje, @posselavaboy, @mupasa324, @minse...","[{'cnt': 1, 'keyword': 'DDU 조건'}, {'cnt': 1, '..."
3,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@rlfrjsdn_95,2,@rlfrjsdn,"[@hyunjooje, @posselavaboy, @mupasa324, @minse...","[{'cnt': 1, 'keyword': 'DDU 조건'}, {'cnt': 1, '..."
4,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@readme999_140,2,@readme999,"[@hyunjooje, @posselavaboy, @mupasa324, @minse...","[{'cnt': 1, 'keyword': 'DDU 조건'}, {'cnt': 1, '..."


In [45]:
train_data_merge2.shape

(14341063, 8)

위 데이터프레임을 보면 user_id 의 유저가 --> article_id의 글을 읽었으며 그 글의 점수는 X이고 글을 쓴 사람의 저자 아이디는  author_id이다. 그리고 유저의 팔로우 리스트에 읽은 글을 쓴 저자가 포함되어있다면 score 에 3점을 더해주자.

처음에는 iterrows를 쓰려고했는데 iterrows()는 너무나 느리다 너무나... apply가 빠른 방법이라고 하므로 함수를 만들어서 apply하자.

In [73]:
def score_maker(row):
    author_id = row["author_id"]
    following_list = row["following_list"]
    if author_id in following_list:
        row["score"] += 3
    return row["score"]
    

In [74]:
tqdm.pandas()

In [90]:
score_series = train_data_merge2.progress_apply(score_maker,axis=1)

100%|███████████████████████████████████████████████████████████████████| 14341063/14341063 [11:56<00:00, 20023.62it/s]


In [92]:
train_data_merge2["score"] = score_series

In [93]:
train_data_merge2.head()

Unnamed: 0,dt,hr,user_id,article_id,score,author_id,following_list,keyword_list
0,20181001,0,#e208be4ffea19b1ceb5cea2e3c4dc32c,@kty0613_91,5,@kty0613,"[@kecologist, @darunwrite, @investup, @travie,...",[]
1,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@miamiyoung_31,2,@miamiyoung,"[@hyunjooje, @posselavaboy, @mupasa324, @minse...","[{'cnt': 1, 'keyword': 'DDU 조건'}, {'cnt': 1, '..."
2,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@banksalad_49,2,@banksalad,"[@hyunjooje, @posselavaboy, @mupasa324, @minse...","[{'cnt': 1, 'keyword': 'DDU 조건'}, {'cnt': 1, '..."
3,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@rlfrjsdn_95,2,@rlfrjsdn,"[@hyunjooje, @posselavaboy, @mupasa324, @minse...","[{'cnt': 1, 'keyword': 'DDU 조건'}, {'cnt': 1, '..."
4,20181001,0,#0a3d493f3b2318be80f391eaa00bfd1c,@readme999_140,2,@readme999,"[@hyunjooje, @posselavaboy, @mupasa324, @minse...","[{'cnt': 1, 'keyword': 'DDU 조건'}, {'cnt': 1, '..."


함수로 apply해서 적용하니까 빨라지긴했어도 몇십분이 걸린다...
만든 데이터프레임을 pickle로 저장해놓고 사용해야겠다

pickle은 대용량 데이터 저장에 너무 느리다 hdf를 이용하자

In [96]:
train_data_merge2.to_hdf("merge_data_real.h5",key="df",mode="w")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block1_values] [items->['dt', 'hr', 'user_id', 'article_id', 'author_id', 'following_list', 'keyword_list']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [94]:
exdfs = train_data_merge2[train_data_merge2["score"]==5]

In [95]:
exdfs.shape

(4352204, 8)

In [42]:
p = []

In [43]:
a = 0

In [44]:
a in p

False