In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pylab as plt
import matplotlib
import matplotlib.font_manager as fm

# Use the font file HANDotum.ttf as matplotlib's default font family.
try:
    font_location = "HANDotum.ttf"
    font_name = fm.FontProperties(fname=font_location).get_name()
    matplotlib.rc('font', family=font_name)
except Exception as err:
    # Best-effort: keep going with the default font, but report the cause.
    # `except Exception` (instead of a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate.
    print("폰트 임포트 에러", err)

In [2]:
# Build the Author x BookCode rating pivot table.
# Keep only the first review per (Author, BookCode) pair, then drop rows
# that have no Author.
rv1 = (
    pd.read_csv("Data/review1_pre.csv", index_col=[0])
    .drop_duplicates(subset=['Author', 'BookCode'], keep='first')
)
rv1 = rv1[rv1["Author"].notna()]

# Missing (Author, BookCode) combinations become 0 in the pivot.
rv_book_pivot = rv1.pivot(index='Author', columns='BookCode', values='SumRate').fillna(0)
us_list, book_list = rv_book_pivot.index, rv_book_pivot.columns

In [37]:
import MeCab
import re

# Shared tagger instance; created lazily so that building a tagger is not
# repeated for every sentence.
_MECAB_TAGGER = None


def _get_mecab_tagger():
    """Return the shared ``MeCab.Tagger``, creating it on first use."""
    global _MECAB_TAGGER
    if _MECAB_TAGGER is None:
        _MECAB_TAGGER = MeCab.Tagger()
    return _MECAB_TAGGER


def pos_mecab(sentence):
    """
    Tag ``sentence`` with MeCab and return ``(surface, tag)`` pairs.

    Each line of the tagger output is expected to look like
    ``저\\tNP,*,F,저,Inflect,NP,NP,제/NP/*``: the surface form, a tab, then a
    comma-separated feature list. The tag is the first feature field.

    :param sentence: text handed to ``MeCab.Tagger.parse``
    :return: list of ``(surface, tag)`` tuples in output order; lines
        without a tab (such as the trailing ``EOS`` marker or blank lines)
        are skipped
    """
    out = _get_mecab_tagger().parse(sentence)
    if not out:
        # Nothing came back from the tagger: no tokens.
        return []

    tags = []
    for line in out.split('\n'):
        # Split on the last tab; everything after it is the feature list.
        surface, tab, features = line.rpartition('\t')
        if not tab:
            continue
        # Take the first feature field regardless of how many fields follow,
        # so tokens are not dropped when the field count differs.
        tags.append((surface, features.split(',', 1)[0]))
    return tags

def token_mecab(sentence):
    """Tokenize ``sentence``: surface forms from ``pos_mecab``, tags discarded."""
    surfaces = []
    for pair in pos_mecab(sentence):
        surface, _tag = pair
        surfaces.append(surface)
    return surfaces

In [38]:
# Try out scikit-learn's TF-IDF vectorizer on the reviews of one book.
from sklearn.feature_extraction.text import TfidfVectorizer

rv_groupedby_isbn = rv1.groupby(["BookCode"])
isbn = 9791136242303
is_target_book = rv1["BookCode"] == isbn
rvs = rv1.loc[is_target_book, "Content"]

# Fit on this book's reviews, then show the weight matrix and the vocabulary.
tfidfv = TfidfVectorizer(tokenizer=token_mecab)
tfidfv.fit(rvs)
single_book_tfidf = tfidfv.transform(rvs)
print(single_book_tfidf.toarray())
print(tfidfv.vocabulary_)

[[0.32764719 0.0819118  0.         ... 0.09718985 0.06716885 0.        ]
 [0.         0.         0.         ... 0.12328372 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.17613178 0.0304316  0.03711105]
 [0.         0.         0.         ... 0.05493542 0.         0.        ]
 [0.         0.         0.19651456 ... 0.         0.         0.        ]]
{'날씨': 63, '의': 292, '아이': 238, '2': 10, '권': 44, '구매': 41, '했': 393, '습니다': 220, '신카이': 226, '마코토': 136, '감독': 27, '갓갓': 28, '작품': 317, '!': 0, '!###': 1, '소년': 212, '과': 40, '소녀': 211, '는': 76, '도시': 98, '에서': 263, '간신히': 24, '보금자리': 173, '를': 130, '찾아낸다': 351, '.': 7, '하지만': 382, ',': 6, '맑음': 146, '뒤': 111, '흐림': 399, '이': 294, '찾아오': 352, '듯': 115, '두': 109, '사람': 193, '은': 287, '세계': 207, '비밀': 188, '을': 289, '마주': 132, '하': 380, '게': 33, '되': 103, '는데': 78, '…': 18, '외딴': 277, '섬': 206, '가출': 22, '한': 384, '호다': 396, '카': 364, '가': 21, '도쿄': 99, '만



In [36]:
# Build one merged review document per book,
# so that TF-IDF similarity can be compared between books.
merged_docs = []

for isbn in book_list:
    rvs = rv1.loc[rv1["BookCode"] == isbn, "Content"]
    # Reviews of the same book are separated by a blank line.
    merged_docs.append('\n\n'.join(rvs.tolist()))

book_rvs = pd.DataFrame({"ReviewDoc": merged_docs}, index=book_list)
book_rvs.head()

Unnamed: 0_level_0,ReviewDoc
BookCode,Unnamed: 1_level_1
8801748000000.0,"에르베튈레, 프랑스 일러스터의 책입니다. 예술의전당 전시회 보러 아이와 오가면, 알..."
8809124000000.0,책값이 왜 9900원 막 이래가지고 ...그냥 만원 하지.. 진짜 장사심보100원끼...
8809255000000.0,아이들이 좋아하는 흔한남매!떨어질 줄 모르는 유튜브 인기와출간 즉시 베스트셀러가 되...
8809255000000.0,힘든 코로나시대에 집콕생활에 힘들어 하는 아홉살난 남자아이조카를 위해 사줬습니다. ...
8809264000000.0,카드러버 아이와 함께 감정도 배우고 인사도 배우고 여러 표현법을 놀면서 익히니 재밌...


In [39]:
# Fit TF-IDF on the per-book review documents.
from sklearn.feature_extraction.text import TfidfVectorizer

review_docs = book_rvs["ReviewDoc"]
tfidfv = TfidfVectorizer(tokenizer=token_mecab)
tfidfv = tfidfv.fit(review_docs)



NameError: name 'ReviewDoc' is not defined

In [40]:
# Show the dense TF-IDF matrix of the book documents and the fitted vocabulary.
book_tfidf = tfidfv.transform(book_rvs["ReviewDoc"])
print(book_tfidf.toarray())
print(tfidfv.vocabulary_)

[[0.05519465 0.         0.00253584 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.05926507 0.01012948 0.         ... 0.         0.         0.        ]
 ...
 [0.05751908 0.         0.         ... 0.         0.         0.        ]
 [0.01114279 0.02094957 0.02969249 ... 0.         0.         0.        ]
 [0.01584768 0.00145343 0.00102999 ... 0.         0.         0.        ]]


In [41]:
# Arrange the TF-IDF weights as a matrix with one row per book.
tfidf_values = tfidfv.transform(book_rvs["ReviewDoc"]).toarray()
tfidf_mat = pd.DataFrame(tfidf_values, index=book_list)
tfidf_mat

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,76116,76117,76118,76119,76120,76121,76122,76123,76124,76125
BookCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8.801748e+12,0.055195,0.000000,0.002536,0.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8.809124e+12,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8.809255e+12,0.059265,0.010129,0.000000,0.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8.809255e+12,0.051259,0.008380,0.005939,0.000000,0.0000,0.003405,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8.809264e+12,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9.791197e+12,0.020531,0.002969,0.000000,0.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9.791197e+12,0.102430,0.019258,0.004549,0.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9.791197e+12,0.057519,0.000000,0.000000,0.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9.791197e+12,0.011143,0.020950,0.029692,0.000000,0.0000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Cosine similarity
def cos_sim(A, B):
    """
    Return the cosine similarity ``A . B / (|A| * |B|)`` of two vectors.

    :param A: first vector (array-like accepted by ``np.dot`` / ``np.linalg.norm``)
    :param B: second vector, same length as ``A``
    :return: the cosine similarity, or ``0.0`` when either vector has zero
        norm (the ratio would otherwise be 0/0, i.e. NaN)
    """
    denom = np.linalg.norm(A) * np.linalg.norm(B)
    if denom == 0:
        # A zero vector has no direction; treat it as similar to nothing.
        return 0.0
    return np.dot(A, B) / denom

In [43]:
# Pairwise cosine similarity between the TF-IDF rows of all books.
# All dot products come from one matrix product, then each entry is divided
# by the product of the two row norms, instead of looping over every pair.
book_num = len(book_list)
tfidf_values = tfidf_mat.to_numpy(dtype=float)
pairwise_dots = tfidf_values.dot(tfidf_values.T)

row_norms = np.linalg.norm(tfidf_values, axis=1)
# An all-zero row has norm 0; dividing by 1 instead leaves its similarities
# at 0 (its dot products are all 0) rather than producing NaN.
safe_norms = np.where(row_norms > 0, row_norms, 1.0)

book_sim_table_tdidf = pd.DataFrame(
    pairwise_dots / np.outer(safe_norms, safe_norms),
    index=book_list,
    columns=book_list,
)

book_sim_table_tdidf

BookCode,8.801748e+12,8.809124e+12,8.809255e+12,8.809255e+12,8.809264e+12,8.809333e+12,8.809417e+12,8.809470e+12,8.809475e+12,8.809475e+12,...,9.791197e+12,9.791197e+12,9.791197e+12,9.791197e+12,9.791197e+12,9.791197e+12,9.791197e+12,9.791197e+12,9.791197e+12,1.844674e+19
BookCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8.801748e+12,1.000000,0.358897,0.250314,0.301908,0.284270,0.264786,0.384795,0.216382,0.235871,0.401326,...,0.176632,0.160687,0.308372,0.484070,0.272585,0.428101,0.453210,0.291172,0.437059,0.513463
8.809124e+12,0.358897,1.000000,0.193284,0.268686,0.235422,0.260729,0.427193,0.279819,0.513399,0.326834,...,0.149713,0.144250,0.295333,0.439004,0.279755,0.393967,0.440397,0.265846,0.472506,0.498533
8.809255e+12,0.250314,0.193284,1.000000,0.375990,0.179423,0.182414,0.249786,0.142929,0.132850,0.224319,...,0.107309,0.113695,0.177297,0.272485,0.170212,0.251416,0.276532,0.191416,0.269105,0.305944
8.809255e+12,0.301908,0.268686,0.375990,1.000000,0.245210,0.426746,0.311674,0.161515,0.169324,0.249118,...,0.135999,0.130600,0.237063,0.364668,0.209233,0.330526,0.369365,0.218907,0.325425,0.389720
8.809264e+12,0.284270,0.235422,0.179423,0.245210,1.000000,0.208206,0.290119,0.143543,0.180981,0.212524,...,0.127159,0.149038,0.200752,0.312391,0.190524,0.351237,0.382433,0.194473,0.280624,0.344962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9.791197e+12,0.428101,0.393967,0.251416,0.330526,0.351237,0.313397,0.449982,0.234738,0.242921,0.365680,...,0.198656,0.223315,0.328261,0.503451,0.292434,1.000000,0.622186,0.336889,0.487264,0.555632
9.791197e+12,0.453210,0.440397,0.276532,0.369365,0.382433,0.323423,0.469172,0.224988,0.222021,0.374864,...,0.201172,0.213361,0.335175,0.533702,0.329313,0.622186,1.000000,0.323409,0.486996,0.588723
9.791197e+12,0.291172,0.265846,0.191416,0.218907,0.194473,0.246560,0.293911,0.188201,0.182853,0.289188,...,0.168869,0.150583,0.239279,0.361502,0.233759,0.336889,0.323409,1.000000,0.399932,0.401891
9.791197e+12,0.437059,0.472506,0.269105,0.325425,0.280624,0.349740,0.452531,0.249032,0.293731,0.419188,...,0.223504,0.176463,0.371636,0.557778,0.507336,0.487264,0.486996,0.399932,1.000000,0.648194


In [44]:
# Persist the TF-IDF similarity table as CSV.
book_sim_table_tdidf.to_csv(path_or_buf="Data/sim_tfidf.csv")

# 모델 구축 및 평가
## 점수 예측식 만들기

In [149]:
# Load the two book-to-book similarity tables from CSV.
book_sim_table = pd.read_csv("Data/sim_normal.csv", index_col=[0])
book_sim_table_tfidf = pd.read_csv("Data/sim_tfidf.csv", index_col=[0])

# Cast the column labels to float64
# (presumably so they match the float BookCode labels used for lookups; confirm).
book_sim_table.columns = book_sim_table.columns.astype('float64')
book_sim_table_tfidf.columns = book_sim_table_tfidf.columns.astype('float64')

In [109]:
# Find the top-n nearest neighbours
def get_nearest_books(sim, isbn, n=10) :
    """
    Return the ``n`` books most similar to ``isbn``.

    :param sim: square similarity table (DataFrame); ``sim.loc[isbn, :]`` is
        the row of similarities between ``isbn`` and every book
    :param isbn: label of the book whose neighbours are wanted
    :param n: maximum number of neighbours to return
    :return: Series of similarities, sorted descending, at most ``n`` long,
        indexed by neighbour label; ``isbn`` itself is never included
    """
    row = sim.loc[isbn, :]
    # Remove the book itself by label. Skipping the first sorted entry is only
    # right when the self-similarity is the strict maximum; with ties, NaN or
    # a zeroed diagonal it would discard a real neighbour instead.
    neighbours = row.drop(labels=isbn, errors="ignore")
    n_book = neighbours.sort_values(ascending=False).iloc[:n]
    return n_book

In [154]:
def sim_comb_method(a, b):
    """Combine two similarities by taking their arithmetic mean."""
    total = a + b
    return total / 2.0

# Predict every (user, book) score as the similarity-weighted average of the
# user's ratings on the books most similar to the target book.
val = np.zeros(rv_book_pivot.shape)
predicted_scores = pd.DataFrame(val, index=us_list, columns=book_list)
k = 10

# The neighbour set and the combined similarities depend only on the book,
# so they are computed once per book and applied to all users together.
for j in range(predicted_scores.shape[1]):
    isbn = book_list[j]
    # Both tables are the ones loaded from CSV in this section, so the cell
    # does not rely on a table left in memory by an earlier section.
    nearest_books_a = get_nearest_books(book_sim_table, isbn, k)
    nearest_books_b = get_nearest_books(book_sim_table_tfidf, isbn, k)

    # Take neighbours rank by rank from both lists until at least k distinct
    # books have been collected (or a list runs out).
    sim_books = set()
    max_rank = min(k, len(nearest_books_a), len(nearest_books_b))
    n = 0
    while n < max_rank and len(sim_books) < k:
        sim_books.add(nearest_books_a.index[n])
        sim_books.add(nearest_books_b.index[n])
        n += 1
    sim_books = list(sim_books)

    sim_a = book_sim_table.loc[isbn, sim_books]
    sim_b = book_sim_table_tfidf.loc[isbn, sim_books]
    sim_comb = sim_comb_method(sim_a, sim_b)

    # Ratings of all users on the neighbour books (rows: users).
    rv_scores = rv_book_pivot.loc[:, sim_books]
    rate_sum = rv_scores.mul(sim_comb, axis="columns").sum(axis=1)
    # Only neighbours the user actually rated (score > 0) count in the weight.
    rated = (rv_scores > 0).astype(float)
    sim_sum = rated.mul(sim_comb, axis="columns").sum(axis=1)

    # A user with no positive total weight keeps a prediction of 0.
    predicted = (rate_sum / sim_sum).where(sim_sum > 0, 0.0)
    predicted_scores.iloc[:, j] = predicted.to_numpy()

predicted_scores

BookCode,8.801748e+12,8.809124e+12,8.809255e+12,8.809255e+12,8.809264e+12,8.809333e+12,8.809417e+12,8.809470e+12,8.809475e+12,8.809475e+12,...,9.791197e+12,9.791197e+12,9.791197e+12,9.791197e+12,9.791197e+12,9.791197e+12,9.791197e+12,9.791197e+12,9.791197e+12,1.844674e+19
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'_'*,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
**01 22 2020 9:45PM**,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
**09 4 2018 1:15PM**,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
**09 12 2017 10:14AM**,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
**10 27 2017 4:24PM**,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
히또리도리돌,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0
히야신스,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
히이이익,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
히키,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [155]:
def f1_with_rating(real_score, predicted_score, rate_threshold=8.0):
    """
    Score predictions against real ratings and return ``(f1, f0.5)``.

    A cell counts as positive when its value is ``>= rate_threshold``.
    Cells are visited column by column, rows top to bottom. For every cell
    whose prediction is positive, the ratio (positive predictions so far) /
    (predictions ``> 0`` so far) is accumulated; the mean of these ratios is
    the value printed as "MAP". Recall is (cells positive in both tables) /
    (cells positive in ``real_score``). Both values are printed.

    :param real_score: DataFrame of real ratings
    :param predicted_score: DataFrame of predicted ratings, aligned with
        ``real_score`` by position
    :param rate_threshold: minimum value treated as positive
    :return: tuple ``(f1, f0d5)`` combining "MAP" and recall; every ratio
        whose denominator is zero is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``
    """
    n_rows, n_cols = real_score.shape
    # Column-major flattening gives the visiting order described above; the
    # running ratio below depends on that order.
    real = real_score.to_numpy().ravel(order="F")
    pred = predicted_score.iloc[:n_rows, :n_cols].to_numpy().ravel(order="F")

    real_pos = real >= rate_threshold
    pred_pos = pred >= rate_threshold

    tr = int(real_pos.sum())                # True
    tp = int((real_pos & pred_pos).sum())   # True Positive
    rec_num = int(pred_pos.sum())           # recommended number

    # Running counts, each including the current cell.
    ins_running = np.cumsum(pred > 0)       # prediction number (anything predicted)
    rec_running = np.cumsum(pred_pos)
    hit_rec = rec_running[pred_pos]
    hit_ins = ins_running[pred_pos]
    # The denominator can only be 0 when rate_threshold <= 0; such cells add 0.
    ratios = np.divide(
        hit_rec,
        hit_ins,
        out=np.zeros(hit_rec.shape, dtype=float),
        where=hit_ins > 0,
    )

    maprec = float(ratios.sum()) / rec_num if rec_num else 0.0
    recall = tp / tr if tr else 0.0
    print("MAP:{0} Recall:{1}".format(maprec, recall))

    f1_den = maprec + recall
    f0d5_den = 0.25 * maprec + recall
    f1 = 2 * maprec * recall / f1_den if f1_den > 0 else 0.0
    f0d5 = 1.25 * maprec * recall / f0d5_den if f0d5_den > 0 else 0.0
    return f1, f0d5
    
# Score the predictions against the real ratings, with 9.0 as the positive threshold.
f1_with_rating(
    real_score=rv_book_pivot,
    predicted_score=predicted_scores,
    rate_threshold=9.0,
)

MAP:0.7674458756358944 Recall:0.5420192588269623


(0.6353287823468593, 0.7085116643975466)

In [156]:
# Persist the predicted score table as CSV.
predicted_scores.to_csv(path_or_buf="Data/pred_tfidf_comb1.csv")

## 결과 분석

이제부터는 나온 결과의 원인을 분석하고, 대안을 논해볼 것이다.