In [1]:
import pandas as pd
import numpy as np
from db import get_pd_from_table, connect_db
import os

DATABASE_URL = os.environ.get('DATABASE_URL', 'sqlite:///data.db')
engine, connection, metadata = connect_db(DATABASE_URL)

In [2]:
ratings = pd.read_csv('data/watcha_ratings.csv', encoding='utf-8')
ratings.drop(['isbn'], axis=1, inplace=True)
ratings.head()

Unnamed: 0,username,rate,user_id,book_id
0,백준,7.0,4811,76044
1,MIN,4.0,865,76044
2,민지,6.0,4404,76044
3,ES,8.0,326,76044
4,귀를 기울이면,6.0,2786,76044


In [3]:
item_meta = get_pd_from_table('books', engine, connection, metadata)
item_meta.drop(['isbn', 'publisher', 'pubDate', 'img', 'rate', 'bestseller', 'sense'], axis=1, inplace=True)
item_meta.replace(np.nan, '', inplace=True)

# 제목, 저자, 줄거리, 장르 합치기
summaries = []
for book in item_meta.values:
    summary = book[1]
    summary += book[2]
    summary += book[3]
    summary += book[4]
    summaries.append(summary)
item_meta['summary'] = summaries

item_meta.dropna(axis=0, inplace=True)
item_meta.head()

Unnamed: 0,id,title,author,summary,genre
0,1,리스트 고독한가운데 신의 축복 (984),마스트미디어 편집부 (지은이),리스트 고독한가운데 신의 축복 (984)마스트미디어 편집부 (지은이)예술/대중문화,예술/대중문화
1,2,거짓말이다 + 꽃다발 상품권 세트,김탁환 (지은이),거짓말이다 + 꽃다발 상품권 세트김탁환 (지은이)김탁환 작가의 &lt;거짓말이다&g...,소설/시/희곡
2,3,[헌금봉투] 주일헌금 201 (1속50매) - VJ-1311-201,비전북 편집부 (지은이),[헌금봉투] 주일헌금 201 (1속50매) - VJ-1311-201비전북 편집부 (...,종교/역학
3,4,[헌금봉투] 주일헌금 202 (1속50매) - VJ-1311-202,비전북 편집부 (지은이),[헌금봉투] 주일헌금 202 (1속50매) - VJ-1311-202비전북 편집부 (...,종교/역학
4,5,엘르 Elle E형 2022.8 (표지 : NCT 재현) (부록없음) - 주요기사 ...,허스트중앙 편집부 (지은이),엘르 Elle E형 2022.8 (표지 : NCT 재현) (부록없음) - 주요기사 ...,잡지


In [4]:
user_meta = get_pd_from_table('users', engine, connection, metadata)
user_meta.drop(['username', 'password', 'name', 'created_dt', 'interest'], axis=1, inplace=True)
user_meta.dropna(axis=0, inplace=True)
user_meta.head()

Unnamed: 0,id,age,sex
0,1,20.0,M


In [5]:
book_list = get_pd_from_table('book_list', engine, connection, metadata)
book_list.drop(['status', 'review', 'created_dt', 'modified_dt'], axis=1, inplace=True)
book_list.dropna(axis=0, inplace=True)

new_book_list = None
for i in user_meta['id']:
    temp_book_list = book_list[book_list.user_id == i]
    if new_book_list is None:
        new_book_list = temp_book_list
    else:
        new_book_list = pd.concat([new_book_list, temp_book_list])
new_book_list

Unnamed: 0,user_id,book_id,rate
1,1,6469,10.0
4,1,7361,7.0
5,1,8519,5.0
6,1,15327,7.0
8,1,29405,6.0
9,1,33215,7.0
10,1,36021,9.0
12,1,43342,5.0
14,1,57505,9.0
15,1,57859,5.0


In [6]:
# 실제 db에 있는 유저의 최대 id값 가져오기
max_user_id = user_meta.id.max()

# 유저 고유 id 부여
ratings = ratings.sort_values(by='username', ascending=True)

user_id = max_user_id + 1
temp = None
result_id = []
for i in range(len(ratings)):
    if temp is None:
        temp = ratings.iloc[i]['username']
        user_id = user_id
        result_id.append(user_id)
    else:
        if temp == ratings.iloc[i]['username']:
            result_id.append(user_id)
        else:
            temp = ratings.iloc[i]['username']
            user_id += 1
            result_id.append(user_id)
ratings['user_id'] = result_id
ratings.drop(['username'], axis=1, inplace=True)
ratings.head()

Unnamed: 0,rate,user_id,book_id
21846,7.0,2,14918
3623,7.0,3,67442
21743,9.0,3,14947
27763,8.0,4,6636
24509,10.0,4,9780


In [7]:
# ratings의 유저를 임의로 나이, 성별 부여
# 10대 1, 20~30대 2, 40~50대 3
# user_id, 나이, 성별 순
num = ratings.user_id.nunique() // 3
user_id_list = ratings.user_id.unique()
data = []
for i, user_id in enumerate(user_id_list):
    if i < num:
        data.append((user_id, 1, 0))
    elif i < 2 * num:
        data.append((user_id, 2, 1))
    else:
        data.append((user_id, 3, 0))
df = pd.DataFrame(data, columns=['id', 'age', 'sex'])
df.head()

Unnamed: 0,id,age,sex
0,2,1,0
1,3,1,0
2,4,1,0
3,5,1,0
4,6,1,0


In [8]:
# 실제 users 테이블의 값도 수정
user_meta_age = []
for age in user_meta['age']:
    if age < 20:
        user_meta_age.append(1)
    elif age < 40:
        user_meta_age.append(2)
    else:
        user_meta_age.append(3)
user_meta['age'] = user_meta_age

user_meta_sex = []
for sex in user_meta['sex']:
    if sex == 'F':
        user_meta_sex.append(0)
    else:
        user_meta_sex.append(1)
user_meta['sex'] = user_meta_sex

user_meta.head()

Unnamed: 0,id,age,sex
0,1,2,1


In [9]:
# ratings에서 부여한 user 정보 합치기
user_meta = pd.concat([user_meta, df], ignore_index=True)
user_meta.rename(columns={'id':'user_id'}, inplace=True)
user_meta.head()

Unnamed: 0,user_id,age,sex
0,1,2,1
1,2,1,0
2,3,1,0
3,4,1,0
4,5,1,0


In [10]:
# 크롤링 평점 데이터랑 실제 평점 데이터 합치기
ratings = pd.concat([ratings, new_book_list])
ratings = ratings.sort_values(by='user_id', ascending=True)
ratings.head(30)

Unnamed: 0,rate,user_id,book_id
17,7.0,1,63544
14,9.0,1,57505
12,5.0,1,43342
10,9.0,1,36021
9,7.0,1,33215
8,6.0,1,29405
6,7.0,1,15327
5,5.0,1,8519
4,7.0,1,7361
1,10.0,1,6469


In [11]:
# ratings에 있는 책과 item_meta에 겹치는 책만 남기고 삭제
new_item_meta = None
for i in ratings.book_id.unique():
    temp_item_meta = item_meta[item_meta.id == i]
    if new_item_meta is None:
        new_item_meta = temp_item_meta
    else:
        new_item_meta = pd.concat([new_item_meta, temp_item_meta])
item_meta = new_item_meta
item_meta.head()


Unnamed: 0,id,title,author,summary,genre
63437,63544,우리가 사랑한 빵집 성심당 - 모두가 행복한 경제,김태훈 (지은이),우리가 사랑한 빵집 성심당 - 모두가 행복한 경제김태훈 (지은이)교황의 식탁을 위해...,경제경영
57408,57505,"그릿 GRIT (100쇄 기념 리커버 에디션) - IQ, 재능, 환경을 뛰어넘는 열...","앤절라 더크워스 (지은이), 김미정 (옮긴이)","그릿 GRIT (100쇄 기념 리커버 에디션) - IQ, 재능, 환경을 뛰어넘는 열...",자기계발
43268,43342,미움받을 용기 (반양장) - 자유롭고 행복한 삶을 위한 아들러의 가르침,"기시미 이치로, 고가 후미타케 (지은이), 전경아 (옮긴이), 김정운 (감수)",미움받을 용기 (반양장) - 자유롭고 행복한 삶을 위한 아들러의 가르침기시미 이치로...,인문학
35960,36021,파인만의 여섯가지 물리 이야기 - 보급판,"박병철 (옮긴이), 폴 데이비스 (서문)","파인만의 여섯가지 물리 이야기 - 보급판박병철 (옮긴이), 폴 데이비스 (서문)칼텍...",과학
33159,33215,연금술사,"파울로 코엘료 (지은이), 최정수 (옮긴이)","연금술사파울로 코엘료 (지은이), 최정수 (옮긴이)세상을 두루두루 여행하기 위해 양...",소설/시/희곡


In [12]:
from gensim.models import Word2Vec
from konlpy.tag import Mecab
from tqdm import tqdm

# 불용어 가져오기
stopwords = []
with open("data/hangul_stopword.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        line = line.strip()  # 줄 끝의 줄 바꿈 문자를 제거한다.
        line = line.replace('\n', '')
        stopwords.append(line)

# summary 전처리
item_meta['summary'] = item_meta['summary'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "")
mecab = Mecab()
corpus_list = []
for index, row in tqdm(item_meta.iterrows(), total=len(item_meta)):
    text = row['summary']
    tokenized_sentence = mecab.morphs(text)
    stopwords_removed_sentence = [
        word for word in tokenized_sentence if not word in stopwords]  # 불용어 제거
    corpus_list.append(stopwords_removed_sentence)
item_meta['summary'] = corpus_list

# summary 내용 추가 학습
word2vec_model = Word2Vec.load('data/ko.bin')
word2vec_model.wv.save_word2vec_format('data/ko.bin.gz', binary=True)
word2vec_model = Word2Vec(size=200, window=3, min_count=2, workers=8)
word2vec_model.build_vocab(corpus_list)
word2vec_model.intersect_word2vec_format(
    'data/ko.bin.gz', lockf=1.0, binary=True)
word2vec_model.train(
    corpus_list, total_examples=word2vec_model.corpus_count, epochs=20)

word2vec_model.wv.most_similar('아이디어')

  item_meta['summary'] = item_meta['summary'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "")
100%|██████████| 6036/6036 [00:01<00:00, 3727.09it/s]


[('인사이트', 0.5170837044715881),
 ('노하우', 0.5130617022514343),
 ('결과물', 0.5025250911712646),
 ('키워드', 0.49221381545066833),
 ('도구', 0.4866131544113159),
 ('이미지', 0.48651403188705444),
 ('실마리', 0.478988915681839),
 ('컨셉', 0.4767146110534668),
 ('과제', 0.47194552421569824),
 ('커뮤니티', 0.4640359878540039)]

In [13]:
def get_document_vectors(document_list, word2vec_model):
    document_embedding_list = []

    # 각 문서에 대해서
    for line in document_list:
        doc2vec = None
        count = 0
        for word in line:
            if word in word2vec_model.wv.vocab:
                count += 1
                # 해당 문서에 있는 모든 단어들의 벡터값을 더한다.
                if doc2vec is None:
                    doc2vec = word2vec_model[word]
                else:
                    doc2vec = doc2vec + word2vec_model[word]

        if doc2vec is not None:
            # 단어 벡터를 모두 더한 벡터의 값을 문서 길이로 나눠준다.
            doc2vec = doc2vec / count
            document_embedding_list.append(doc2vec)

    # 각 문서에 대한 문서 벡터 리스트를 리턴
    return document_embedding_list

In [14]:
# summary 문서 벡터 추출
document_embedding_list = get_document_vectors(item_meta['summary'], word2vec_model)
item_meta['summary_vec'] = document_embedding_list

# type tuple로 변환 (unhashable type 방지)
summary_vec = []
for item in item_meta['summary_vec']:
    summary_vec.append(tuple(item))
item_meta['summary_vec'] = summary_vec
item_meta.rename(columns={'id':'book_id'}, inplace = True)
item_meta.reset_index(drop=True, inplace=True)
item_meta.head()

  doc2vec = word2vec_model[word]
  doc2vec = doc2vec + word2vec_model[word]


Unnamed: 0,book_id,title,author,summary,genre,summary_vec
0,63544,우리가 사랑한 빵집 성심당 - 모두가 행복한 경제,김태훈 (지은이),"[사랑, 빵집, 성심, 당, 모두, 행복, 경제, 김태훈, 지은, 교황, 의, 식탁...",경제경영,"(0.069265336, -0.35880974, 0.44103593, 0.13231..."
1,57505,"그릿 GRIT (100쇄 기념 리커버 에디션) - IQ, 재능, 환경을 뛰어넘는 열...","앤절라 더크워스 (지은이), 김미정 (옮긴이)","[그릿, 쇄, 기념, 리, 커버, 에디션, 재능, 환경, 을, 뛰어넘, 는, 열정,...",자기계발,"(-0.42248136, -0.24336463, 0.42332086, -0.0089..."
2,43342,미움받을 용기 (반양장) - 자유롭고 행복한 삶을 위한 아들러의 가르침,"기시미 이치로, 고가 후미타케 (지은이), 전경아 (옮긴이), 김정운 (감수)","[미움, 을, 용기, 반양장, 자유, 롭, 고, 행복, 삶, 을, 위한, 아들러, ...",인문학,"(0.11949557, -0.5288523, 0.54549366, 0.2884034..."
3,36021,파인만의 여섯가지 물리 이야기 - 보급판,"박병철 (옮긴이), 폴 데이비스 (서문)","[파인만, 의, 여섯, 물리, 이야기, 보급판, 박병철, 옮긴, 폴, 데이비스, 서...",과학,"(-0.1206058, -0.1867564, 0.21886791, -0.035711..."
4,33215,연금술사,"파울로 코엘료 (지은이), 최정수 (옮긴이)","[연금술사, 파울로, 코엘료, 지은, 최정수, 옮긴, 세상, 을, 두루두루, 여행,...",소설/시/희곡,"(-0.036706124, -0.30730042, 0.26515332, 0.0149..."


In [15]:
item_meta.reset_index(drop=True, inplace=True)
user_meta.reset_index(drop=True, inplace=True)
ratings.reset_index(drop=True, inplace=True)

In [16]:
ratings.head()

Unnamed: 0,rate,user_id,book_id
0,7.0,1,63544
1,9.0,1,57505
2,5.0,1,43342
3,9.0,1,36021
4,7.0,1,33215


In [17]:
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from scipy.io import mmwrite

ratings_source = [(ratings['user_id'][i],
                    ratings['book_id'][i],
                    ratings['rate'][i]) for i in range(ratings.shape[0])]

item_meta = item_meta[['book_id', 'title', 'author', 'genre', 'summary_vec']]
item_features_source = [(item_meta['book_id'][i],
                        [item_meta['summary_vec'][i]]) for i in range(item_meta.shape[0])]

user_meta = user_meta[['user_id', 'age', 'sex']]
user_features_source = [(user_meta['user_id'][i],
                        [user_meta['age'][i],
                        user_meta['sex'][i]]) for i in range(user_meta.shape[0])]



In [18]:
dataset = Dataset()
dataset.fit(users=ratings['user_id'].unique(),
            items=ratings['book_id'].unique(),
            user_features=user_meta[user_meta.columns[1:]].values.flatten(),
            item_features=item_meta[item_meta.columns[1:]].values.flatten()
            )

In [19]:
interactions, weights = dataset.build_interactions(ratings_source)
user_features = dataset.build_user_features(user_features_source)
item_features = dataset.build_item_features(item_features_source)

# Save
mmwrite('data/interactions.mtx', interactions)
mmwrite('data/item_features.mtx', item_features)
mmwrite('data/user_features.mtx', user_features)
mmwrite('data/weights.mtx', weights)

# Split Train, Test data
train, test = random_train_test_split(interactions, test_percentage=0.1)
train, test = train.tocsr().tocoo(), test.tocsr().tocoo()
train_weights = train.multiply(weights).tocoo()

In [20]:
from hyperopt import fmin, hp, tpe, Trials

# Define Search Space
trials = Trials()
space = [hp.choice('no_components', range(10, 50, 10)),
         hp.uniform('learning_rate', 0.01, 0.05)]

In [28]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score
# Define Objective Function
def objective(params):
    no_components, learning_rate = params

    model = LightFM(no_components=no_components,
                    learning_schedule='adagrad',
                    loss='warp',
                    learning_rate=learning_rate,
                    random_state=0)

    model.fit(interactions=train,
              item_features=item_features,
              sample_weight=train_weights,
              epochs=10,
              verbose=False)

    test_precision = precision_at_k(model, test, k=5, item_features=item_features).mean()
    print("no_comp: {}, lrn_rate: {:.5f}, precision: {:.5f}".format(
      no_components, learning_rate, test_precision))
    # test_auc = auc_score(model, test, item_features=item_features).mean()
    output = -test_precision

    if np.abs(output+1) < 0.01 or output < -1.0:
        output = 0.0

    return output

In [22]:
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials)
print(best_params)

no_comp: 20, lrn_rate: 0.04179, precision: 0.01157    
no_comp: 20, lrn_rate: 0.02767, precision: 0.01128                                 
no_comp: 20, lrn_rate: 0.02036, precision: 0.01110                                 
no_comp: 10, lrn_rate: 0.02241, precision: 0.00806                                 
no_comp: 30, lrn_rate: 0.02831, precision: 0.01252                                 
no_comp: 40, lrn_rate: 0.01686, precision: 0.01081                                 
no_comp: 30, lrn_rate: 0.03078, precision: 0.01271                                 
no_comp: 40, lrn_rate: 0.02074, precision: 0.01157                                 
no_comp: 30, lrn_rate: 0.02252, precision: 0.01176                                 
no_comp: 30, lrn_rate: 0.04234, precision: 0.01366                                 
100%|██████████| 10/10 [00:10<00:00,  1.02s/trial, best loss: -0.013655760325491428]
{'learning_rate': 0.04234259642423113, 'no_components': 2}


In [34]:
model = LightFM(no_components=best_params['no_components'],
                learning_schedule='adagrad',
                loss='warp',
                learning_rate=best_params['learning_rate'],
                random_state=2022)

model.fit(interactions=train,
            item_features=item_features,
            user_features=user_features,
            sample_weight=train_weights,
            epochs=10,
            verbose=False)

<lightfm.lightfm.LightFM at 0x17f1e1c40>

In [35]:
score = auc_score(model, interactions, item_features=item_features, user_features=user_features).mean()
score

0.76909983

In [52]:
print(type(item_meta['book_id'].values.tolist()))
scores = model.predict(1,
                        item_ids=item_meta['book_id'].values.tolist(),
                        item_features=item_features,
                        user_features=user_features
                        )
scores

<class 'list'>


Exception: Number of item feature rows does not equal the number of items

In [27]:
# Find Similar Items
item_biases, item_embeddings = model.get_item_representations(features=item_features)

def make_best_items_report(item_embeddings, book_id, num_search_items=10):
    item_id = book_id - 1

    # Cosine similarity
    scores = item_embeddings.dot(item_embeddings[item_id])  # (10000, )
    item_norms = np.linalg.norm(item_embeddings, axis=1)    # (10000, )
    item_norms[item_norms == 0] = 1e-10
    scores /= item_norms

    # best: score가 제일 높은 item의 id를 num_search_items 개 만큼 가져온다.
    best = np.argpartition(scores, -num_search_items)[-num_search_items:]
    similar_item_id_and_scores = sorted(zip(best, scores[best] / item_norms[item_id]),
                                        key=lambda x: -x[1])

    # Report를 작성할 pandas dataframe
    best_items = pd.DataFrame(columns=['book_id', 'title', 'author', 'genre', 'score'])

    for similar_item_id, score in similar_item_id_and_scores:
        try:
            book_id = similar_item_id + 1
            title = item_meta[item_meta['book_id'] == book_id].values[0][1]
            author = item_meta[item_meta['book_id'] == book_id].values[0][2]
            genre = item_meta[item_meta['book_id'] == book_id].values[0][3]

            row = pd.Series([book_id, title, author, genre, score], index=best_items.columns)
            best_items = best_items.append(row, ignore_index=True)
        except:
            pass

    return best_items


# book_id 921: 경제학 콘서트
book_id = 921
report01 = make_best_items_report(item_embeddings, book_id, 20)
report01

  best_items = best_items.append(row, ignore_index=True)
  best_items = best_items.append(row, ignore_index=True)
  best_items = best_items.append(row, ignore_index=True)
  best_items = best_items.append(row, ignore_index=True)


Unnamed: 0,book_id,title,author,genre,score
0,921,경제학 콘서트,"팀 하포드 (지은이), 김명철 (옮긴이)",경제경영,1.0
1,990,이탈리아 기행 1,"요한 볼프강 폰 괴테 (지은이), 홍성광 (옮긴이)",에세이,0.998394
2,1224,잭 웰치의 마지막 강의 - 경영의 신 잭 웰치 60년 비즈니스 노하우의 모든 것,"잭 웰치, 수지 웰치 (지은이), 강주헌 (옮긴이)",경제경영,0.998374
3,1024,"사랑, 고마워요 고마워요 - 당신에게 묻고 싶고, 듣고 싶은 말 12가지",이미나 (지은이),에세이,0.995045
