In [1]:
import re
from konlpy.tag import Hannanum
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import warnings
import numpy as np
from collections import Counter

warnings.filterwarnings("ignore")

In [2]:
def make_preprocessing(df):
    # 1. 저자와 책 제목을 title 컬럼으로 만들어줌 (띄어쓰기로 구분)
    df = df[df['TITLE_NM'].str.contains("[\uac00-\ud7a3\u3131-\u3163\uac01-\ud7a6]+")] # 한국어 외 삭제
    df['TITLE_NM'] = df['TITLE_NM'].apply(lambda x: re.sub(r'[^0-9ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', x)) # 숫자 외 특수문자 제거

    # 2. TITLE_NM의 양쪽 공백 제거
    df['TITLE_NM'] = df['TITLE_NM'].str.strip()

    # 3. 중복 값 삭제
    df = df.drop_duplicates(subset='TITLE_NM')

    return df

In [3]:
def calculate_similarity(a_list, b_list):
    max_similarity_list = []
    
    hannanum = Hannanum()
    a_tokens = [hannanum.morphs(title) for title in a_list]
    vectorizer = CountVectorizer()
    a_vectorized = vectorizer.fit_transform([' '.join(tokens) for tokens in a_tokens])


    for b_title in b_list:
        print(b_title)

        b_tokens = hannanum.morphs(b_title)


        b_vectorized = vectorizer.transform([' '.join(b_tokens)])
    

        similarity = cosine_similarity(a_vectorized, b_vectorized).max()
        max_similarity_list.append(similarity)

    average_similarity = sum(max_similarity_list) / len(max_similarity_list)
    return average_similarity

In [4]:
def maek_vector_data(data, n = 20):
    data["KDC_NM"] = data["KDC_NM"].astype("str")
    
    data_1 = data[data["KDC_NM"].str[0] == "1"]
    data_1 = data_1.sort_values("COUNTING", ascending=False)
    data_2 = data[data["KDC_NM"].str[0] == "2"]
    data_2 = data_2.sort_values("COUNTING", ascending=False)
    data_3 = data[data["KDC_NM"].str[0] == "3"]
    data_3 = data_3.sort_values("COUNTING", ascending=False)
    data_4 = data[data["KDC_NM"].str[0] == "4"]
    data_4 = data_4.sort_values("COUNTING", ascending=False)
    data_5 = data[data["KDC_NM"].str[0] == "5"]
    data_5 = data_5.sort_values("COUNTING", ascending=False)
    data_6 = data[data["KDC_NM"].str[0] == "6"]
    data_6 = data_6.sort_values("COUNTING", ascending=False)
    data_7 = data[data["KDC_NM"].str[0] == "7"]
    data_7 = data_7.sort_values("COUNTING", ascending=False)
    data_8 = data[data["KDC_NM"].str[0] == "8"]
    data_8 = data_8.sort_values("COUNTING", ascending=False)
    data_9 = data[data["KDC_NM"].str[0] == "9"]
    data_9 = data_9.sort_values("COUNTING", ascending=False)
    
    hannanum = Hannanum()
    
    tokens1 = [hannanum.morphs(datum) for datum in data_1["TITLE_NM"]]
    tokens2 = [hannanum.morphs(datum) for datum in data_2["TITLE_NM"]]
    tokens3 = [hannanum.morphs(datum) for datum in data_3["TITLE_NM"]]
    tokens4 = [hannanum.morphs(datum) for datum in data_4["TITLE_NM"]]
    tokens5 = [hannanum.morphs(datum) for datum in data_5["TITLE_NM"]]
    tokens6 = [hannanum.morphs(datum) for datum in data_6["TITLE_NM"]]
    tokens7 = [hannanum.morphs(datum) for datum in data_7["TITLE_NM"]]
    tokens8 = [hannanum.morphs(datum) for datum in data_8["TITLE_NM"]]
    tokens9 = [hannanum.morphs(datum) for datum in data_9["TITLE_NM"]]
    
    vectorizer1 = CountVectorizer()
    vectorized_data1 = vectorizer1.fit_transform([' '.join(token) for token in tokens1])
    
    vectorizer2 = CountVectorizer()
    vectorized_data2 = vectorizer2.fit_transform([' '.join(token) for token in tokens2])
    
    vectorizer3 = CountVectorizer()
    vectorized_data3 = vectorizer3.fit_transform([' '.join(token) for token in tokens3])
    
    vectorizer4 = CountVectorizer()
    vectorized_data4 = vectorizer4.fit_transform([' '.join(token) for token in tokens4])
    
    vectorizer5 = CountVectorizer()
    vectorized_data5 = vectorizer5.fit_transform([' '.join(token) for token in tokens5])
    
    vectorizer6 = CountVectorizer()
    vectorized_data6 = vectorizer6.fit_transform([' '.join(token) for token in tokens6])
    
    vectorizer7 = CountVectorizer()
    vectorized_data7 = vectorizer7.fit_transform([' '.join(token) for token in tokens7])
    
    vectorizer8 = CountVectorizer()
    vectorized_data8 = vectorizer8.fit_transform([' '.join(token) for token in tokens8])
    
    vectorizer9 = CountVectorizer()
    vectorized_data9 = vectorizer9.fit_transform([' '.join(token) for token in tokens9])
    
    return data_1, vectorizer1, vectorized_data1, data_2, vectorizer2, vectorized_data2, data_3, vectorizer3, vectorized_data3,\
            data_4, vectorizer4, vectorized_data4, data_5, vectorizer5, vectorized_data5, data_6, vectorizer6, vectorized_data6,\
            data_7, vectorizer7, vectorized_data7,data_8, vectorizer8, vectorized_data8,data_9, vectorizer9, vectorized_data9

In [27]:
def make_score(test_data, data_1, vectorizer1,vectorized_data1,\
              data_2, vectorizer2,vectorized_data2, data_3, vectorizer3,vectorized_data3,\
              data_4, vectorizer4,vectorized_data4, data_5, vectorizer5,vectorized_data5,\
              data_6, vectorizer6,vectorized_data6, data_7, vectorizer7,vectorized_data7,\
              data_8, vectorizer8,vectorized_data8, data_9, vectorizer9, vectorized_data9, n=20):
    test_data['LOAN_DATE'] = pd.to_datetime(test_data['LOAN_DATE'])
    test_data['LOAN_DATE'] = test_data['LOAN_DATE'].dt.strftime('%m')
    test_data['LOAN_DATE'] = test_data['LOAN_DATE'].astype('int')
    test_data['TITLE_NM'] = test_data['TITLE_NM'].apply(lambda x: re.sub(r'[^0-9ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', x))
    test_data['TITLE_NM'] = test_data['TITLE_NM'].str.strip()
    
    hannanum = Hannanum()
    
    score_list = []
    for num in range(1, 12):
        print(f"{num}월")
        similar_titles_list = []
        month_data = test_data[test_data['LOAN_DATE']==num]
        new_title_list = list(month_data['TITLE_NM'])
        kdc_list = list(month_data['KDC_NM'].str[0])
        for kdc, new_title in zip(kdc_list, new_title_list):
            if kdc == "1":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer1.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data1, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_1.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "2":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer2.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data2, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_2.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "3":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer3.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data3, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_3.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "4":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer4.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data4, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_4.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "5":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer5.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data5, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_5.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "6":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer6.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data6, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_6.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "7":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer7.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data7, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_7.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "8":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer8.transform([' '.join(new_title_tokenized)]).toarray()

                similarities = cosine_similarity(vectorized_data8, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_8.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(10)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "9":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer9.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data9, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_9.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            else:
                pass
        after_book = list(test_data[test_data['LOAN_DATE']==num+1]['TITLE_NM'])
        similar_titles_list = [item for sublist in similar_titles_list for item in sublist]
        result = calculate_similarity(after_book, similar_titles_list)
        print(f"{num}월 : {result}")
        score_list.append(result)
    
    return score_list

In [6]:
data = pd.read_csv("./DataSet_row/BOOK_HIST.csv")
data = make_preprocessing(data)
data.head()

Unnamed: 0,TITLE_NM,AUTHR_NM,KDC_NM,COUNTING,LBRRY_CD
0,세대,진 트웬지 지음 ;김현정 옮김,331.233,6,5300
1,법률가,김동일 ;어윤경 ;최윤정 지음,372.68,8,5301
10,페미니즘 그녀들의이야기,김효진 지음,809.9,2,8400
11,세대 스마트폰을 손에 쥐고 자란 요즘 세대 이야기,진 트웬지 지음,331.233,1,4504
12,세대스마트폰을 손에 쥐고 자란 요즘 세대 이야기,진 트웬지 지음;김현정 옮김,331.233,6,30002


In [36]:
yangchon = pd.read_csv("./test_data/TEST_YANGCHEON.csv")
yangchon.head()

Unnamed: 0,TITLE_NM,AUTHR_NM,KDC_NM,LBRRY_NM,LOAN_DATE
0,미키7 : 애드워드 애슈턴 SF 장편소설,애드워드 애슈턴 지음 ; 배지혜 옮김,843.6-애56ㅁ,[스마트]양천중앙,22/12/31 21:45:38
1,"마흔, 부부가 함께 은퇴합니다 : 5년 만에 40대 조기 은퇴에 성공한, 금융맹 부...",김다현 지음,327.04-김22ㅁ,[스마트]양천25시(오목교역),22/12/31 21:15:56
2,(조셉 필라테스의)필라테스 바이블,"조셉 필라테스,저드 로빈스,린 반 휴트-로빈스 [공]엮음 ; 원정희 옮김",517.32-필292ㅍ,[스마트] 신정네거리,22/12/31 20:51:24
3,헤어질 결심 각본,"정서경,박찬욱 지음",812.66-정54ㅎ,[스마트]양천25시(오목교역),22/12/31 17:36:14
4,너무 잘하려고 애쓰지 마라,나태주 지음,811.6-나883너,[스마트]양천중앙,22/12/31 17:35:47


In [21]:
da1, vc1, vd1, da2, vc2, vd2, da3, vc3, vd3,da4, vc4, vd4, da5, vc5, vd5, da6, vc6, vd6, da7, vc7, vd7, da8, vc8, vd8, da9, vc9, vd9 = maek_vector_data(data, n= 20)

In [30]:
result_score = make_score(yangchon, da1, vc1, vd1, da2, vc2, vd2, da3, vc3, vd3, da4, vc4, vd4, da5, vc5, vd5, da6, vc6, vd6, da7, vc7, vd7, da8, vc8, vd8, da9, vc9, vd9)

1월


ValueError: empty vocabulary; perhaps the documents only contain stop words

---

# 양천구

In [7]:
def calculate_similarity(a_list, b_list):
    max_similarity_list = []
    
    hannanum = Hannanum()
    a_tokens = [hannanum.morphs(title) for title in a_list]
    vectorizer = CountVectorizer()
    a_vectorized = vectorizer.fit_transform([' '.join(tokens) for tokens in a_tokens])


    for b_title in b_list:

        b_tokens = hannanum.morphs(b_title)


        b_vectorized = vectorizer.transform([' '.join(b_tokens)])
    

        similarity = cosine_similarity(a_vectorized, b_vectorized).max()
        max_similarity_list.append(similarity)

    average_similarity = sum(max_similarity_list) / len(max_similarity_list)
    return average_similarity

def maek_score(data, test_data, n = 20):
    data["KDC_NM"] = data["KDC_NM"].astype("str")

    test_data['LOAN_DATE'] = pd.to_datetime(test_data['LOAN_DATE'])
    test_data['LOAN_DATE'] = test_data['LOAN_DATE'].dt.strftime('%m')
    test_data['LOAN_DATE'] = test_data['LOAN_DATE'].astype('int')
    test_data['TITLE_NM'] = test_data['TITLE_NM'].apply(lambda x: re.sub(r'[^0-9ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', x))
    test_data['TITLE_NM'] = test_data['TITLE_NM'].str.strip()
    
    data_1 = data[data["KDC_NM"].str[0] == "1"]
    data_1 = data_1.sort_values("COUNTING", ascending=False)
    data_2 = data[data["KDC_NM"].str[0] == "2"]
    data_2 = data_2.sort_values("COUNTING", ascending=False)
    data_3 = data[data["KDC_NM"].str[0] == "3"]
    data_3 = data_3.sort_values("COUNTING", ascending=False)
    data_4 = data[data["KDC_NM"].str[0] == "4"]
    data_4 = data_4.sort_values("COUNTING", ascending=False)
    data_5 = data[data["KDC_NM"].str[0] == "5"]
    data_5 = data_5.sort_values("COUNTING", ascending=False)
    data_6 = data[data["KDC_NM"].str[0] == "6"]
    data_6 = data_6.sort_values("COUNTING", ascending=False)
    data_7 = data[data["KDC_NM"].str[0] == "7"]
    data_7 = data_7.sort_values("COUNTING", ascending=False)
    data_8 = data[data["KDC_NM"].str[0] == "8"]
    data_8 = data_8.sort_values("COUNTING", ascending=False)
    data_9 = data[data["KDC_NM"].str[0] == "9"]
    data_9 = data_9.sort_values("COUNTING", ascending=False)
    
    hannanum = Hannanum()
    
    tokens1 = [hannanum.morphs(datum) for datum in data_1["TITLE_NM"]]
    tokens2 = [hannanum.morphs(datum) for datum in data_2["TITLE_NM"]]
    tokens3 = [hannanum.morphs(datum) for datum in data_3["TITLE_NM"]]
    tokens4 = [hannanum.morphs(datum) for datum in data_4["TITLE_NM"]]
    tokens5 = [hannanum.morphs(datum) for datum in data_5["TITLE_NM"]]
    tokens6 = [hannanum.morphs(datum) for datum in data_6["TITLE_NM"]]
    tokens7 = [hannanum.morphs(datum) for datum in data_7["TITLE_NM"]]
    tokens8 = [hannanum.morphs(datum) for datum in data_8["TITLE_NM"]]
    tokens9 = [hannanum.morphs(datum) for datum in data_9["TITLE_NM"]]
    
    vectorizer1 = CountVectorizer()
    vectorized_data1 = vectorizer1.fit_transform([' '.join(token) for token in tokens1])
    
    vectorizer2 = CountVectorizer()
    vectorized_data2 = vectorizer2.fit_transform([' '.join(token) for token in tokens2])
    
    vectorizer3 = CountVectorizer()
    vectorized_data3 = vectorizer3.fit_transform([' '.join(token) for token in tokens3])
    
    vectorizer4 = CountVectorizer()
    vectorized_data4 = vectorizer4.fit_transform([' '.join(token) for token in tokens4])
    
    vectorizer5 = CountVectorizer()
    vectorized_data5 = vectorizer5.fit_transform([' '.join(token) for token in tokens5])
    
    vectorizer6 = CountVectorizer()
    vectorized_data6 = vectorizer6.fit_transform([' '.join(token) for token in tokens6])
    
    vectorizer7 = CountVectorizer()
    vectorized_data7 = vectorizer7.fit_transform([' '.join(token) for token in tokens7])
    
    vectorizer8 = CountVectorizer()
    vectorized_data8 = vectorizer8.fit_transform([' '.join(token) for token in tokens8])
    
    vectorizer9 = CountVectorizer()
    vectorized_data9 = vectorizer9.fit_transform([' '.join(token) for token in tokens9])
    
    score_list = []
    for num in range(1, 12):
        similar_titles_list = []
        month_data = test_data[test_data['LOAN_DATE']==num]
        new_title_list = list(month_data['TITLE_NM'])
        kdc_list = list(month_data['KDC_NM'].str[0])
        for kdc, new_title in zip(kdc_list, new_title_list):
            if kdc == "1":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer1.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data1, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_1.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "2":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer2.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data2, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_2.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "3":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer3.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data3, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_3.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "4":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer4.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data4, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_4.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "5":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer5.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data5, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_5.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "6":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer6.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data6, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_6.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "7":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer7.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data7, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_7.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "8":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer8.transform([' '.join(new_title_tokenized)]).toarray()

                similarities = cosine_similarity(vectorized_data8, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_8.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(10)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "9":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer9.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data9, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_9.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            else:
                pass
        after_book = list(test_data[test_data['LOAN_DATE']==num+1]['TITLE_NM'])
        similar_titles_list = [item for sublist in similar_titles_list for item in sublist]
        result = calculate_similarity(after_book, similar_titles_list)
        print(f"{num+1}월 : {result}")
        score_list.append(result)
    
    return score_list

In [39]:
data = pd.read_csv("./DataSet_row/BOOK_HIST.csv")
data = make_preprocessing(data)
data.head()

Unnamed: 0,TITLE_NM,AUTHR_NM,KDC_NM,COUNTING,LBRRY_CD
0,세대,진 트웬지 지음 ;김현정 옮김,331.233,6,5300
1,법률가,김동일 ;어윤경 ;최윤정 지음,372.68,8,5301
10,페미니즘 그녀들의이야기,김효진 지음,809.9,2,8400
11,세대 스마트폰을 손에 쥐고 자란 요즘 세대 이야기,진 트웬지 지음,331.233,1,4504
12,세대스마트폰을 손에 쥐고 자란 요즘 세대 이야기,진 트웬지 지음;김현정 옮김,331.233,6,30002


In [41]:
yangchon = pd.read_csv("./test_data/TEST_YANGCHEON.csv")
yangchon.head()

Unnamed: 0,TITLE_NM,AUTHR_NM,KDC_NM,LBRRY_NM,LOAN_DATE
0,미키7 : 애드워드 애슈턴 SF 장편소설,애드워드 애슈턴 지음 ; 배지혜 옮김,843.6-애56ㅁ,[스마트]양천중앙,22/12/31 21:45:38
1,"마흔, 부부가 함께 은퇴합니다 : 5년 만에 40대 조기 은퇴에 성공한, 금융맹 부...",김다현 지음,327.04-김22ㅁ,[스마트]양천25시(오목교역),22/12/31 21:15:56
2,(조셉 필라테스의)필라테스 바이블,"조셉 필라테스,저드 로빈스,린 반 휴트-로빈스 [공]엮음 ; 원정희 옮김",517.32-필292ㅍ,[스마트] 신정네거리,22/12/31 20:51:24
3,헤어질 결심 각본,"정서경,박찬욱 지음",812.66-정54ㅎ,[스마트]양천25시(오목교역),22/12/31 17:36:14
4,너무 잘하려고 애쓰지 마라,나태주 지음,811.6-나883너,[스마트]양천중앙,22/12/31 17:35:47


In [42]:
score_result = maek_score(data, yangchon, n=20)

1월
0.6263529033993118
2월
0.6526568817340849
3월
0.6415646037679946
4월
0.67673078819684
5월
0.6544180471523545
6월
0.6882998256452502
7월
0.7142331369081706
8월
0.6781380083285854
9월
0.6932941926034685
10월
0.6695793241208458
11월
0.6841502895938003


In [45]:
np.mean(score_result)

0.6708561819500642

# 동대문구

In [5]:
data = pd.read_csv("./DataSet_row/BOOK_HIST.csv")
data = make_preprocessing(data)
data.head()

Unnamed: 0,TITLE_NM,AUTHR_NM,KDC_NM,COUNTING,LBRRY_CD
0,세대,진 트웬지 지음 ;김현정 옮김,331.233,6,5300
1,법률가,김동일 ;어윤경 ;최윤정 지음,372.68,8,5301
10,페미니즘 그녀들의이야기,김효진 지음,809.9,2,8400
11,세대 스마트폰을 손에 쥐고 자란 요즘 세대 이야기,진 트웬지 지음,331.233,1,4504
12,세대스마트폰을 손에 쥐고 자란 요즘 세대 이야기,진 트웬지 지음;김현정 옮김,331.233,6,30002


In [6]:
dongdea = pd.read_csv("./test_data/TEST_DONGDAEMOON.csv")
dongdea.head()

Unnamed: 0,TITLE_NM,KDC_NM,LOAN_DATE
0,요리코를 위해 : 노리즈키 린타로 장편소설,833.6-ㄴ65요=2,2022-02-22 18:47
1,피프티 피플 : 정세랑 장편소설,813.7-ㅈ416ㅍ=3,2022-02-22 18:46
2,그 환자,843.6-ㄷ96ㄱ=3,2022-02-19 10:55
3,방구석 미술관 : 가볍고 편하게 시작하는 유쾌한 교양 미술,650.4-ㅈ664ㅂ=4,2022-01-25 19:09
4,나미야 잡화점의 기적 : 히가시노 게이고 장편소설,833.6-ㅎ961나=7,2022-04-04 12:43


In [8]:
score_result = maek_score(data, dongdea, n=20)

2월 : 0.6192514648372907
3월 : 0.6715690917296608
4월 : 0.6622452679813952
5월 : 0.6020393793561438
6월 : 0.6073227314248151
7월 : 0.5922360546364491
8월 : 0.5967222832398418
9월 : 0.5607355000428706
10월 : 0.5963488559538573
11월 : 0.5781447491207625
12월 : 0.5923366272766477


In [9]:
np.mean(score_result)

0.6071774550545214

# 강동구

In [11]:
def calculate_similarity(a_list, b_list):
    max_similarity_list = []
    
    hannanum = Hannanum()
    a_tokens = [hannanum.morphs(title) for title in a_list]
    vectorizer = CountVectorizer()
    a_vectorized = vectorizer.fit_transform([' '.join(tokens) for tokens in a_tokens])


    for b_title in b_list:

        b_tokens = hannanum.morphs(b_title)


        b_vectorized = vectorizer.transform([' '.join(b_tokens)])
    

        similarity = cosine_similarity(a_vectorized, b_vectorized).max()
        max_similarity_list.append(similarity)

    average_similarity = sum(max_similarity_list) / len(max_similarity_list)
    return average_similarity

def maek_score(data, test_data, n = 20):
    data["KDC_NM"] = data["KDC_NM"].astype("str")

    test_data['LOAN_DATE'] = pd.to_datetime(test_data['LOAN_DATE'])
    test_data['LOAN_DATE'] = test_data['LOAN_DATE'].dt.strftime('%m')
    test_data['LOAN_DATE'] = test_data['LOAN_DATE'].astype('int')
    test_data['TITLE_NM'] = test_data['TITLE_NM'].apply(lambda x: re.sub(r'[^0-9ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', x))
    test_data['TITLE_NM'] = test_data['TITLE_NM'].str.strip()
    
    data_1 = data[data["KDC_NM"].str[0] == "1"]
    data_1 = data_1.sort_values("COUNTING", ascending=False)
    data_2 = data[data["KDC_NM"].str[0] == "2"]
    data_2 = data_2.sort_values("COUNTING", ascending=False)
    data_3 = data[data["KDC_NM"].str[0] == "3"]
    data_3 = data_3.sort_values("COUNTING", ascending=False)
    data_4 = data[data["KDC_NM"].str[0] == "4"]
    data_4 = data_4.sort_values("COUNTING", ascending=False)
    data_5 = data[data["KDC_NM"].str[0] == "5"]
    data_5 = data_5.sort_values("COUNTING", ascending=False)
    data_6 = data[data["KDC_NM"].str[0] == "6"]
    data_6 = data_6.sort_values("COUNTING", ascending=False)
    data_7 = data[data["KDC_NM"].str[0] == "7"]
    data_7 = data_7.sort_values("COUNTING", ascending=False)
    data_8 = data[data["KDC_NM"].str[0] == "8"]
    data_8 = data_8.sort_values("COUNTING", ascending=False)
    data_9 = data[data["KDC_NM"].str[0] == "9"]
    data_9 = data_9.sort_values("COUNTING", ascending=False)
    
    hannanum = Hannanum()
    
    tokens1 = [hannanum.morphs(datum) for datum in data_1["TITLE_NM"]]
    tokens2 = [hannanum.morphs(datum) for datum in data_2["TITLE_NM"]]
    tokens3 = [hannanum.morphs(datum) for datum in data_3["TITLE_NM"]]
    tokens4 = [hannanum.morphs(datum) for datum in data_4["TITLE_NM"]]
    tokens5 = [hannanum.morphs(datum) for datum in data_5["TITLE_NM"]]
    tokens6 = [hannanum.morphs(datum) for datum in data_6["TITLE_NM"]]
    tokens7 = [hannanum.morphs(datum) for datum in data_7["TITLE_NM"]]
    tokens8 = [hannanum.morphs(datum) for datum in data_8["TITLE_NM"]]
    tokens9 = [hannanum.morphs(datum) for datum in data_9["TITLE_NM"]]
    
    vectorizer1 = CountVectorizer()
    vectorized_data1 = vectorizer1.fit_transform([' '.join(token) for token in tokens1])
    
    vectorizer2 = CountVectorizer()
    vectorized_data2 = vectorizer2.fit_transform([' '.join(token) for token in tokens2])
    
    vectorizer3 = CountVectorizer()
    vectorized_data3 = vectorizer3.fit_transform([' '.join(token) for token in tokens3])
    
    vectorizer4 = CountVectorizer()
    vectorized_data4 = vectorizer4.fit_transform([' '.join(token) for token in tokens4])
    
    vectorizer5 = CountVectorizer()
    vectorized_data5 = vectorizer5.fit_transform([' '.join(token) for token in tokens5])
    
    vectorizer6 = CountVectorizer()
    vectorized_data6 = vectorizer6.fit_transform([' '.join(token) for token in tokens6])
    
    vectorizer7 = CountVectorizer()
    vectorized_data7 = vectorizer7.fit_transform([' '.join(token) for token in tokens7])
    
    vectorizer8 = CountVectorizer()
    vectorized_data8 = vectorizer8.fit_transform([' '.join(token) for token in tokens8])
    
    vectorizer9 = CountVectorizer()
    vectorized_data9 = vectorizer9.fit_transform([' '.join(token) for token in tokens9])
    
    score_list = []
    for num in range(1, 12):
        similar_titles_list = []
        month_data = test_data[test_data['LOAN_DATE']==num]
        new_title_list = list(month_data['TITLE_NM'])
        kdc_list = list(month_data['KDC_NM'].str[0])
        for kdc, new_title in zip(kdc_list, new_title_list):
            if kdc == "1":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer1.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data1, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_1.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "2":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer2.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data2, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_2.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "3":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer3.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data3, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_3.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "4":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer4.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data4, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_4.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "5":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer5.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data5, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_5.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "6":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer6.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data6, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_6.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "7":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer7.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data7, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_7.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "8":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer8.transform([' '.join(new_title_tokenized)]).toarray()

                similarities = cosine_similarity(vectorized_data8, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_8.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(10)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "9":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer9.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data9, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_9.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            else:
                pass
        after_book = list(test_data[test_data['LOAN_DATE']==num+1]['TITLE_NM'])
        similar_titles_list = [item for sublist in similar_titles_list for item in sublist]
        result = calculate_similarity(after_book, similar_titles_list)
        print(f"{num+1}월 : {result}")
        score_list.append(result)
    
    return score_list

data = pd.read_csv("./DataSet_row/BOOK_HIST.csv")
data = make_preprocessing(data)


gangdong = pd.read_csv("./test_data/TEST_GANGDONG.csv")

score_result = maek_score(data, gangdong, n=20)

np.mean(score_result)

2월 : 0.7026851532290873
3월 : 0.6924217399590676
4월 : 0.6354555350483349
5월 : 0.6806042201190383
6월 : 0.6642059588244923
7월 : 0.6711466176333449
8월 : 0.6219613041053543
9월 : 0.6466609295277334
10월 : 0.6845148963499272
11월 : 0.661296544535431
12월 : 0.6180141549177234


0.6617242776590486

# 강남구

In [5]:
def calculate_similarity(a_list, b_list):
    max_similarity_list = []
    
    hannanum = Hannanum()
    a_tokens = [hannanum.morphs(title) for title in a_list]
    vectorizer = CountVectorizer()
    a_vectorized = vectorizer.fit_transform([' '.join(tokens) for tokens in a_tokens])


    for b_title in b_list:

        b_tokens = hannanum.morphs(b_title)


        b_vectorized = vectorizer.transform([' '.join(b_tokens)])
    

        similarity = cosine_similarity(a_vectorized, b_vectorized).max()
        max_similarity_list.append(similarity)

    average_similarity = sum(max_similarity_list) / len(max_similarity_list)
    return average_similarity

def maek_score(data, test_data, n = 20):
    data["KDC_NM"] = data["KDC_NM"].astype("str")

    test_data['LOAN_DATE'] = pd.to_datetime(test_data['LOAN_DATE'])
    test_data['LOAN_DATE'] = test_data['LOAN_DATE'].dt.strftime('%m')
    test_data['LOAN_DATE'] = test_data['LOAN_DATE'].astype('int')
    test_data['TITLE_NM'] = test_data['TITLE_NM'].apply(lambda x: re.sub(r'[^0-9ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', x))
    test_data['TITLE_NM'] = test_data['TITLE_NM'].str.strip()
    
    data_1 = data[data["KDC_NM"].str[0] == "1"]
    data_1 = data_1.sort_values("COUNTING", ascending=False)
    data_2 = data[data["KDC_NM"].str[0] == "2"]
    data_2 = data_2.sort_values("COUNTING", ascending=False)
    data_3 = data[data["KDC_NM"].str[0] == "3"]
    data_3 = data_3.sort_values("COUNTING", ascending=False)
    data_4 = data[data["KDC_NM"].str[0] == "4"]
    data_4 = data_4.sort_values("COUNTING", ascending=False)
    data_5 = data[data["KDC_NM"].str[0] == "5"]
    data_5 = data_5.sort_values("COUNTING", ascending=False)
    data_6 = data[data["KDC_NM"].str[0] == "6"]
    data_6 = data_6.sort_values("COUNTING", ascending=False)
    data_7 = data[data["KDC_NM"].str[0] == "7"]
    data_7 = data_7.sort_values("COUNTING", ascending=False)
    data_8 = data[data["KDC_NM"].str[0] == "8"]
    data_8 = data_8.sort_values("COUNTING", ascending=False)
    data_9 = data[data["KDC_NM"].str[0] == "9"]
    data_9 = data_9.sort_values("COUNTING", ascending=False)
    
    hannanum = Hannanum()
    
    tokens1 = [hannanum.morphs(datum) for datum in data_1["TITLE_NM"]]
    tokens2 = [hannanum.morphs(datum) for datum in data_2["TITLE_NM"]]
    tokens3 = [hannanum.morphs(datum) for datum in data_3["TITLE_NM"]]
    tokens4 = [hannanum.morphs(datum) for datum in data_4["TITLE_NM"]]
    tokens5 = [hannanum.morphs(datum) for datum in data_5["TITLE_NM"]]
    tokens6 = [hannanum.morphs(datum) for datum in data_6["TITLE_NM"]]
    tokens7 = [hannanum.morphs(datum) for datum in data_7["TITLE_NM"]]
    tokens8 = [hannanum.morphs(datum) for datum in data_8["TITLE_NM"]]
    tokens9 = [hannanum.morphs(datum) for datum in data_9["TITLE_NM"]]
    
    vectorizer1 = CountVectorizer()
    vectorized_data1 = vectorizer1.fit_transform([' '.join(token) for token in tokens1])
    
    vectorizer2 = CountVectorizer()
    vectorized_data2 = vectorizer2.fit_transform([' '.join(token) for token in tokens2])
    
    vectorizer3 = CountVectorizer()
    vectorized_data3 = vectorizer3.fit_transform([' '.join(token) for token in tokens3])
    
    vectorizer4 = CountVectorizer()
    vectorized_data4 = vectorizer4.fit_transform([' '.join(token) for token in tokens4])
    
    vectorizer5 = CountVectorizer()
    vectorized_data5 = vectorizer5.fit_transform([' '.join(token) for token in tokens5])
    
    vectorizer6 = CountVectorizer()
    vectorized_data6 = vectorizer6.fit_transform([' '.join(token) for token in tokens6])
    
    vectorizer7 = CountVectorizer()
    vectorized_data7 = vectorizer7.fit_transform([' '.join(token) for token in tokens7])
    
    vectorizer8 = CountVectorizer()
    vectorized_data8 = vectorizer8.fit_transform([' '.join(token) for token in tokens8])
    
    vectorizer9 = CountVectorizer()
    vectorized_data9 = vectorizer9.fit_transform([' '.join(token) for token in tokens9])
    
    score_list = []
    for num in range(1, 12):
        similar_titles_list = []
        month_data = test_data[test_data['LOAN_DATE']==num]
        new_title_list = list(month_data['TITLE_NM'])
        kdc_list = list(month_data['KDC_NM'].str[0])
        for kdc, new_title in zip(kdc_list, new_title_list):
            if kdc == "1":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer1.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data1, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_1.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "2":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer2.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data2, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_2.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "3":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer3.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data3, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_3.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "4":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer4.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data4, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_4.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "5":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer5.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data5, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_5.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "6":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer6.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data6, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_6.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "7":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer7.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data7, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_7.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "8":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer8.transform([' '.join(new_title_tokenized)]).toarray()

                similarities = cosine_similarity(vectorized_data8, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_8.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(10)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "9":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer9.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data9, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_9.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            else:
                pass
        after_book = list(test_data[test_data['LOAN_DATE']==num+1]['TITLE_NM'])
        similar_titles_list = [item for sublist in similar_titles_list for item in sublist]
        result = calculate_similarity(after_book, similar_titles_list)
        print(f"{num+1}월 : {result}")
        score_list.append(result)
    
    return score_list

data = pd.read_csv("./DataSet_row/BOOK_HIST.csv")
data = make_preprocessing(data)


gangdong = pd.read_csv("./test_data/TEST_GANGNAM.csv")

score_result = maek_score(data, gangdong, n=20)

np.mean(score_result)

2월 : 0.6770875969114531
3월 : 0.7502071727740767
4월 : 0.36305839746643603
5월 : 0.6303979221013379
6월 : 0.5590355651119163
7월 : 0.5661797201286826
8월 : 0.528175645194381
9월 : 0.6325646303965027
10월 : 0.5979484438064724
11월 : 0.572486736758772
12월 : 0.5915265037174929


0.5880607576697748

# 송파구

In [7]:
def calculate_similarity(a_list, b_list):
    max_similarity_list = []
    
    hannanum = Hannanum()
    a_tokens = [hannanum.morphs(title) for title in a_list]
    vectorizer = CountVectorizer()
    a_vectorized = vectorizer.fit_transform([' '.join(tokens) for tokens in a_tokens])


    for b_title in b_list:

        b_tokens = hannanum.morphs(b_title)


        b_vectorized = vectorizer.transform([' '.join(b_tokens)])
    

        similarity = cosine_similarity(a_vectorized, b_vectorized).max()
        max_similarity_list.append(similarity)

    average_similarity = sum(max_similarity_list) / len(max_similarity_list)
    return average_similarity

def maek_score(data, test_data, n = 20):
    data["KDC_NM"] = data["KDC_NM"].astype("str")

    test_data['LOAN_DATE'] = pd.to_datetime(test_data['LOAN_DATE'])
    test_data['LOAN_DATE'] = test_data['LOAN_DATE'].dt.strftime('%m')
    test_data['LOAN_DATE'] = test_data['LOAN_DATE'].astype('int')
    test_data['TITLE_NM'] = test_data['TITLE_NM'].apply(lambda x: re.sub(r'[^0-9ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', x))
    test_data['TITLE_NM'] = test_data['TITLE_NM'].str.strip()
    
    data_1 = data[data["KDC_NM"].str[0] == "1"]
    data_1 = data_1.sort_values("COUNTING", ascending=False)
    data_2 = data[data["KDC_NM"].str[0] == "2"]
    data_2 = data_2.sort_values("COUNTING", ascending=False)
    data_3 = data[data["KDC_NM"].str[0] == "3"]
    data_3 = data_3.sort_values("COUNTING", ascending=False)
    data_4 = data[data["KDC_NM"].str[0] == "4"]
    data_4 = data_4.sort_values("COUNTING", ascending=False)
    data_5 = data[data["KDC_NM"].str[0] == "5"]
    data_5 = data_5.sort_values("COUNTING", ascending=False)
    data_6 = data[data["KDC_NM"].str[0] == "6"]
    data_6 = data_6.sort_values("COUNTING", ascending=False)
    data_7 = data[data["KDC_NM"].str[0] == "7"]
    data_7 = data_7.sort_values("COUNTING", ascending=False)
    data_8 = data[data["KDC_NM"].str[0] == "8"]
    data_8 = data_8.sort_values("COUNTING", ascending=False)
    data_9 = data[data["KDC_NM"].str[0] == "9"]
    data_9 = data_9.sort_values("COUNTING", ascending=False)
    
    hannanum = Hannanum()
    
    tokens1 = [hannanum.morphs(datum) for datum in data_1["TITLE_NM"]]
    tokens2 = [hannanum.morphs(datum) for datum in data_2["TITLE_NM"]]
    tokens3 = [hannanum.morphs(datum) for datum in data_3["TITLE_NM"]]
    tokens4 = [hannanum.morphs(datum) for datum in data_4["TITLE_NM"]]
    tokens5 = [hannanum.morphs(datum) for datum in data_5["TITLE_NM"]]
    tokens6 = [hannanum.morphs(datum) for datum in data_6["TITLE_NM"]]
    tokens7 = [hannanum.morphs(datum) for datum in data_7["TITLE_NM"]]
    tokens8 = [hannanum.morphs(datum) for datum in data_8["TITLE_NM"]]
    tokens9 = [hannanum.morphs(datum) for datum in data_9["TITLE_NM"]]
    
    vectorizer1 = CountVectorizer()
    vectorized_data1 = vectorizer1.fit_transform([' '.join(token) for token in tokens1])
    
    vectorizer2 = CountVectorizer()
    vectorized_data2 = vectorizer2.fit_transform([' '.join(token) for token in tokens2])
    
    vectorizer3 = CountVectorizer()
    vectorized_data3 = vectorizer3.fit_transform([' '.join(token) for token in tokens3])
    
    vectorizer4 = CountVectorizer()
    vectorized_data4 = vectorizer4.fit_transform([' '.join(token) for token in tokens4])
    
    vectorizer5 = CountVectorizer()
    vectorized_data5 = vectorizer5.fit_transform([' '.join(token) for token in tokens5])
    
    vectorizer6 = CountVectorizer()
    vectorized_data6 = vectorizer6.fit_transform([' '.join(token) for token in tokens6])
    
    vectorizer7 = CountVectorizer()
    vectorized_data7 = vectorizer7.fit_transform([' '.join(token) for token in tokens7])
    
    vectorizer8 = CountVectorizer()
    vectorized_data8 = vectorizer8.fit_transform([' '.join(token) for token in tokens8])
    
    vectorizer9 = CountVectorizer()
    vectorized_data9 = vectorizer9.fit_transform([' '.join(token) for token in tokens9])
    
    score_list = []
    for num in range(1, 12):
        similar_titles_list = []
        month_data = test_data[test_data['LOAN_DATE']==num]
        new_title_list = list(month_data['TITLE_NM'])
        kdc_list = list(month_data['KDC_NM'].str[0])
        for kdc, new_title in zip(kdc_list, new_title_list):
            if kdc == "1":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer1.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data1, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_1.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "2":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer2.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data2, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_2.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "3":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer3.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data3, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_3.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "4":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer4.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data4, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_4.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "5":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer5.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data5, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_5.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "6":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer6.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data6, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_6.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "7":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer7.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data7, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_7.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "8":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer8.transform([' '.join(new_title_tokenized)]).toarray()

                similarities = cosine_similarity(vectorized_data8, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_8.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(10)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            elif kdc == "9":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer9.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data9, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_9.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list.append(similar_titles)
            else:
                pass
        after_book = list(test_data[test_data['LOAN_DATE']==num+1]['TITLE_NM'])
        similar_titles_list = [item for sublist in similar_titles_list for item in sublist]
        result = calculate_similarity(after_book, similar_titles_list)
        print(f"{num+1}월 : {result}")
        score_list.append(result)
    
    return score_list

data = pd.read_csv("./DataSet_row/BOOK_HIST.csv")
data = make_preprocessing(data)


songpa = pd.read_csv("./test_data/TEST_SONGPA.csv")

score_result = maek_score(data, songpa, n=20)

np.mean(score_result)

2월 : 0.6380076936457438
3월 : 0.6230188564432669
4월 : 0.6375407316731632
5월 : 0.6425150750625248
6월 : 0.6929862076995085
7월 : 0.6453832146272175
8월 : 0.6540117477885614
9월 : 0.6646618860070624
10월 : 0.684262618581251
11월 : 0.6914488974970077
12월 : 0.70431880315255


0.661650521107078

In [3]:
def get_top_n_books(book_list, n):
    book_counts = Counter(book_list)
    top_books = book_counts.most_common(n)
    return top_books


def calculate_similarity(a_list, b_list):
    max_similarity_list = []
    
    hannanum = Hannanum()
    a_tokens = [hannanum.morphs(title) for title in a_list]
    vectorizer = CountVectorizer()
    a_vectorized = vectorizer.fit_transform([' '.join(tokens) for tokens in a_tokens])


    for b_title in b_list:

        b_tokens = hannanum.morphs(b_title)


        b_vectorized = vectorizer.transform([' '.join(b_tokens)])
    

        similarity = cosine_similarity(a_vectorized, b_vectorized).max()
        max_similarity_list.append(similarity)

    average_similarity = sum(max_similarity_list) / len(max_similarity_list)
    return average_similarity

def maek_score(data, test_data, n = 20):
    data["KDC_NM"] = data["KDC_NM"].astype("str")

    test_data['LOAN_DATE'] = pd.to_datetime(test_data['LOAN_DATE'])
    test_data['LOAN_DATE'] = test_data['LOAN_DATE'].dt.strftime('%m')
    test_data['LOAN_DATE'] = test_data['LOAN_DATE'].astype('int')
    test_data['TITLE_NM'] = test_data['TITLE_NM'].apply(lambda x: re.sub(r'[^0-9ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', x))
    test_data['TITLE_NM'] = test_data['TITLE_NM'].str.strip()
    
    data_1 = data[data["KDC_NM"].str[0] == "1"]
    data_1 = data_1.sort_values("COUNTING", ascending=False)
    data_2 = data[data["KDC_NM"].str[0] == "2"]
    data_2 = data_2.sort_values("COUNTING", ascending=False)
    data_3 = data[data["KDC_NM"].str[0] == "3"]
    data_3 = data_3.sort_values("COUNTING", ascending=False)
    data_4 = data[data["KDC_NM"].str[0] == "4"]
    data_4 = data_4.sort_values("COUNTING", ascending=False)
    data_5 = data[data["KDC_NM"].str[0] == "5"]
    data_5 = data_5.sort_values("COUNTING", ascending=False)
    data_6 = data[data["KDC_NM"].str[0] == "6"]
    data_6 = data_6.sort_values("COUNTING", ascending=False)
    data_7 = data[data["KDC_NM"].str[0] == "7"]
    data_7 = data_7.sort_values("COUNTING", ascending=False)
    data_8 = data[data["KDC_NM"].str[0] == "8"]
    data_8 = data_8.sort_values("COUNTING", ascending=False)
    data_9 = data[data["KDC_NM"].str[0] == "9"]
    data_9 = data_9.sort_values("COUNTING", ascending=False)
    
    hannanum = Hannanum()
    
    tokens1 = [hannanum.morphs(datum) for datum in data_1["TITLE_NM"]]
    tokens2 = [hannanum.morphs(datum) for datum in data_2["TITLE_NM"]]
    tokens3 = [hannanum.morphs(datum) for datum in data_3["TITLE_NM"]]
    tokens4 = [hannanum.morphs(datum) for datum in data_4["TITLE_NM"]]
    tokens5 = [hannanum.morphs(datum) for datum in data_5["TITLE_NM"]]
    tokens6 = [hannanum.morphs(datum) for datum in data_6["TITLE_NM"]]
    tokens7 = [hannanum.morphs(datum) for datum in data_7["TITLE_NM"]]
    tokens8 = [hannanum.morphs(datum) for datum in data_8["TITLE_NM"]]
    tokens9 = [hannanum.morphs(datum) for datum in data_9["TITLE_NM"]]
    
    vectorizer1 = CountVectorizer()
    vectorized_data1 = vectorizer1.fit_transform([' '.join(token) for token in tokens1])
    
    vectorizer2 = CountVectorizer()
    vectorized_data2 = vectorizer2.fit_transform([' '.join(token) for token in tokens2])
    
    vectorizer3 = CountVectorizer()
    vectorized_data3 = vectorizer3.fit_transform([' '.join(token) for token in tokens3])
    
    vectorizer4 = CountVectorizer()
    vectorized_data4 = vectorizer4.fit_transform([' '.join(token) for token in tokens4])
    
    vectorizer5 = CountVectorizer()
    vectorized_data5 = vectorizer5.fit_transform([' '.join(token) for token in tokens5])
    
    vectorizer6 = CountVectorizer()
    vectorized_data6 = vectorizer6.fit_transform([' '.join(token) for token in tokens6])
    
    vectorizer7 = CountVectorizer()
    vectorized_data7 = vectorizer7.fit_transform([' '.join(token) for token in tokens7])
    
    vectorizer8 = CountVectorizer()
    vectorized_data8 = vectorizer8.fit_transform([' '.join(token) for token in tokens8])
    
    vectorizer9 = CountVectorizer()
    vectorized_data9 = vectorizer9.fit_transform([' '.join(token) for token in tokens9])
    
    score_list = []
    for num in range(1, 12):
        similar_titles_list_1 = []
        similar_titles_list_2 = []
        similar_titles_list_3 = []
        similar_titles_list_4 = []
        similar_titles_list_5 = []
        similar_titles_list_6 = []
        similar_titles_list_7 = []
        similar_titles_list_8 = []
        similar_titles_list_9 = []
        
        month_data = test_data[test_data['LOAN_DATE']==num]
        new_title_list = list(month_data['TITLE_NM'])
        kdc_list = list(month_data['KDC_NM'].str[0])
        for kdc, new_title in zip(kdc_list, new_title_list):
            if kdc == "1":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer1.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data1, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_1.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list_1.append(similar_titles)
            elif kdc == "2":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer2.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data2, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_2.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list_2.append(similar_titles)
            elif kdc == "3":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer3.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data3, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
        
                similar_books = data_3.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list_3.append(similar_titles)
            elif kdc == "4":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer4.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data4, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_4.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list_4.append(similar_titles)
            elif kdc == "5":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer5.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data5, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_5.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list_5.append(similar_titles)
            elif kdc == "6":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer6.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data6, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_6.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list_6.append(similar_titles)
            elif kdc == "7":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer7.transform([' '.join(new_title_tokenized)]).toarray()
    
                similarities = cosine_similarity(vectorized_data7, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_7.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list_7.append(similar_titles)
            elif kdc == "8":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer8.transform([' '.join(new_title_tokenized)]).toarray()

                similarities = cosine_similarity(vectorized_data8, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_8.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(10)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list_8.append(similar_titles)
            elif kdc == "9":
                new_title = new_title
                new_title_tokenized = hannanum.morphs(new_title)
                new_title_vectorized = vectorizer9.transform([' '.join(new_title_tokenized)]).toarray()
        
                similarities = cosine_similarity(vectorized_data9, new_title_vectorized)
                similar_indices = similarities.flatten().argsort()[-n:][::-1]
    
                similar_books = data_9.iloc[similar_indices]
                similar_books = similar_books.sort_values("COUNTING", ascending=False).head(5)
                similar_titles = similar_books["TITLE_NM"].tolist()
                similar_titles_list_9.append(similar_titles)
            else:
                pass
        after_book = list(test_data[test_data['LOAN_DATE']==num+1]['TITLE_NM'])
        
        similar_titles_list_1 = [item for sublist in similar_titles_list_1 for item in sublist]
        similar_titles_list_1 = get_top_n_books(similar_titles_list_1, 5)
        
        similar_titles_list_2 = [item for sublist in similar_titles_list_2 for item in sublist]
        similar_titles_list_2 = get_top_n_books(similar_titles_list_2, 5)
        
        similar_titles_list_3 = [item for sublist in similar_titles_list_3 for item in sublist]
        similar_titles_list_3 = get_top_n_books(similar_titles_list_3, 5)
        
        similar_titles_list_4 = [item for sublist in similar_titles_list_4 for item in sublist]
        similar_titles_list_4 = get_top_n_books(similar_titles_list_4, 5)
        
        similar_titles_list_5 = [item for sublist in similar_titles_list_5 for item in sublist]
        similar_titles_list_5 = get_top_n_books(similar_titles_list_5, 5)
        
        similar_titles_list_6 = [item for sublist in similar_titles_list_6 for item in sublist]
        similar_titles_list_6 = get_top_n_books(similar_titles_list_6, 5)
        
        similar_titles_list_7 = [item for sublist in similar_titles_list_7 for item in sublist]
        similar_titles_list_7 = get_top_n_books(similar_titles_list_7, 5)
        
        similar_titles_list_8 = [item for sublist in similar_titles_list_8 for item in sublist]
        similar_titles_list_8 = get_top_n_books(similar_titles_list_8, 10)
        
        similar_titles_list_9 = [item for sublist in similar_titles_list_9 for item in sublist]
        similar_titles_list_9 = get_top_n_books(similar_titles_list_9, 5)
        
        similar_titles_list = similar_titles_list_1 + similar_titles_list_2 + similar_titles_list_3 + similar_titles_list_4 + similar_titles_list_5 + similar_titles_list_6 + similar_titles_list_7 + similar_titles_list_8 + similar_titles_list_9
        result = calculate_similarity(after_book, similar_titles_list)
        print(f"{num+1}월 : {result}")
        score_list.append(result)
    
    return score_list

data = pd.read_csv("./DataSet_row/BOOK_HIST.csv")
data = make_preprocessing(data)


songpa = pd.read_csv("./test_data/TEST_SONGPA.csv")

score_result = maek_score(data, songpa, n=20)

np.mean(score_result)

AssertionError: phrase input should be string, not <class 'tuple'>

In [10]:
df = pd.read_csv("./test_data/TEST_SONGPA.csv")
df['LOAN_DATE'] = pd.to_datetime(df['LOAN_DATE'])
df['LOAN_DATE'] = df['LOAN_DATE'].dt.strftime('%Y.%m.%d')
df = df.head(50)

In [13]:
for title, kdc, date_num in zip(df['TITLE_NM'], df['KDC_NM'], df['LOAN_DATE']):
    print(f"제목 : {title} / kdc : {kdc} / 날짜 : {date_num}")
    print("")

제목 : 생각이 너무 많은 서른 살에게 : 25년간 세계 최고의 인재들과 일하며 배운 것들 / kdc : 325.211-ㄱ873ㅅ-2 / 날짜 : 2022.01.01

제목 : 나보다 소중한 사람이 생겨버렸다  : 프레드릭 배크만 에세이 / kdc : 859.7-ㅂ682ㄴ / 날짜 : 2022.01.01

제목 : 믿는 만큼 자라는 아이들 : 박혜란의 세 아들 이야기 / kdc : 598.104-ㅂ576ㅁ4 / 날짜 : 2022.01.01

제목 : 트렌드 코리아 2022  : 서울대 소비트렌드분석센터의 2022 전망 / kdc : 320.911-ㅌ94ㅁ-2022 / 날짜 : 2022.01.01

제목 : 파친코. 1 / kdc : 843-ㅇ733ㅍ-1-3 / 날짜 : 2022.01.01

제목 : 일의 격 : 성장하는 나, 성공하는 조직, 성숙한 삶 / kdc : 325.211-ㅅ856ㅇ-2 / 날짜 : 2022.01.01

제목 : 밀레니얼-Z세대 트렌드 2022  : 하나로 정의할 수 없는 MZ세대와 새로운 법칙을 만들어가는 Z세대 / kdc : 331.234-ㄷ51ㅁ-2022 / 날짜 : 2022.01.01

제목 : 일기  : 황정은 에세이 / kdc : 814.7-ㅎ787ㅇ-2 / 날짜 : 2022.01.02

제목 : 매우 예민한 사람들을 위한 책  : 뇌과학과 정신의학이 들려주는 당신 마음에 대한 이야기 / kdc : W화제 182.12-ㅈ336ㅁ-6 / 날짜 : 2022.01.02

제목 : 달러구트 꿈 백화점  : 주문하신 꿈은 매진입니다  : 이미예 장편소설 / kdc : 813.7-ㅇ732ㄷ-3 / 날짜 : 2022.01.02

제목 : 파리에서 도시락을 파는 여자  : 최정상으로 가는 7가지 부의 시크릿 / kdc : 325.211-ㅊ724ㅍ2 / 날짜 : 2022.01.02

제목 : 컨버전스 2030 : 앞으로 10년 우리의 삶은 어떻게 바뀔 것인가?  : 미래의 부와 기회 / kdc : 321.97-ㄷ97ㅋ-2 / 날짜 : 2022

In [18]:
df = pd.read_csv("./NL_CO_LOAN_PUB_202305-6.csv")
df

Unnamed: 0,SEQ_NO,BOOK_KEY_NO,MBER_SEQ_NO_VALUE,LON_DE,RTURN_DE,LBRRY_CD,LON_STLE_NM,RTURN_STLE_NM,RTURN_PREARNGE_DE,RESVE_DE,RESVE_END_DE,LON_STATE_NM,LON_PLACE_NM,RTURN_PLACE_NM,MANAGE_LBRRY_CD,LON_LBRRY_CD,RTURN_LBRRY_CD,MASTR_LBRRY_CD
0,48763587,12110127,4BD3A82F84C8235480772DC14C7F112F6EF0297C,2023-05-20 12:00:00.0,,31112,일반대출,대출중,2023-06-03 12:00:00.0,,,대출중,Eco,,JIN00000,JIN00000,,31100
1,250000171569425,250000131728627,e3f149c80912a4efd0e7402857454fdd9afc8ad6f17cc2...,2021-03-12 12:00:00.0,2023-04-30 12:00:00.0,29902,1,3,2021-03-26 12:00:00.0,,,5,KLAS,Flex_AMH,AB,AB,,29900
2,48763588,12110126,4BD3A82F84C8235480772DC14C7F112F6EF0297C,2023-05-20 12:00:00.0,,31112,일반대출,대출중,2023-06-03 12:00:00.0,,,대출중,Eco,,JIN00000,JIN00000,,31100
3,250000306379574,250000283798976,0d3ad4481254933741ea24b79803c418849e9ddf93faaa...,2021-08-06 12:00:00.0,2023-04-30 12:00:00.0,29902,1,3,2022-04-30 12:00:00.0,,,5,KLAS,Flex_AMH,AB,AB,,29900
4,48763589,12109966,4BD3A82F84C8235480772DC14C7F112F6EF0297C,2023-05-20 12:00:00.0,,31112,일반대출,대출중,2023-06-03 12:00:00.0,,,대출중,Eco,,JIN00000,JIN00000,,31100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1873425,128510284,124779702,048ad021a79ed5cfb3012644621f3eb86d2991ca071e89...,2023-04-16 12:00:00.0,2023-05-02 12:00:00.0,33500,0,3.0,2023-05-01 12:00:00.0,,,1,KOLASIII,KOLASIII,MA,,,33500
1873426,128510878,124779944,10e1370f1c79adab65b6428633f7d3474ed7dcdb62592d...,2023-04-16 12:00:00.0,2023-05-02 12:00:00.0,33500,0,0.0,2023-05-01 12:00:00.0,,,1,KOLASIII,KOLASIII,MA,,,33500
1873427,128513546,72521383,6722c8f2dfd0321a1e0a5c09a7ddd0b5c287e58a76af44...,2023-04-17 12:00:00.0,2023-05-01 12:00:00.0,33500,0,0.0,2023-05-08 12:00:00.0,,,1,KOLASIII,KOLASIII,MA,,,33500
1873428,128513548,124822423,6722c8f2dfd0321a1e0a5c09a7ddd0b5c287e58a76af44...,2023-04-17 12:00:00.0,2023-05-01 12:00:00.0,33500,0,0.0,2023-05-08 12:00:00.0,,,1,KOLASIII,KOLASIII,MA,,,33500


In [19]:
[양천구]
2월 : 0.63
3월 : 0.65
4월 : 0.64
5월 : 0.67
6월 : 0.65
7월 : 0.69
8월 : 0.71
9월 : 0.68
10월 : 0.69
11월 : 0.67
12월 : 0.68

양천구 연평균 유산도 분석 결과 : 0.67
전체 데이터 소수점 2자리까리 반올림

SyntaxError: invalid syntax (2675966155.py, line 2)

In [26]:
print("양천구 유사도 분석 결과")
print("데이터 결과 소수점 2자리까지 반올림")
print("-" * 30)
print("2월 유사도 검증 결과 : 0.63")
print("3월 유사도 검증 결과 : 0.65")
print("4월 유사도 검증 결과 : 0.64")
print("5월 유사도 검증 결과 : 0.67")
print("6월 유사도 검증 결과 : 0.65")
print("7월 유사도 검증 결과 : 0.69")
print("8월 유사도 검증 결과 : 0.71")
print("9월 유사도 검증 결과 : 0.68")
print("10월 유사도 검증 결과 : 0.69")
print("11월 유사도 검증 결과 : 0.67")
print("12월 유사도 검증 결과 : 0.68")
print("-" * 30)
print("양천구 연평균 검증 유사도 결과 : 0.67")

양천구 유사도 분석 결과
데이터 결과 소수점 2자리까지 반올림
------------------------------
2월 유사도 검증 결과 : 0.63
3월 유사도 검증 결과 : 0.65
4월 유사도 검증 결과 : 0.64
5월 유사도 검증 결과 : 0.67
6월 유사도 검증 결과 : 0.65
7월 유사도 검증 결과 : 0.69
8월 유사도 검증 결과 : 0.71
9월 유사도 검증 결과 : 0.68
10월 유사도 검증 결과 : 0.69
11월 유사도 검증 결과 : 0.67
12월 유사도 검증 결과 : 0.68
------------------------------
양천구 연평균 검증 유사도 결과 : 0.67


In [27]:
word = "path = "./drive/MyDrive/INFO_BOOK_FINAL.csv\
\
\
chunk_size = 3000000\
\
for cnt, chunk in enumerate(pd.read_csv(path, chunksize=chunk_size)):\
  chunk = chunk[['CTRL_NO', "LBRRY_CD", "TITLE_NM", "KDC_NM", "ISBN_THIRTEEN_ORGT_NO", "CL_SMBL_NO"]]\
  chunk.to_csv(f"./drive/MyDrive/df_{cnt}.csv")"

SyntaxError: invalid syntax (1359614655.py, line 1)

In [None]:
path = "./drive/MyDrive/INFO_BOOK_FINAL.csv"


chunk_size = 3000000

for cnt, chunk in enumerate(pd.read_csv(path, chunksize=chunk_size)):
  chunk = chunk[['CTRL_NO', "LBRRY_CD", "TITLE_NM", "KDC_NM", "ISBN_THIRTEEN_ORGT_NO", "CL_SMBL_NO"]]
  chunk.to_csv(f"./drive/MyDrive/df_{cnt}.csv")

In [31]:
word = 'for cnt, chunk in enumerate(pd.read_csv(path, chunksize=chunk_size)):'
word.upper()

'FOR CNT, CHUNK IN ENUMERATE(PD.READ_CSV(PATH, CHUNKSIZE=CHUNK_SIZE)):'