In [73]:
import re
import os
import json

from pathlib import Path
from datetime import date, datetime, timedelta


import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity


import ast

from copy import deepcopy

In [2]:
df_start = pd.read_csv('datasets/books_with_genre.csv')
df_start.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,# num_pages,ratings_count,text_reviews_count,genre
0,792,london city guide lonely planet city guide,"['lonely planet', 'sarah johnstone', 'tom mast...",4.03,1740598318,9781740598316,eng,466,38,5,['travel']
1,797,lonely planet londres,"['lonely planet', 'sarah johnstone', 'tom mast...",4.03,8408064762,9788408064763,spa,480,0,0,['travel']
2,815,three nights august strategy heartbreak joy in...,['bissinger'],3.87,618710531,9780618710539,eng,287,6560,235,['sports recreation']
3,828,interface,"['neal stephenson', 'george jewsbury', 'stephe...",3.67,553383434,9780553383430,eng,640,4572,259,['fiction']
4,830,snow crash,['neal stephenson'],4.03,553380958,9780553380958,eng,438,187493,6601,['fiction']


In [3]:
pattern = re.compile(r'\s+')

def clean_list_text(list_string):
    try:
        list_of_author = ast.literal_eval(list_string)
        rem_space_list = [re.sub(pattern, '', value) for value in list_of_author]
        return " ".join(rem_space_list)
    except:
        return ""

In [4]:
def brief_genre(list_genre):
    try:
        list_of_genre = ast.literal_eval(list_genre)
        all_genre = " ".join(list_of_genre)
        split_genre = list(set(all_genre.split()))
        
        return " ".join(split_genre)
    except:
        return ""

In [5]:
# joined_aut = 
# df_start['authors'].apply(clean_list_text)
df_start['author_info'] = df_start['authors'].apply(clean_list_text)


In [6]:
df_start['language'] = df_start['language_code'].apply(lambda x: "other" if x in ['mul', 'en-CA', 'fre',
       'por', 'zho', 'ger', 'lat', 'rus', 'grc'] else x.lower())

In [7]:
df_start['language'].value_counts()

eng      1518
en-us     258
en-gb      51
spa        42
other      29
jpn        10
Name: language, dtype: int64

In [8]:
df_start['type'] = df_start['genre'].apply(brief_genre)

In [9]:
df_start['type'].value_counts()

fiction                             709
juvenile fiction                    123
autobiography biography              96
history                              68
drama                                48
                                   ... 
childbirth                            1
byzantine empire fantasy fiction      1
norwegian fiction                     1
dragons                               1
war 1914 world 1918                   1
Name: type, Length: 344, dtype: int64

In [10]:
df_soup = deepcopy(df_start)

In [12]:
df_soup.drop(columns=[ 'authors', 'isbn', 'isbn13', 'language_code', 'genre', ], inplace=True)

In [14]:
# df_soup['meta_info'] = 
# df_soup['title'], df_soup['author_info'], df_soup['type']

cols = ['title', 'author_info', 'type']
df_soup['meta_info'] = df_soup[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [16]:
df_soup.drop(columns=cols, inplace=True)

In [20]:
df_meta = pd.get_dummies(df_soup, columns=['language'])

In [24]:
df_meta.rename(columns={"# num_pages": "page_num"}, inplace=True)

In [42]:
df_meta.head()

Unnamed: 0_level_0,average_rating,page_num,ratings_count,text_reviews_count,meta_info,language_en-gb,language_en-us,language_eng,language_jpn,language_other,language_spa
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
london city guide lonely planet city guide,0.287373,0.520801,-0.220259,-0.269408,london city guide lonely planet city guide lon...,0,0,1,0,0,0
lonely planet londres,0.287373,0.584289,-0.220817,-0.272271,lonely planet londres lonelyplanet sarahjohnst...,0,0,0,0,0,1
three nights august strategy heartbreak joy inside mind manager,-0.181642,-0.290943,-0.124552,-0.137702,three nights august strategy heartbreak joy in...,0,0,1,0,0,0
interface,-0.76791,1.30987,-0.153725,-0.123958,interface nealstephenson georgejewsbury stephe...,0,0,1,0,0,0
snow crash,0.287373,0.393824,2.530544,3.507692,snow crash nealstephenson fiction,0,0,1,0,0,0


In [27]:
df_meta['title'] = df_start['title']

In [41]:
df_meta.set_index("title", inplace=True)
# df_meta.drop(columns=["bookID"], inplace=True)

In [39]:
num_cols = ['average_rating', 'page_num', 'ratings_count', 'text_reviews_count',]
df_meta[num_cols] = StandardScaler().fit_transform(df_meta[num_cols])

In [44]:
tf = TfidfVectorizer(ngram_range=(1,6), min_df = 0, stop_words = 'english', sublinear_tf=True)
tfidf_matrix = tf.fit_transform(df_meta['meta_info'])

tfidf_matrix.shape

(1908, 40908)

In [45]:
df_meta_vect = pd.DataFrame(tfidf_matrix.toarray(), columns=tf.get_feature_names())

In [61]:
df_meta_vect.set_index(df_start.title, inplace=True)

In [63]:
df_cos = pd.concat([df_meta, df_meta_vect], axis=1, )

In [69]:
df_cos.drop(columns=['meta_info'], inplace=True)

In [72]:
df_cos.apply(sum)

average_rating                                           -4.083400e-13
page_num                                                 -1.131317e-13
ratings_count                                             4.421463e-14
text_reviews_count                                        3.563816e-14
language_en-gb                                            5.100000e+01
                                                              ...     
zur genealogie der moral friedrichnietzsche               2.207019e-01
zur genealogie der moral friedrichnietzsche asceticism    2.207019e-01
zyberk                                                    1.342987e-01
zyberk jeffspeck                                          1.342987e-01
zyberk jeffspeck architecture                             1.342987e-01
Length: 40918, dtype: float64

In [74]:
cosine_sim = cosine_similarity(df_cos.to_numpy(), df_cos.to_numpy())

In [75]:
cosine_sim

array([[ 1.        ,  0.32884527,  0.37298748, ...,  0.53439953,
         0.02898007, -0.01865784],
       [ 0.32884527,  1.        , -0.06714475, ...,  0.10248781,
         0.02142254, -0.03034492],
       [ 0.37298748, -0.06714475,  1.        , ...,  0.4474616 ,
         0.03323951,  0.05826022],
       ...,
       [ 0.53439953,  0.10248781,  0.4474616 , ...,  1.        ,
         0.03741993,  0.01645337],
       [ 0.02898007,  0.02142254,  0.03323951, ...,  0.03741993,
         1.        ,  0.5725183 ],
       [-0.01865784, -0.03034492,  0.05826022, ...,  0.01645337,
         0.5725183 ,  1.        ]])

In [97]:
def content_recommender(title, cosine_sim, df, indices):
    # Obtain the index of the movie that matches the title
    idx = indices[title]

    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    
    # Get the pairwsie similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    print("idx data: ", type(idx))
    print("SIM SCORE: ", sim_scores)

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]


In [78]:
df_start = df_start.reset_index()
indices2 = pd.Series(df_start.index, index=df_start['title'])

In [98]:
content_recommender('iliad', 
                    cosine_sim,
                    df_start,
                    indices2)

idx data:  <class 'numpy.int64'>
SIM SCORE:  [(0, 0.06198793959176589), (1, -0.06241048133043024), (2, -0.012674165828473525), (3, 0.2464402972899271), (4, 0.7891520535672704), (5, 0.3201904456197023), (6, 0.880329333227441), (7, -0.17023700413429518), (8, 0.20720474991426793), (9, -0.21674012246629656), (10, -0.17708387156634997), (11, 0.06468561209350418), (12, -0.014042245077690944), (13, -0.08199313845014976), (14, -0.11006114048066565), (15, -0.039506552100465664), (16, 0.11992502200908574), (17, -0.06941355631690314), (18, -0.21125708488173253), (19, 0.6867055995332146), (20, -0.1878929780771618), (21, -0.1642224837275019), (22, -0.19050942143342212), (23, 0.5585807562341087), (24, 0.22814552882628017), (25, -0.03889018193227758), (26, 0.16066267361421352), (27, 0.13373130499039784), (28, 0.10603241765268442), (29, -0.04434914445755397), (30, -0.2160339870700706), (31, 0.17898535154161038), (32, -0.12399359970410341), (33, -0.22868256643187032), (34, 0.23572674154121198), (35, 1.

575     chronicles narnia chronicles narnia
515       confessions shopaholic shopaholic
1344                          pelican brief
380             jurassic park jurassic park
481                                 macbeth
6                                  mice men
730                                 othello
1082             red dragon hannibal lecter
1738                            holes holes
995                 left behind left behind
Name: title, dtype: object