In [8]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

rating_data = pd.read_csv("https://raw.githubusercontent.com/Bangkit-Booker-Project/MachineLearning/main/Dataset/rating_dataset.csv")
book_data = pd.read_csv("https://raw.githubusercontent.com/Bangkit-Booker-Project/MachineLearning/main/Dataset/book_dataset.csv")

In [30]:
book_data['genre'] = book_data['bookGenre1'] + ',' + book_data['bookGenre2'] + ',' + book_data['bookGenre3']
book_data_filtered = book_data[['ISBN', 'bookTitle', 'bookAuthor', 'bookGenre1', 'bookGenre2', 'bookGenre3', 'genre']]


In [31]:
book_data_filtered.head(5)

Unnamed: 0,ISBN,bookTitle,bookAuthor,bookGenre1,bookGenre2,bookGenre3,genre
0,446310786,To Kill a Mockingbird,Harper Lee,Classics,Fiction,Historical-HistoricalFiction,"Classics,Fiction,Historical-HistoricalFiction"
1,055321215X,Pride and Prejudice,Jane Austen,Classics,Fiction,Romance,"Classics,Fiction,Romance"
2,451526341,Animal Farm,George Orwell,Classics,Fiction,ScienceFiction-Dystopia,"Classics,Fiction,ScienceFiction-Dystopia"
3,446365386,Gone with the Wind,Margaret Mitchell,Classics,Historical-HistoricalFiction,Fiction,"Classics,Historical-HistoricalFiction,Fiction"
4,60256664,The Giving Tree,Shel Silverstein,Childrens,Childrens-PictureBooks,Classics,"Childrens,Childrens-PictureBooks,Classics"


In [14]:
book_data_filtered['bookGenre1'][0:].count

<bound method Series.count of 0        Classics
1        Classics
2        Classics
3        Classics
4       Childrens
          ...    
1531      Fantasy
1532      Fiction
1533      Fiction
1534    Christian
1535      History
Name: bookGenre1, Length: 1536, dtype: object>

In [32]:
 # Inisialisasi TfidfVectorizer
tf = TfidfVectorizer()
 
# Melakukan perhitungan idf pada data cuisine
tf.fit(book_data_filtered['genre']) 
 
# Mapping array dari fitur index integer ke fitur nama
tf.get_feature_names() 



['academic',
 'action',
 'adultfiction',
 'adventure',
 'africa',
 'africanamerican',
 'albanianliterature',
 'alternatehistory',
 'american',
 'americanhistory',
 'angels',
 'animals',
 'anthropology',
 'apocalyptic',
 'art',
 'arthurian',
 'asia',
 'asianliterature',
 'astronomy',
 'australia',
 'autobiography',
 'baseball',
 'biography',
 'biology',
 'bookclub',
 'booksaboutbooks',
 'brazil',
 'britishliterature',
 'business',
 'canada',
 'chicklit',
 'childrens',
 'china',
 'christian',
 'christianfiction',
 'christianity',
 'christianliving',
 'christmas',
 'civilwar',
 'classics',
 'comedy',
 'comics',
 'comingofage',
 'contemporary',
 'contemporaryromance',
 'crime',
 'cultural',
 'cyberpunk',
 'darkfantasy',
 'dragons',
 'drama',
 'drawing',
 'dystopia',
 'economics',
 'environment',
 'epicfantasy',
 'espionage',
 'essays',
 'europeanliterature',
 'evolution',
 'fae',
 'fairies',
 'fairytales',
 'fantasy',
 'feminism',
 'fiction',
 'finance',
 'food',
 'foodanddrink',
 'footbal

In [33]:
# Melakukan fit lalu ditransformasikan ke bentuk matrix
tfidf_matrix = tf.fit_transform(book_data_filtered['genre']) 
 
# Melihat ukuran matrix tfidf
tfidf_matrix.shape 

(1536, 181)

In [34]:
# Mengubah vektor tf-idf dalam bentuk matriks dengan fungsi todense()
tfidf_matrix.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.46213191],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [35]:
# Membuat dataframe untuk melihat tf-idf matrix
# Kolom diisi dengan jenis masakan
# Baris diisi dengan nama resto
 
pd.DataFrame(
    tfidf_matrix.todense(), 
    columns=tf.get_feature_names(),
    index=book_data_filtered['bookTitle']
).sample(22, axis=1).sample(10, axis=0)



Unnamed: 0_level_0,comedy,cultural,poetry,school,ldsfiction,asia,health,alternatehistory,postapocalyptic,football,...,christianliving,mystery,italy,britishliterature,southerngothic,science,westerns,historicalromance,romanticsuspense,selfhelp
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Last of the Really Great Whangdoodles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Strangers on a Train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.661273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Old Yeller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Eye of the Needle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Nine Stories,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
His Dark Materials,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Can You Keep a Secret?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The French Lieutenant's Woman,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Stormbreaker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Diary of a Drug Fiend,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Menghitung cosine similarity pada matrix tf-idf
cosine_sim = cosine_similarity(tfidf_matrix) 
cosine_sim

array([[1.        , 0.4144056 , 0.26230236, ..., 0.05115634, 0.        ,
        0.        ],
       [0.4144056 , 1.        , 0.28344805, ..., 0.05528035, 0.        ,
        0.        ],
       [0.26230236, 0.28344805, 1.        , ..., 0.03499027, 0.        ,
        0.        ],
       ...,
       [0.05115634, 0.05528035, 0.03499027, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [37]:
# Membuat dataframe dari variabel cosine_sim dengan baris dan kolom berupa nama resto
cosine_sim_df = pd.DataFrame(cosine_sim, index=book_data_filtered['bookTitle'], columns=book_data_filtered['bookTitle'])
print('Shape:', cosine_sim_df.shape)
cosine_sim_df.to_csv('cosine.csv')
# Melihat similarity matrix pada setiap resto
cosine_sim_df.sample(5, axis=1).sample(10, axis=0)

Shape: (1536, 1536)


bookTitle,Kushiel's Dart,Fever Pitch,Nights at the Circus,"I, Claudius",My Side of the Mountain
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
The Matarese Circle,0.07493,0.022649,0.056327,0.070547,0.078473
Island of the Sequined Love Nun,0.404677,0.021337,0.304207,0.06646,0.073927
Salem Falls,0.048549,0.014675,0.036495,0.045709,0.050844
The Cay,0.072617,0.02195,0.054588,0.652648,0.523787
The Two Princesses of Bamarre,0.479768,0.025297,0.360655,0.078793,0.603636
Three Men in a Boat,0.071508,0.021615,0.053754,0.357889,0.398094
The Richest Man in Babylon,0.0,0.0,0.0,0.0,0.0
The Snow Leopard,0.0,0.0,0.0,0.0,0.0
Public Secrets,0.558176,0.0,0.0,0.0,0.0
A Scanner Darkly,0.042466,0.012836,0.031923,0.039982,0.044474


In [38]:
def books_recommendations(bookTitle, similarity_data=cosine_sim_df, items=book_data_filtered[['bookTitle', 'genre', 'ISBN']], k=10):
    """
    Rekomendasi Resto berdasarkan kemiripan dataframe
 
    Parameter:
    ---
    nama_resto : tipe data string (str)
                Nama Restoran (index kemiripan dataframe)
    similarity_data : tipe data pd.DataFrame (object)
                      Kesamaan dataframe, simetrik, dengan resto sebagai 
                      indeks dan kolom
    items : tipe data pd.DataFrame (object)
            Mengandung kedua nama dan fitur lainnya yang digunakan untuk mendefinisikan kemiripan
    k : tipe data integer (int)
        Banyaknya jumlah rekomendasi yang diberikan
    ---
 
 
    Pada index ini, kita mengambil k dengan nilai similarity terbesar 
    pada index matrix yang diberikan (i).
    """
    # Mengambil data dengan menggunakan argpartition untuk melakukan partisi secara tidak langsung sepanjang sumbu yang diberikan    
    # Dataframe diubah menjadi numpy
    # Range(start, stop, step)
    index = similarity_data.loc[:,bookTitle].to_numpy().argpartition(
        range(-1, -k, -1))
    
    # Mengambil data dengan similarity terbesar dari index yang ada
    closest = similarity_data.columns[index[-1:-(k+2):-1]]
    
    # Drop nama_resto agar nama resto yang dicari tidak muncul dalam daftar rekomendasi
    closest = closest.drop(bookTitle, errors='ignore')
 
    return pd.DataFrame(closest).merge(items).head(k)

In [39]:
book_data_filtered[book_data_filtered.bookTitle.eq('The Sight')]

Unnamed: 0,ISBN,bookTitle,bookAuthor,bookGenre1,bookGenre2,bookGenre3,genre
388,014250047X,The Sight,David Clement-Davies,Fantasy,YoungAdult,Animals,"Fantasy,YoungAdult,Animals"


In [40]:
books_recommendations('The Sight')

Unnamed: 0,bookTitle,genre,ISBN
0,Fire Bringer,"Fantasy,YoungAdult,Animals",142300608
1,Tailchaser's Song,"Fantasy,Fiction,Animals",886773741
2,The Horse Whisperer,"Fiction,Romance,Animals",440222656
3,The Loop,"Fiction,Romance,Animals",440224624
4,War Horse,"Historical-HistoricalFiction,Fiction,Animals",749704454
5,Modoc: The True Story of the Greatest Elephant...,"Nonfiction,Animals,Biography",60182571
6,The Lord God Made Them All,"Nonfiction,Animals,Biography",312498349
7,Animal Liberation,"Philosophy,Nonfiction,Animals",380713330
8,Many Waters,"Fantasy,YoungAdult,Fiction",440405483
9,Abhorsen,"Fantasy,YoungAdult,Fiction",60278269
