In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

rating_data = pd.read_csv("https://raw.githubusercontent.com/Bangkit-Booker-Project/MachineLearning/main/Dataset/rating_dataset.csv")
book_data = pd.read_csv("https://raw.githubusercontent.com/Bangkit-Booker-Project/MachineLearning/main/Dataset/book_dataset.csv")

In [2]:
book_data['genre'] = book_data['bookGenre1'] + ',' + book_data['bookGenre2'] + ',' + book_data['bookGenre3']
book_data_filtered = book_data[['ISBN', 'bookTitle', 'bookAuthor', 'bookGenre1', 'bookGenre2', 'bookGenre3', 'genre']]


In [3]:
book_data_filtered.head(5)

Unnamed: 0,ISBN,bookTitle,bookAuthor,bookGenre1,bookGenre2,bookGenre3,genre
0,446310786,To Kill a Mockingbird,Harper Lee,Classics,Fiction,Historical-HistoricalFiction,"Classics,Fiction,Historical-HistoricalFiction"
1,055321215X,Pride and Prejudice,Jane Austen,Classics,Fiction,Romance,"Classics,Fiction,Romance"
2,451526341,Animal Farm,George Orwell,Classics,Fiction,ScienceFiction-Dystopia,"Classics,Fiction,ScienceFiction-Dystopia"
3,446365386,Gone with the Wind,Margaret Mitchell,Classics,Historical-HistoricalFiction,Fiction,"Classics,Historical-HistoricalFiction,Fiction"
4,60256664,The Giving Tree,Shel Silverstein,Childrens,Childrens-PictureBooks,Classics,"Childrens,Childrens-PictureBooks,Classics"


In [4]:
book_data_filtered['bookGenre1'][0:].count

<bound method Series.count of 0        Classics
1        Classics
2        Classics
3        Classics
4       Childrens
          ...    
1531      Fantasy
1532      Fiction
1533      Fiction
1534    Christian
1535      History
Name: bookGenre1, Length: 1536, dtype: object>

In [5]:
 # Inisialisasi TfidfVectorizer
tf = TfidfVectorizer()
 
# Melakukan perhitungan idf pada data genre
tf.fit(book_data_filtered['genre']) 
 
# Mapping array dari fitur index integer ke fitur nama
tf.get_feature_names() 



['academic',
 'action',
 'adultfiction',
 'adventure',
 'africa',
 'africanamerican',
 'albanianliterature',
 'alternatehistory',
 'american',
 'americanhistory',
 'angels',
 'animals',
 'anthropology',
 'apocalyptic',
 'art',
 'arthurian',
 'asia',
 'asianliterature',
 'astronomy',
 'australia',
 'autobiography',
 'baseball',
 'biography',
 'biology',
 'bookclub',
 'booksaboutbooks',
 'brazil',
 'britishliterature',
 'business',
 'canada',
 'chicklit',
 'childrens',
 'china',
 'christian',
 'christianfiction',
 'christianity',
 'christianliving',
 'christmas',
 'civilwar',
 'classics',
 'comedy',
 'comics',
 'comingofage',
 'contemporary',
 'contemporaryromance',
 'crime',
 'cultural',
 'cyberpunk',
 'darkfantasy',
 'dragons',
 'drama',
 'drawing',
 'dystopia',
 'economics',
 'environment',
 'epicfantasy',
 'espionage',
 'essays',
 'europeanliterature',
 'evolution',
 'fae',
 'fairies',
 'fairytales',
 'fantasy',
 'feminism',
 'fiction',
 'finance',
 'food',
 'foodanddrink',
 'footbal

In [6]:
# Melakukan fit lalu ditransformasikan ke bentuk matrix
tfidf_matrix = tf.fit_transform(book_data_filtered['genre']) 
 
# Melihat ukuran matrix tfidf
tfidf_matrix.shape 

(1536, 181)

In [7]:
# Mengubah vektor tf-idf dalam bentuk matriks dengan fungsi todense()
tfidf_matrix.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.46213191],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [None]:
# Membuat dataframe untuk melihat tf-idf matrix
# Kolom diisi dengan genre buku
# Baris diisi dengan nama buku
 
pd.DataFrame(
    tfidf_matrix.todense(), 
    columns=tf.get_feature_names(),
    index=book_data_filtered['bookTitle']
).sample(22, axis=1).sample(10, axis=0)



Unnamed: 0_level_0,westerns,humor,bookclub,fantasy,presidents,christianity,crime,adventure,selfhelp,cultural,...,newyork,pirates,lgbt,race,language,ldsfiction,lds,magic,fairies,chicklit
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mother,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.427573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Art of Seduction,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.677369,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The First Man in Rome,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Looking for Alibrandi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Z for Zachariah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Birdsong: A Novel of Love and War,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Blind Watchmaker: Why the Evidence of Evolution Reveals a Universe Without Design,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
If You Really Loved Me,0.0,0.0,0.0,0.0,0.0,0.0,0.77025,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rifles for Watie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Hotel New Hampshire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Menghitung cosine similarity pada matrix tf-idf
cosine_sim = cosine_similarity(tfidf_matrix) 
cosine_sim

array([[1.        , 0.4144056 , 0.26230236, ..., 0.05115634, 0.        ,
        0.        ],
       [0.4144056 , 1.        , 0.28344805, ..., 0.05528035, 0.        ,
        0.        ],
       [0.26230236, 0.28344805, 1.        , ..., 0.03499027, 0.        ,
        0.        ],
       ...,
       [0.05115634, 0.05528035, 0.03499027, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [None]:
# Membuat dataframe dari variabel cosine_sim dengan baris dan kolom berupa nama buku
cosine_sim_df = pd.DataFrame(cosine_sim, index=book_data_filtered['bookTitle'], columns=book_data_filtered['bookTitle'])
print('Shape:', cosine_sim_df.shape)
#Menyimpan cosine sim dalam bentuk csv
cosine_sim_df.to_csv('cosine.csv')
# Melihat similarity matrix pada setiap buku
cosine_sim_df.sample(5, axis=1).sample(10, axis=0)

Shape: (1536, 1536)


bookTitle,The Blank Slate: The Modern Denial of Human Nature,The Famished Road,The Little House,The Bloody Chamber and Other Stories,The Armageddon Rag
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
The Dark Half,0.0,0.034647,0.029013,0.053773,0.646074
Perelandra,0.0,0.041765,0.034973,0.371608,0.4145
The Ransom of Red Chief,0.0,0.033768,0.028277,0.762942,0.058459
Sister Carrie,0.0,0.037368,0.031292,0.057996,0.064691
A Handful of Dust,0.0,0.037368,0.031292,0.057996,0.064691
Job: A Comedy of Justice,0.0,0.041765,0.034973,0.371608,0.4145
Taran Wanderer,0.0,0.043035,0.036037,0.38291,0.427107
"Life, the Universe and Everything",0.0,0.034383,0.028791,0.053363,0.059522
The Man Who Loved Clowns,0.0,0.027941,0.023397,0.043365,0.04837
The Assassination of Jesse James by the Coward Robert Ford,0.0,0.025874,0.021666,0.040156,0.044791


In [None]:
#Membaca data pada cosine sim
data = pd.read_csv('https://raw.githubusercontent.com/Bangkit-Booker-Project/MachineLearning/main/Content_Based/cosine.csv')
data = data.set_index('bookTitle')
cosine_sim_df.sample(5, axis=1).sample(10, axis=0)

bookTitle,To Say Nothing of the Dog,A Painted House,Birdy,Prodigal Summer,The Pilgrim's Progress
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Night Probe!,0.030994,0.052871,0.036231,0.028989,0.045365
Mother,0.02062,0.035175,0.128132,0.019286,0.160436
The Wishsong of Shannara,0.022345,0.038117,0.026121,0.0209,0.032706
Keeping the Moon,0.0,0.0,0.0,0.244616,0.0
The Secret,0.0,0.0,0.0,0.0,0.0
Born in Fire,0.0,0.0,0.0,0.15359,0.0
The Immortal Highlander,0.0,0.0,0.0,0.0,0.0
The Amityville Horror,0.02675,0.045632,0.03127,0.02502,0.039154
A Thousand Acres,0.029088,0.04962,0.034003,0.246189,0.042576
Life After God,0.022903,0.039068,0.026772,0.021421,0.033522


In [None]:
#Fungsi untuk melakukan rekomendasi berbasis konten(genre)
def books_recommendations(bookTitle, similarity_data=cosine_sim_df, k=10):
    """
    Rekomendasi Resto berdasarkan kemiripan dataframe
 
    Parameter:
    ---
    book_title : tipe data string (str)
                Nama Buku (index kemiripan dataframe)
    similarity_data : tipe data pd.DataFrame (object)
                      Kesamaan dataframe, simetrik, dengan buku sebagai 
                      indeks dan kolom
    k : tipe data integer (int)
        Banyaknya jumlah rekomendasi yang diberikan
    ---
 
 
    Pada index ini, kita mengambil k dengan nilai similarity terbesar 
    pada index matrix yang diberikan (i).
    """
    # Mengambil data dengan menggunakan argpartition untuk melakukan partisi secara tidak langsung sepanjang sumbu yang diberikan    
    # Dataframe diubah menjadi numpy
    # Range(start, stop, step)
    index = similarity_data.loc[:,bookTitle].to_numpy().argpartition(
        range(-1, -k, -1))
    
    # Mengambil data dengan similarity terbesar dari index yang ada
    closest = similarity_data.columns[index[-1:-(k+2):-1]]
    
    # Drop book_title agar nama buku yang dicari tidak muncul dalam daftar rekomendasi
    closest = closest.drop(bookTitle, errors='ignore')
 
    return pd.DataFrame(closest).head(k)

In [None]:
book_data_filtered[book_data_filtered.bookTitle.eq('The Sight')]

Unnamed: 0,ISBN,bookTitle,bookAuthor,bookGenre1,bookGenre2,bookGenre3,genre
388,014250047X,The Sight,David Clement-Davies,Fantasy,YoungAdult,Animals,"Fantasy,YoungAdult,Animals"


In [None]:
books_recommendations('The Sight')

Unnamed: 0,bookTitle
0,Fire Bringer
1,Tailchaser's Song
2,The Horse Whisperer
3,The Loop
4,War Horse
5,Modoc: The True Story of the Greatest Elephant...
6,The Lord God Made Them All
7,Animal Liberation
8,Many Waters
9,Abhorsen
