# **Preparing Dataset from Kaggle**

In [1]:
# Instal library Kaggle
!pip install -q kaggle

In [2]:
# Membuat direktori file konfigurasi Kaggle
!mkdir ~/.kaggle

In [3]:
# Salin file API Kaggle 'kaggle.json' ke direktori konfigurasi
!cp kaggle.json ~/.kaggle/

cp: cannot stat 'kaggle.json': No such file or directory


In [4]:
# Mengatur izin file 'kaggle.json' agar hanya dapat diakses oleh pemilik
!chmod 600 ~/.kaggle/kaggle.json

chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [5]:
# Unduh dataset menggunakan Kaggle API
!kaggle datasets download -d 'mohamedbakhet/amazon-books-reviews'

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
Downloading amazon-books-reviews.zip to /content
 98% 1.04G/1.06G [00:07<00:00, 85.3MB/s]
100% 1.06G/1.06G [00:07<00:00, 143MB/s] 


# **Import Library**

In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model, Sequential  # Correct import
from pathlib import Path

# **Dataset Extraction & Load**

In [7]:
# Ekstrak file zip yang di unduh
dataset_zip = zipfile.ZipFile('amazon-books-reviews.zip','r')
dataset_zip.extractall()
dataset_zip.close()

In [8]:
# Load dataset books_data.csv
books_df = pd.read_csv('books_data.csv')
# Membaca file "Books_data.csv" dan mengonversinya ke DataFrame "ratings_df"
books_df.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,


In [9]:
# Load dataset books_rating.csv
ratings_df = pd.read_csv('Books_rating.csv')
# Membaca file "Books_rating.csv" dan mengonversinya ke DataFrame "ratings_df"
ratings_df.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


# **Number of rows and columns**

In [10]:
# Mengembalikan jumlah baris dan kolom pada DataFrame "books_df"
books_df.shape

(212404, 10)

In [11]:
# Mengembalikan jumlah baris dan kolom pada DataFrame "ratings_df"
ratings_df.shape

(3000000, 10)

# **Descriptive Statistics Summary**

In [12]:
# Membuat ringkasan statistik deskriptif dari DataFrame "books_df"
books_df.describe(include='all')
# count = jumlah nilai non-missing

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
count,212403,143962,180991,160329,188568,136518,187099.0,188568,171205,49752.0
unique,212403,133226,127278,149387,188099,16016,11582.0,184506,10883,
top,Student's Solutions Manual for Johnson/Mowry's...,Unlike some other reproductions of classic tex...,['Rose Arny'],http://books.google.com/books/content?id=7dMSA...,http://books.google.com/books?id=acwPAgAAQBAJ&...,Simon and Schuster,2000.0,https://play.google.com/store/books/details?id...,['Fiction'],
freq,1,92,236,79,17,3454,4286.0,18,23419,
mean,,,,,,,,,,21.252975
std,,,,,,,,,,201.340431
min,,,,,,,,,,1.0
25%,,,,,,,,,,1.0
50%,,,,,,,,,,2.0
75%,,,,,,,,,,5.0


In [13]:
# Membuat ringkasan statistik deskriptif dari DataFrame "ratings_df"
ratings_df.describe(include='all')

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
count,3000000,2999792,481171.0,2438213,2438095,3000000,3000000.0,3000000.0,2999593,2999992
unique,221998,212403,,1008972,854145,12084,,,1592314,2062648
top,B000IEZE3G,The Hobbit,,A14OJS0VWMOSWO,Midwest Book Review,0/0,,,Great Book,digital books are perfect and easy to use! The...
freq,6796,22023,,5795,5817,885732,,,6848,322
mean,,,21.762656,,,,4.215289,1132307000.0,,
std,,,26.206541,,,,1.203054,149320200.0,,
min,,,1.0,,,,1.0,-1.0,,
25%,,,10.78,,,,4.0,999907200.0,,
50%,,,14.93,,,,5.0,1128298000.0,,
75%,,,23.95,,,,5.0,1269130000.0,,


In [14]:
# Menampilkan ringkasan struktur dan isi DataFrame
print("Dataset Books")
books_df.info()

print("\nDataset Ratings")
ratings_df.info()

Dataset Books
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Title          212403 non-null  object 
 1   description    143962 non-null  object 
 2   authors        180991 non-null  object 
 3   image          160329 non-null  object 
 4   previewLink    188568 non-null  object 
 5   publisher      136518 non-null  object 
 6   publishedDate  187099 non-null  object 
 7   infoLink       188568 non-null  object 
 8   categories     171205 non-null  object 
 9   ratingsCount   49752 non-null   float64
dtypes: float64(1), object(9)
memory usage: 16.2+ MB

Dataset Ratings
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  object 
 1   Title               object 
 2   Price          

# **Missing Values ​​(Not a Number)**

In [15]:
# Menghitung jumlah NaN pada DataFrame "books_df"
books_df.isna().sum()

Unnamed: 0,0
Title,1
description,68442
authors,31413
image,52075
previewLink,23836
publisher,75886
publishedDate,25305
infoLink,23836
categories,41199
ratingsCount,162652


In [16]:
# Menghitung jumlah NaN pada DataFrame "ratings_df"
ratings_df.isna().sum()

Unnamed: 0,0
Id,0
Title,208
Price,2518829
User_id,561787
profileName,561905
review/helpfulness,0
review/score,0
review/time,0
review/summary,407
review/text,8


In [17]:
# Mencetak persentase missing value
def categorize_missing_percentage(df, dataset_name):
    total_rows = len(df)
    missing_percentage = (df.isnull().sum() / total_rows) * 100

    print(f"\nMissing Value Dataset {dataset_name}")

    categories = {
        "Rendah(0-20%)": missing_percentage[(missing_percentage >= 0) & (missing_percentage <= 20)],
        "Sedang (21-50%)": missing_percentage[(missing_percentage > 20) & (missing_percentage <= 50)],
        "Tinggi (>50%)": missing_percentage[(missing_percentage > 50)],
    }

    for category, values in categories.items():
        print(f"\n{category}:")
        if values.empty:
            print("Tidak ada kolom dengan missing value pada rentang ini")
        else:
            print(values)

categorize_missing_percentage(books_df, "books")
categorize_missing_percentage(ratings_df, "ratings")


Missing Value Dataset books

Rendah(0-20%):
Title             0.000471
authors          14.789270
previewLink      11.222011
publishedDate    11.913617
infoLink         11.222011
categories       19.396527
dtype: float64

Sedang (21-50%):
description    32.222557
image          24.516958
publisher      35.727199
dtype: float64

Tinggi (>50%):
ratingsCount    76.576712
dtype: float64

Missing Value Dataset ratings

Rendah(0-20%):
Id                     0.000000
Title                  0.006933
User_id               18.726233
profileName           18.730167
review/helpfulness     0.000000
review/score           0.000000
review/time            0.000000
review/summary         0.013567
review/text            0.000267
dtype: float64

Sedang (21-50%):
Tidak ada kolom dengan missing value pada rentang ini

Tinggi (>50%):
Price    83.960967
dtype: float64


# **Data Duplicate**

In [18]:
# Mengidentifikasi dan menghitung baris duplikat dalam dataset
print("Duplikasi pada dataset Book :", books_df.duplicated().sum())
print("Duplikasi pada dataset Rating :", ratings_df.duplicated().sum())

Duplikasi pada dataset Book : 0
Duplikasi pada dataset Rating : 8774


In [19]:
# Hapus baris duplikat pada DataFrame ratings_df
ratings_df = ratings_df.drop_duplicates()
# Tampilkan DataFrame ratings_df setelah penghapusan duplikat
print(ratings_df)

                 Id                           Title  Price         User_id  \
0        1882931173  Its Only Art If Its Well Hung!    NaN   AVCGYZL8FQQTD   
1        0826414346        Dr. Seuss: American Icon    NaN  A30TK6U7DNS82R   
2        0826414346        Dr. Seuss: American Icon    NaN  A3UH4UZ4RSVO82   
3        0826414346        Dr. Seuss: American Icon    NaN  A2MVUWT453QH61   
4        0826414346        Dr. Seuss: American Icon    NaN  A22X4XUPKF66MR   
...             ...                             ...    ...             ...   
2999995  B000NSLVCU             The Idea of History    NaN             NaN   
2999996  B000NSLVCU             The Idea of History    NaN  A1SMUB9ASL5L9Y   
2999997  B000NSLVCU             The Idea of History    NaN  A2AQMEKZKK5EE4   
2999998  B000NSLVCU             The Idea of History    NaN  A18SQGYBKS852K   
2999999  B000NSLVCU             The Idea of History    NaN             NaN   

                                profileName review/helpfulness 

In [20]:
# Drop kolom yang tidak diperlukan pada dataset Book
books_df = books_df.drop(columns=['previewLink', 'infoLink', 'ratingsCount'])
books_df.head()

Unnamed: 0,Title,description,authors,image,publisher,publishedDate,categories
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,,1996,['Comics & Graphic Novels']
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,['Biography & Autobiography']
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,,2000,['Religion']
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,iUniverse,2005-02,['Fiction']
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,,2003-03-01,


In [21]:
# Drop kolom yang tidak diperlukan pada dataset Rating
ratings_df = ratings_df.drop(columns=['Price', 'review/helpfulness', 'review/time', 'review/summary', 'review/text'])
ratings_df.head()

Unnamed: 0,Id,Title,User_id,profileName,review/score
0,1882931173,Its Only Art If Its Well Hung!,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",4.0
1,826414346,Dr. Seuss: American Icon,A30TK6U7DNS82R,Kevin Killian,5.0
2,826414346,Dr. Seuss: American Icon,A3UH4UZ4RSVO82,John Granger,5.0
3,826414346,Dr. Seuss: American Icon,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",4.0
4,826414346,Dr. Seuss: American Icon,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",4.0


# ***Case 2 : Drop data buku dengan missing value pada kolom Title & Categories***

# **Data Buku**

In [22]:
# Menghapus rows di mana categories atau title bernilai NaN
cleaned_books = books_df[~(books_df["Title"].isna() | books_df["categories"].isna())]

# Menampilkan DataFrame hasil
print(cleaned_books)

                                                    Title  \
0                          Its Only Art If Its Well Hung!   
1                                Dr. Seuss: American Icon   
2                   Wonderful Worship in Smaller Churches   
3                           Whispers of the Wicked Saints   
5       The Church of Christ: A Biblical Ecclesiology ...   
...                                                   ...   
212397  The Magic of the Soul: Applying Spiritual Powe...   
212398               Autodesk Inventor 10 Essentials Plus   
212399  The Orphan Of Ellis Island (Time Travel Advent...   
212400                            Red Boots for Christmas   
212402                                  The Autograph Man   

                                              description  \
0                                                     NaN   
1       Philip Nel takes a fascinating look into the k...   
2       This resource includes twelve principles in un...   
3       Julia Thomas fi

In [23]:
# Mengganti semua NaN dengan 'unknown'
books_filled = cleaned_books.fillna("unknown")

# Menampilkan DataFrame hasil
print(books_filled)

                                                    Title  \
0                          Its Only Art If Its Well Hung!   
1                                Dr. Seuss: American Icon   
2                   Wonderful Worship in Smaller Churches   
3                           Whispers of the Wicked Saints   
5       The Church of Christ: A Biblical Ecclesiology ...   
...                                                   ...   
212397  The Magic of the Soul: Applying Spiritual Powe...   
212398               Autodesk Inventor 10 Essentials Plus   
212399  The Orphan Of Ellis Island (Time Travel Advent...   
212400                            Red Boots for Christmas   
212402                                  The Autograph Man   

                                              description  \
0                                                 unknown   
1       Philip Nel takes a fascinating look into the k...   
2       This resource includes twelve principles in un...   
3       Julia Thomas fi

In [24]:
# Tampilkan hasil
print(books_filled)

                                                    Title  \
0                          Its Only Art If Its Well Hung!   
1                                Dr. Seuss: American Icon   
2                   Wonderful Worship in Smaller Churches   
3                           Whispers of the Wicked Saints   
5       The Church of Christ: A Biblical Ecclesiology ...   
...                                                   ...   
212397  The Magic of the Soul: Applying Spiritual Powe...   
212398               Autodesk Inventor 10 Essentials Plus   
212399  The Orphan Of Ellis Island (Time Travel Advent...   
212400                            Red Boots for Christmas   
212402                                  The Autograph Man   

                                              description  \
0                                                 unknown   
1       Philip Nel takes a fascinating look into the k...   
2       This resource includes twelve principles in un...   
3       Julia Thomas fi

In [25]:
# Mengekspor ke CSV
books_filled.to_csv("books_filled.csv", index=False)
print("DataFrame telah diekspor ke 'books_filled.csv'")

DataFrame telah diekspor ke 'books_filled.csv'


# **Data rating**

In [26]:
# Menghapus rows di mana Title atau review/score bernilai NaN
cleaned_ratings = ratings_df[~(ratings_df["Title"].isna() | ratings_df["review/score"].isna())]

# Menampilkan DataFrame hasil
print(cleaned_ratings)

                 Id                           Title         User_id  \
0        1882931173  Its Only Art If Its Well Hung!   AVCGYZL8FQQTD   
1        0826414346        Dr. Seuss: American Icon  A30TK6U7DNS82R   
2        0826414346        Dr. Seuss: American Icon  A3UH4UZ4RSVO82   
3        0826414346        Dr. Seuss: American Icon  A2MVUWT453QH61   
4        0826414346        Dr. Seuss: American Icon  A22X4XUPKF66MR   
...             ...                             ...             ...   
2999995  B000NSLVCU             The Idea of History             NaN   
2999996  B000NSLVCU             The Idea of History  A1SMUB9ASL5L9Y   
2999997  B000NSLVCU             The Idea of History  A2AQMEKZKK5EE4   
2999998  B000NSLVCU             The Idea of History  A18SQGYBKS852K   
2999999  B000NSLVCU             The Idea of History             NaN   

                                profileName  review/score  
0                     Jim of Oz "jim-of-oz"           4.0  
1                          

Data Ratings yang digunakan hanya berupa kolom Title dan review/score sehingga kedua kolom ini tidak boleh NaN

# **Categories**

In [27]:
# Menampilkan isi kolom categories dalam DataFrame books_filled
books_filled['categories']

Unnamed: 0,categories
0,['Comics & Graphic Novels']
1,['Biography & Autobiography']
2,['Religion']
3,['Fiction']
5,['Religion']
...,...
212397,"['Body, Mind & Spirit']"
212398,['Computers']
212399,['Juvenile Fiction']
212400,['Juvenile Fiction']


In [28]:
# Menampilkan frekuensi dan jumlah jenis kategori
category_counts = books_filled['categories'].value_counts()
category_counts

Unnamed: 0_level_0,count
categories,Unnamed: 1_level_1
['Fiction'],23419
['Religion'],9459
['History'],9330
['Juvenile Fiction'],6643
['Biography & Autobiography'],6324
...,...
['Oxford (England)'],1
['Single mothres'],1
['Romantic suspense fiction'],1
['Oyster culture'],1


In [29]:
# Generalisasi menjadi 10 category umum
def generalize_categories(category_string):
    if pd.isna(category_string):
        return "Unknown"  # Handle missing values

    category_string = str(category_string).lower()  # Ubah ke lowercase

    # Buat pemetaan category umum
    category_mapping = {
        "fiction": ["fiction", "novel", "fantasy", "scifi", "romance", "thriller", "mystery"],
        "non-fiction": ["biography", "history", "science", "self-help", "business", "cookbooks", "travel"],
        "children's": ["children", "kids", "juvenile"],
        "education": ["education", "textbook", "academic"],
        "religion": ["religion", "spirituality"],
        "comics": ["comics", "manga", "graphic novels"],
        "art": ["art", "design", "photography"],
        "health": ["health", "fitness", "medicine"],
        "technology": ["technology", "computers", "programming"],
        "other": [] # Untuk lainnya
    }

    for general_category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in category_string:
                return general_category
    return "other" # Return "other" jika tidak terdapat keyword yang cocok

# Terapkan fungsi generalisasi ke kolom 'categories'
books_filled['general_category'] = books_filled['categories'].apply(generalize_categories)

# Menampilkan DataFrame yang telah diperbarui
print(books_filled[['categories', 'general_category']].head(20))

                       categories general_category
0     ['Comics & Graphic Novels']          fiction
1   ['Biography & Autobiography']      non-fiction
2                    ['Religion']         religion
3                     ['Fiction']          fiction
5                    ['Religion']         religion
8   ['Biography & Autobiography']      non-fiction
9              ['Social Science']      non-fiction
10                   ['Religion']         religion
11                  ['Reference']            other
12        ['Juvenile Nonfiction']          fiction
13   ['Technology & Engineering']       technology
14                    ['History']      non-fiction
15        ['New Zealand fiction']          fiction
16                        ['Law']            other
17          ['Political Science']      non-fiction
18           ['Health & Fitness']           health
19                    ['Cooking']            other
20                 ['Philosophy']            other
21                     ['Europe

In [30]:
# Menampilkan jumlah isi kolom general_category
books_filled['general_category']

Unnamed: 0,general_category
0,fiction
1,non-fiction
2,religion
3,fiction
5,religion
...,...
212397,other
212398,technology
212399,fiction
212400,fiction


In [31]:
# Menampilkan jumlah category setelah digeneralisasi
unique_categories = books_filled['general_category'].unique()
num_unique_categories = len(unique_categories)
num_unique_categories

10

In [32]:
# Menampilkan isi dan jumlah frekuensi tiap general_category
category_counts = books_filled['general_category'].value_counts()
category_counts

Unnamed: 0_level_0,count
general_category,Unnamed: 1_level_1
other,71276
fiction,37063
non-fiction,33891
religion,9530
art,7286
technology,6068
education,2861
health,2257
children's,965
comics,8


In [33]:
# Terapkan fungsi generalisasi ke kolom 'categories'
books_filled['general_category'] = books_filled['categories'].apply(generalize_categories)

# Drop kolom categories pada dataset
books_filled = books_filled.drop(columns=['categories'])

# Tampilkan DataFrame yang telah diperbarui
books_filled.head()

Unnamed: 0,Title,description,authors,image,publisher,publishedDate,general_category
0,Its Only Art If Its Well Hung!,unknown,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,unknown,1996,fiction
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,unknown,2000,religion
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,iUniverse,2005-02,fiction
5,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,['Everett Ferguson'],http://books.google.com/books/content?id=kVqRa...,Wm. B. Eerdmans Publishing,1996,religion


# **Merge Dataset**

In [34]:
# Gabungkan kedua dataset
merged_df = pd.merge(books_filled, cleaned_ratings, on='Title', how='inner')

# Tampilkan DataFrame yang telah digabungkan
merged_df.head(10)

Unnamed: 0,Title,description,authors,image,publisher,publishedDate,general_category,Id,User_id,profileName,review/score
0,Its Only Art If Its Well Hung!,unknown,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,unknown,1996,fiction,1882931173,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",4.0
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A30TK6U7DNS82R,Kevin Killian,5.0
2,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A3UH4UZ4RSVO82,John Granger,5.0
3,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",4.0
4,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",4.0
5,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A2F6NONFUDB6UK,Malvin,4.0
6,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A14OJS0VWMOSWO,Midwest Book Review,5.0
7,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A2RSSXTDZDUSH4,J. Squire,5.0
8,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A25MD5I2GUIW6W,"J. P. HIGBED ""big fellow""",5.0
9,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A3VA4XFS5WNJO3,Donald Burnside,4.0


In [35]:
# Cek apakah ada duplikasi dalam dataset berdasarkan seluruh kolom
duplicates = merged_df.duplicated()  # Menampilkan True jika ada duplikasi
print(f"\nJumlah duplikasi: {duplicates.sum()}")


Jumlah duplikasi: 371623


In [36]:
# Menghapus duplikasi
merged_df = merged_df.drop_duplicates()


In [37]:
# Ekspor DataFrame yang telah digabungkan ke file CSV (opsional)
merged_df.to_csv("merged_books_ratings.csv", index=False, sep=',', encoding='utf-8')
print("Merged DataFrame diekspor ke 'merged_books_ratings.csv'")

Merged DataFrame diekspor ke 'merged_books_ratings.csv'


In [38]:
# Load merged dataset
merged_df = pd.read_csv("merged_books_ratings.csv")

In [39]:
# Tampilkan DataFrame yang telah digabungkan
merged_df.head(10)

Unnamed: 0,Title,description,authors,image,publisher,publishedDate,general_category,Id,User_id,profileName,review/score
0,Its Only Art If Its Well Hung!,unknown,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,unknown,1996,fiction,1882931173,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",4.0
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A30TK6U7DNS82R,Kevin Killian,5.0
2,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A3UH4UZ4RSVO82,John Granger,5.0
3,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",4.0
4,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",4.0
5,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A2F6NONFUDB6UK,Malvin,4.0
6,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A14OJS0VWMOSWO,Midwest Book Review,5.0
7,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A2RSSXTDZDUSH4,J. Squire,5.0
8,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A25MD5I2GUIW6W,"J. P. HIGBED ""big fellow""",5.0
9,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A3VA4XFS5WNJO3,Donald Burnside,4.0


# **Rating Data Filter**

In [40]:
# Tentukan batasan rating sebagai target rekomendasi (misal >= 4)
filtered_df = merged_df[merged_df['review/score'] >= 4]
# Menampilkan hasil DataFrame filtered_df teratas
filtered_df.head()

Unnamed: 0,Title,description,authors,image,publisher,publishedDate,general_category,Id,User_id,profileName,review/score
0,Its Only Art If Its Well Hung!,unknown,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,unknown,1996,fiction,1882931173,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",4.0
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A30TK6U7DNS82R,Kevin Killian,5.0
2,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A3UH4UZ4RSVO82,John Granger,5.0
3,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",4.0
4,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,A&C Black,2005-01-01,non-fiction,826414346,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",4.0


# **User-item Matrix**

In [41]:
print(filtered_df.dtypes)

Title                object
description          object
authors              object
image                object
publisher            object
publishedDate        object
general_category     object
Id                   object
User_id              object
profileName          object
review/score        float64
dtype: object


In [42]:
# Menggabungkan rating untuk buku dengan judul yang sama menggunakan groupby dan agg
filtered_df_grouped = filtered_df.groupby('Title')['review/score'].agg('mean')

# Menampilkan DataFrame yang telah digabungkan
filtered_df_grouped

Unnamed: 0_level_0,review/score
Title,Unnamed: 1_level_1
""" Film technique, "" and, "" Film acting """,4.500000
""" We'll Always Have Paris"": The Definitive Guide to Great Lines from the Movies",5.000000
"""... And Poetry is Born ..."" Russian Classical Poetry",4.000000
"""A Truthful Impression of the Country"": British and American Travel Writing in China, 1880-1949",4.000000
"""A careless word, a needless sinking"": A history of the staggering losses suffered by the U.S. Merchant Marine, both in ships and personnel during World War II",5.000000
...,...
william the vehicle king,5.000000
with an everlasting love,4.947368
work and Motivation,5.000000
"xBase Programming for the True Beginner: An Introduction to the xBase Language in the Context of dBASE III+, IV, 5, FoxPro, and Clipper",5.000000


In [43]:
# Mengelompokkan data berdasarkan judul dan menghitung rata-rata rating
book_ratings = filtered_df.groupby('Title')['review/score'].mean().reset_index()

# Membuat user-item matrix
book_matrix = book_ratings.pivot_table(index='Title', values='review/score')

# Mengisi nilai NaN dengan 0 (asumsi rating 0 jika tidak ada rating)
book_matrix = book_matrix.fillna(0)

In [44]:
# Menghitung Similarity Antar Item
from sklearn.metrics.pairwise import cosine_similarity

# Mengubah book_matrix menjadi matriks NumPy
book_matrix_values = book_matrix.values

# Menghitung cosine similarity
item_similarity = cosine_similarity(book_matrix_values)

# Membuat DataFrame dari hasil similarity
item_similarity_df = pd.DataFrame(item_similarity, index=book_matrix.index, columns=book_matrix.index)

In [45]:
# Membuat rekomendasi berdasarkan similarity antar item
def recommend_books(book_title, top_n=5):

  # Mendapatkan similarity buku yang diberikan dengan buku lainnya
  similar_books = item_similarity_df[book_title].sort_values(ascending=False)

  # Menghapus buku yang diberikan dari daftar rekomendasi
  similar_books = similar_books.drop(book_title)

  # Mengambil top_n buku yang paling mirip
  recommended_books = similar_books.head(top_n).index.tolist()

  return recommended_books

# Contoh
recommendations = recommend_books('The Autograph Man', top_n=3)
print(recommendations)

['you can do anything with crepes', '" Film technique, " and, " Film acting "', '" We\'ll Always Have Paris": The Definitive Guide to Great Lines from the Movies']


In [46]:
!pip install scikit-surprise

import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357284 sha256=5a2a251b9a869251dfec7cba02cd46734692f2eab9ae89cd608f85b0a1b73c73
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a

In [47]:
# Definisikan format data rating
reader = Reader(rating_scale=(1, 5))

In [48]:
# Buat dataset surprise dari DataFrame
data = Dataset.load_from_df(merged_df[['User_id', 'Title', 'review/score']], reader)

In [49]:
# Bagi data menjadi data training dan testing
trainset, testset = train_test_split(data, test_size=.25)

In [50]:
# Inisialisasi dan latih model SVD
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7c2bf67c0820>

In [51]:
from surprise import accuracy

# Dapatkan prediksi untuk data testing
predictions = algo.test(testset)

# Hitung untuk mengukur akurasi model rekomendasi
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

RMSE: 1.0111
MAE:  0.7292
RMSE: 1.0111403030201367
MAE: 0.7291796026464344


In [52]:
def recommend_books_svd(user_id, top_n=5):

    # Dapatkan semua buku yang belum diberi rating oleh user
    rated_books = merged_df[merged_df['Title'] == user_id]['Title'].tolist()
    unrated_books = [book for book in merged_df['Title'].unique() if book not in rated_books]

    # Prediksi rating untuk buku yang belum diberi rating
    predictions = [algo.predict(user_id, book) for book in unrated_books]

    # Urutkan prediksi berdasarkan estimasi rating
    top_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:top_n]

    # Dapatkan judul buku dari prediksi teratas
    recommended_books = [pred.iid for pred in top_predictions]

    return recommended_books

# Contoh penggunaan:
user_id = '1984'  # Ganti dengan ID user atau judul buku yang ingin direkomendasikan
recommendations = recommend_books_svd(user_id, top_n=5)
print(f"Rekomendasi buku untuk user {user_id}: {recommendations}")

Rekomendasi buku untuk user 1984: ['Lilla Belle: The First Stages', "the lion's paw", 'Shadow castle,', 'Why revival tarries,', 'There Are Survivors: The Michael Cuccione Story']


In [53]:
from surprise import accuracy
from collections import defaultdict

def get_top_n(predictions, n=10):

    # Memetakan prediksi ke setiap user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Mengurutkan prediksi untuk setiap user dan mengambil top-N.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# Kemudian, setelah Anda melatih model dan mendapatkan prediksi:
top_n = get_top_n(predictions, n=10)

# Anda dapat menghitung precision@k dan recall@k:
def precision_recall_at_k(predictions, k=10, threshold=3.5):

    # Pertama memetakan prediksi ke setiap user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Mengurutkan prediksi user berdasarkan estimasi rating.
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Jumlah item yang relevan
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Jumlah item yang direkomendasikan dalam top-k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Jumlah item yang relevan dan direkomendasikan dalam top-k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # Ketika n_rec_k adalah 0, precision adalah 1.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        # Ketika n_rel adalah 0, recall adalah 1.
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

# Kemudian Anda dapat menghitung precision dan recall rata-rata
print(sum(prec for prec in precisions.values()) / len(precisions))
print(sum(rec for rec in recalls.values()) / len(recalls))

0.9076242708563661
0.8610142836436933


# **Neural Network for Content Based Filtering**

In [54]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [55]:
# Unduh stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [56]:
# Dataset buku
df = book_ratings

In [57]:
# Ambil kolom yang diperlukan: 'Title' dan 'review/score'
data = df[['Title', 'review/score']]

In [58]:
# Preprocessing teks
stop_words = set(stopwords.words('english'))

In [59]:
def preprocess_text(text):
    # Tokenisasi, lowercase, dan hapus stopwords
    tokens = text.lower().split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [60]:
# Terapkan preprocessing pada kolom Title
data['cleaned_title'] = data['Title'].apply(preprocess_text)


In [61]:
# Tokenisasi judul buku
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['cleaned_title'])
title_sequences = tokenizer.texts_to_sequences(data['cleaned_title'])


In [62]:
# Padding agar semua sequence memiliki panjang yang sama
max_sequence_length = max([len(seq) for seq in title_sequences])
X = pad_sequences(title_sequences, maxlen=max_sequence_length)

In [63]:
# Label rating
y = data['review/score'].values


In [64]:
# Split data untuk pelatihan dan pengujian
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
print("Data siap untuk model NLP!")

Data siap untuk model NLP!


In [66]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Input
from tensorflow.keras.optimizers import Adam

In [67]:
# Model dengan layer embedding untuk memproses teks
model = Sequential([
    Input(shape=(max_sequence_length,)),  # Input sesuai panjang sequence setelah padding
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=max_sequence_length),
    Flatten(),  # Menyamaratakan output embedding
    Dense(128, activation='relu'),
    Dense(1, activation='linear')  # Output rating sebagai angka kontinu
])


In [68]:
# Menyusun model
model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mae'])


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping untuk berhenti jika validasi loss tidak membaik setelah beberapa epoch
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=3,  # Berapa epoch tanpa perbaikan yang akan ditunggu
                               restore_best_weights=True)  # Mengembalikan bobot terbaik setelah berhenti

# Melatih model dengan early stopping
model.fit(X_train, y_train,
          epochs=1000,  # menetapkan jumlah epoch yang sangat besar atau tanpa batas
          batch_size=32,
          validation_data=(X_test, y_test),
          callbacks=[early_stopping])


Epoch 1/1000
Epoch 2/1000

In [None]:
# Evaluasi model
loss, mae = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")

In [None]:
embedding_model = tf.keras.Model(inputs=model.input, outputs=model.layers[0].output)  # Output layer embedding
book_embeddings = embedding_model.predict(X)  # X adalah data input untuk model


In [None]:
# Meratakan seluruh embeddings menjadi 2D
book_embeddings_flattened = book_embeddings.reshape(book_embeddings.shape[0], -1)

# Verifikasi bentuk setelah diratakan
print(book_embeddings_flattened.shape)

# Menghitung kemiripan cosine antar buku
cosine_sim = cosine_similarity(book_embeddings_flattened)

# Menampilkan kemiripan antar buku
print("Cosine Similarity Matrix:")
print(cosine_sim)


**Menampilkan 5 Buku Teratas Berdasarkan Kemiripan dan Rating Tertinggi**

In [None]:
import numpy as np

def get_top_recommendations(book_title, df, cosine_sim, top_n=5):
    # Cek apakah judul buku ada di dataset
    if book_title not in df['Title'].values:
        print(f"Buku dengan judul '{book_title}' tidak ditemukan!")
        return []

    # Menemukan indeks buku berdasarkan judul
    book_idx = df[df['Title'] == book_title].index[0]

    # Verifikasi ukuran cosine_sim
    print(f"Jumlah buku di dataset: {len(df)}")
    print(f"Jumlah baris di cosine_sim: {cosine_sim.shape[0]}")

    # Jika jumlah buku di df tidak sesuai dengan jumlah dalam cosine_sim
    if len(df) != cosine_sim.shape[0]:
        print("Ukuran cosine_sim tidak sesuai dengan dataset!")
        return []

    # Mendapatkan nilai kemiripan untuk buku tersebut
    similarity_scores = cosine_sim[book_idx]

    # Mengurutkan kemiripan dan mengambil indeks dan skor tertinggi
    similar_books_idx = similarity_scores.argsort()[-top_n-1:-1][::-1]  # Mengambil top N rekomendasi (dengan exclude diri sendiri)

    # Menyaring indeks agar tidak termasuk buku itu sendiri
    similar_books_idx = [idx for idx in similar_books_idx if idx != book_idx]

    # Mengambil buku-buku yang relevan
    recommended_books = df.iloc[similar_books_idx]

    # Mengembalikan rekomendasi dalam format yang mudah dibaca
    recommendations = []
    for idx, row in recommended_books.iterrows():
        recommendations.append({
            'Title': row['Title'],
            'Similarity Score': similarity_scores[idx]
        })

    return recommendations

# Contoh pemanggilan fungsi untuk buku dengan judul tertentu
book_title = 'The Art of the Novel'  # Ganti dengan judul buku yang ada di dataset Anda
recommendations = get_top_recommendations(book_title, df, cosine_sim, top_n=5)

if recommendations:  # Perbaiki di sini: tambahkan tanda titik dua (:)
    for rec in recommendations:
        print(f"Buku: {rec['Title']}, Kemiripan: {rec['Similarity Score']}")
else:
    print("Tidak ada rekomendasi buku.")


In [None]:
# Contoh: Menampilkan 5 rekomendasi berdasarkan kemiripan untuk buku dengan title tertentu
book_title = 'Its Only Art If Its Well Hung!'  # Ganti dengan judul buku yang ada di dataset Anda
recommendations = get_top_recommendations(book_title, df, cosine_sim, top_n=5)
print(recommendations)

# Mengonversi Model TensorFlow ke TensorFlow Lite

**Simpan Model TensorFlow**

In [None]:
# Menyimpan model TensorFlow
model.save('book_recommendation_model.h5')

**Mengonversi Model ke Format TensorFlow Lite**

In [None]:
import tensorflow as tf

# Mengonversi model yang disimpan (.h5) ke format .tflite
model = tf.keras.models.load_model('book_recommendation_model.h5')

# Membuat converter TFLite
converter = tf.lite.TFLiteConverter.from_keras_model(model)

# Mengonversi model ke format TFLite
tflite_model = converter.convert()

# Menyimpan model TFLite
with open('book_recommendation_model.tflite', 'wb') as f:
    f.write(tflite_model)


#Neural Network for Colaborative Filtering

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers

In [None]:
print("Jumlah baris sebelum menghapus duplikasi:", len(ratings_df_complete))

# Hapus duplikasi berdasarkan semua kolom
rating = ratings_df_complete.drop_duplicates()

# Periksa jumlah baris setelah menghapus duplikasi
print("Jumlah baris setelah menghapus duplikasi:", len(rating))

In [None]:
# mennggunakan 20k darii dataframe rating_df_complete untuk melatih model nya
rating = ratings_df_complete[:20000]
rating.shape

In [None]:
rating.head()

In [None]:
#Mendapatkan daftar ID unik pengguna dari User_id
user_ids = rating['User_id'].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

In [None]:
#Mendapatkan daftar ID unik pengguna dari Id
book_ids = rating['Id'].unique().tolist()
book2book_encoded = {x: i for i, x in enumerate(book_ids)}
book_encoded2book = {i: x for i, x in enumerate(book_ids)}

In [None]:
# Menambahkan kolom baru user dan book yang berisi encoding numerik dari User_id dan Id
rating['user'] = rating['User_id'].map(user2user_encoded)
rating['book'] = rating['Id'].map(book2book_encoded)

In [None]:
# mendapatkan lenght dari user dan books yang sudah di encoded
num_users = len(user2user_encoded)
num_books = len(book2book_encoded)
# mengubah type data reviw/score ke float32
rating["review/score"] = rating["review/score"].values.astype(np.float32)
# mendapatkan nilai min dan maks di review/score
min_rating = min(rating["review/score"])
max_rating = max(rating["review/score"])
print(
    "number of users: {}, books: {}, min rating: {}, max rating: {}".format(
        num_users, num_books, min_rating, max_rating
    )
)


In [None]:
# mengacak urutan data dalam df rating
rating= rating.sample(frac=1, random_state=42)
x = rating[["user", "book"]].values
# Menormalkan skor review menjadi nilai antara 0 dan 1
y = rating["review/score"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
# Membagi data menjadi set pelatihan (90%) dan set validasi (10%)
train_indices = int(0.9 * rating.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)
print(x, y)

In [None]:
class RecommenderNet(tf.keras.Model):
  def __init__(self, num_users, num_books, embedding_size, **kwargs):
    #inisialisasi class
    super(RecommenderNet, self).__init__(**kwargs)
    self.num_users = num_users
    self.embedding_size = embedding_size
    self.num_books = num_books
    # buat layer embbeding untuk user
    self.embedding_size = embedding_size
    self.user_embedding = layers.Embedding(
        num_users,
        embedding_size,
        embeddings_initializer = 'he_normal',
        embeddings_regularizer = keras.regularizers.l2(1e-8)
    )
    # buat layer bias untuk user
    self.user_bias = layers.Embedding(num_users, 1)
    # buat layer embedding untuk book
    self.book_embedding = layers.Embedding(
        num_books,
        embedding_size,
        embeddings_initializer = 'he_normal',
        embeddings_regularizer = keras.regularizers.l2(1e-8)
    )
    # buat layer bias untuk book
    self.book_bias = layers.Embedding(num_books, 1)

  def call(self, inputs):
    user_vector = self.user_embedding(inputs[:,0])
    user_bias = self.user_bias(inputs[:, 0])
    book_vector = self.book_embedding(inputs[:, 1])
    book_bias = self.book_bias(inputs[:, 1])

    # menghitung hasil perkalian titik antara user dan book
    dot_user_book = tf.tensordot(user_vector, book_vector, 2)
    # menambahkan bias ke hasil dot user dan book
    x = dot_user_book + user_bias + book_bias
    # menggunakan fungsi sigmoid karena output nya probabilitas
    return tf.nn.sigmoid(x)

In [None]:

# implementing callbacks
from tensorflow.keras.callbacks import ReduceLROnPlateau

lr_reduction = ReduceLROnPlateau(
    monitor = 'root_mean_squared_error',
    patience = 3,
    verbose = 1,
    factor = 0.1,
    min_lr = 0.000001
)

class myCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    rmse = logs.get('root_mean_squared_error')
    val_rmse = logs.get('val_root_mean_squared_error')

    if(rmse < 0.2):
      print("\nMencapai akurasi yang diinginkan sehingga membatalkan pelatihan!")
      self.model.stop_training = True

earlyStop = myCallback()

In [None]:

# model initialization
model_CF= RecommenderNet(num_users, num_books, 50)

# model compiling
model_CF.compile(
    loss = 'binary_crossentropy',
    optimizer = tf.optimizers.Adam(learning_rate = 0.001),
    metrics = [tf.keras.metrics.RootMeanSquaredError()]
)

In [None]:

# model training
history = model_CF.fit(
    x = x_train,
    y = y_train,
    batch_size = 8,
    epochs = 40,
    verbose = 1,
    validation_data = (x_val, y_val),
    callbacks = [lr_reduction, earlyStop]
)

In [None]:
plt.plot(history.history['root_mean_squared_error'])
plt.plot(history.history['val_root_mean_squared_error'])
plt.title('model error')
plt.ylabel('root_mean_squared_error')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()


In [None]:

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:


# Pilih user_id secara acak dari dataset rating
user_id = rating['User_id'].sample(1).iloc[0]

# Ambil data buku yang sudah dibaca oleh user
book_read = rating[rating.User_id == user_id]

# Dapatkan daftar buku yang belum dibaca oleh user
book_not_read = rating[~rating['Id'].isin(book_read.Id.values)]['Id']
book_not_read = list(
    set(book_not_read).intersection(set(book2book_encoded.keys()))
)

# Encode daftar buku yang belum dibaca
book_not_read = [
    [book2book_encoded.get(x)] for x in book_not_read
]

# Encode user_id
user_encoder = user2user_encoded.get(user_id)

# Buat array input (user, book)
user_book_array = np.hstack(
    ([[user_encoder]] * len(book_not_read), book_not_read)
)

# Prediksi rating menggunakan model
ratings = model_CF.predict(user_book_array).flatten()

## Ambil indeks dengan prediksi tertinggi
top_ratings_indices = ratings.argsort()[-10:][::-1]

# Konversi kembali ke book_id berdasarkan hasil prediksi
recommended_book_ids = [
    book_encoded2book.get(book_not_read[x][0]) for x in top_ratings_indices
]
# Pastikan hanya ada 10 buku tanpa duplikat
recommended_book_ids = list(set(recommended_book_ids))[:10]

# Tampilkan hasil
print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Books with high ratings from user")
print("----" * 8)

# Buku yang memiliki skor tinggi dari user
top_books_user = (
    book_read.sort_values(by="review/score", ascending=False)
    .head(5)
    .Id.values
)

# Tampilkan judul buku yang sudah dibaca
seen_titles = set()  # Set untuk melacak buku yang sudah ditampilkan
for row in book_read[book_read["Id"].isin(top_books_user)].itertuples():
    if row.Title not in seen_titles:  # Hanya tampilkan jika belum pernah ditampilkan
        print(row.Title)
        seen_titles.add(row.Title)

print("-----" * 8)
print("Top 10 books recommendation")
print("-----" * 8)

# Tampilkan judul dari rekomendasi
recommended_books = (
    rating[rating["Id"].isin(recommended_book_ids)]
    .drop_duplicates(subset=["Id"])  # Hilangkan duplikasi berdasarkan Id
)
for row in recommended_books.itertuples():
    if row.Title not in seen_titles:  # Hanya tampilkan jika belum pernah ditampilkan
        print(row.Title)
        seen_titles.add(row.Title)


**Simpan Model collaborative filltering ke Tensorflow**

In [None]:
# Menyimpan model dalam format SavedModel
model_CF.save('rekomendasi_buku_CF', save_format='tf')

In [None]:
# Menyimpan bobot ke format HDF5
model_CF.save_weights('rekomendasi_buku_CF_weights.h5')

In [None]:
# Inisialisasi model
model_CF = RecommenderNet(num_users, num_books, 50)

# Gunakan metode build untuk membangun model
model_CF.build(input_shape=(None, 2))  # Bentuk input sesuai (batch size, fitur)

# Muat bobot ke model
model_CF.load_weights('rekomendasi_buku_CF_weights.h5')


In [None]:
# Simpan ulang bobot model
model_CF.save_weights('rekomendasi_buku_CF_weights.h5')


In [None]:
# Validasi bobot
print(model_CF.variables)


In [None]:
# Contoh input data untuk prediksi (user_id, book_id)
new_input = tf.constant([[9382, 1045]])  # Sesuaikan dengan pasangan user dan book

# Prediksi output
predicted_score = model_CF.predict(new_input)
print("Prediksi skor rekomendasi:", predicted_score)


**Mengonversi Model ke Format TensorFlow Lite**

In [None]:
import tensorflow as tf

# Mengonversi model yang disimpan (.h5) ke format .tflite
model = tf.keras.models.load_model('book_recommendation_model.h5')

# Membuat converter TFLite
converter = tf.lite.TFLiteConverter.from_keras_model(model)

# Mengonversi model ke format TFLite
tflite_model = converter.convert()

# Menyimpan model TFLite
with open('book_recommendation_model.tflite', 'wb') as f:
    f.write(tflite_model)
