In [104]:
# Import library
import pandas as pd
import numpy as np 
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt

# Data Understanding

In [105]:
books = pd.read_csv("Books.csv")
ratings = pd.read_csv("Ratings.csv")
users = pd.read_csv("Users.csv")

print('Jumlah data buku: ', len(books.ISBN.unique()))
print('Jumlah data pengguna: ', len(users["User-ID"].unique()))
print('Jumlah data penilaian yang diberikan pengguna: ', len(ratings["User-ID"].unique()))
print('Jumlah data penilaian buku: ', len(ratings.ISBN.unique()))

  exec(code_obj, self.user_global_ns, self.user_ns)


Jumlah data buku:  271360
Jumlah data pengguna:  278858
Jumlah data penilaian yang diberikan pengguna:  105283
Jumlah data penilaian buku:  340556


Deskripsi file:
- books : merupakan daftar buku yang ada pada dataset.
- ratings : merupakan rating yang diberikan user kepada buku.
- users : merupakan daftar pengguna yang terdapat pada dataset.

# Exploratory Data Analysis

## Variabel Books

In [106]:
books.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [107]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [108]:
duplicate_rows_books = books[books.duplicated()]
print("number of duplicate rows: ", duplicate_rows_books.shape)

number of duplicate rows:  (0, 8)


Dari kode diatas, didapatkan informasi bahwa:
- data memiliki missing value, sehingga akan dihapus pada proses berikutnya
- data tidak memiliki nilai duplikat
- Kolom Image-URL baik yang S, M ataupun L tidak akan digunakan pada studi kasus ini sehingga akan dihapus.

In [109]:
books = books.dropna()
books.count()

ISBN                   271354
Book-Title             271354
Book-Author            271354
Year-Of-Publication    271354
Publisher              271354
Image-URL-S            271354
Image-URL-M            271354
Image-URL-L            271354
dtype: int64

In [110]:
books = books.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1)
books.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


## Variabel Users

In [111]:
users.head(5)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [112]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


Dari kode diatas, didapatkan informasi bahwa terlalu banyak missing value pada kolom age, sehingga akan kita hapus kolomnya saja.

In [113]:
users = users.drop(['Age'], axis=1)
users.head(5)

Unnamed: 0,User-ID,Location
0,1,"nyc, new york, usa"
1,2,"stockton, california, usa"
2,3,"moscow, yukon territory, russia"
3,4,"porto, v.n.gaia, portugal"
4,5,"farnborough, hants, united kingdom"


## Variabel Ratings

In [114]:
ratings.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [115]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [116]:
ratings.describe()

Unnamed: 0,User-ID,Book-Rating
count,1149780.0,1149780.0
mean,140386.4,2.86695
std,80562.28,3.854184
min,2.0,0.0
25%,70345.0,0.0
50%,141010.0,0.0
75%,211028.0,7.0
max,278854.0,10.0


Dari kode diatas, didapatkan informasi bahwa:
- tidak terdapat missing value pada data
- rating yang diberikan oleh user memiliki range 0 - 10. 

# Data Preprocessing

## Menggabungkan data variabel users ke variabel ratings

In [117]:
ratings_users = ratings.merge(users, left_on = 'User-ID', right_on = 'User-ID')
ratings_users

Unnamed: 0,User-ID,ISBN,Book-Rating,Location
0,276725,034545104X,0,"tyler, texas, usa"
1,276726,0155061224,5,"seattle, washington, usa"
2,276727,0446520802,0,"h, new south wales, australia"
3,276729,052165615X,3,"rijeka, n/a, croatia"
4,276729,0521795028,6,"rijeka, n/a, croatia"
...,...,...,...,...
1149775,276704,1563526298,9,"cedar park, texas, usa"
1149776,276706,0679447156,0,"quebec, quebec, canada"
1149777,276709,0515107662,10,"mannington, west virginia, usa"
1149778,276721,0590442449,10,"providence, rhode island, usa"


## Menggabungkan data variabel ratings_users ke variabel books

In [118]:
books_ratings_users = books.merge(ratings_users, left_on = 'ISBN', right_on = 'ISBN')
books_ratings_users

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Location
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0,"stockton, california, usa"
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5,"timmins, ontario, canada"
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11400,0,"ottawa, ontario, canada"
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8,"n/a, n/a, n/a"
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,41385,0,"sudbury, ontario, canada"
...,...,...,...,...,...,...,...,...
1031124,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),276463,7,"providence, rhode island, usa"
1031125,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,276579,4,"orem, utah, usa"
1031126,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,276680,0,"hopewell junction, new york, usa"
1031127,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,276680,0,"hopewell junction, new york, usa"


# Data Preparation

In [119]:
books_ratings_users.count()

ISBN                   1031129
Book-Title             1031129
Book-Author            1031129
Year-Of-Publication    1031129
Publisher              1031129
User-ID                1031129
Book-Rating            1031129
Location               1031129
dtype: int64

Hal selanjutnya yang akan dilakukan yaitu mendefinikasn fitur - fitur yang penting saja yang akan dijadikan dataframe tetap.

In [120]:
books_fix = books_ratings_users[['ISBN', 'Book-Title', 'Book-Author', 'User-ID', 'Book-Rating']]
books_fix

Unnamed: 0,ISBN,Book-Title,Book-Author,User-ID,Book-Rating
0,0195153448,Classical Mythology,Mark P. O. Morford,2,0
1,0002005018,Clara Callan,Richard Bruce Wright,8,5
2,0002005018,Clara Callan,Richard Bruce Wright,11400,0
3,0002005018,Clara Callan,Richard Bruce Wright,11676,8
4,0002005018,Clara Callan,Richard Bruce Wright,41385,0
...,...,...,...,...,...
1031124,0440400988,There's a Bat in Bunk Five,Paula Danziger,276463,7
1031125,0525447644,From One to One Hundred,Teri Sloat,276579,4
1031126,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,276680,0
1031127,0192126040,Republic (World's Classics),Plato,276680,0


## Encoding

Hal yang harus dilakukan adalah melakukan encoding untuk fitur ISBN.

In [121]:
ISBN = books_fix.ISBN

# Melakukan encoding ISBN
ISBN_to_ISBN_encoded = {x: i for i, x in enumerate(ISBN)}
print('encoded ISBN : ', ISBN_to_ISBN_encoded)
 
# Melakukan proses encoding angka ke userID
ISBN_encoded_to_ISBN = {i: x for i, x in enumerate(ISBN)}
print('encoded angka ke userID: ', ISBN_encoded_to_ISBN)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

