# 도서 추천 시스템
1. 인구통계학적 필터링
2. 콘텐츠 기반 필터링
3. 협업 필터링

데이터 출처: https://www.kaggle.com/datasets/ruchi798/bookcrossing-dataset

### 데이터 불러오기

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Base directory for the CSV files read and written by this notebook
# (looks like a mounted Google Drive folder in Colab -- adjust for other setups).
# The trailing slash matters: file names are appended with plain `+` concatenation.
path = "/content/drive/MyDrive/Colab Notebooks/recommendation_system/"

In [None]:
# Both files are semicolon-separated and decoded as Latin-1. low_memory=False
# makes pandas parse each file in one pass rather than in chunks, which avoids
# mixed-type column inference.
read_options = dict(sep=';', encoding='latin-1', low_memory=False)
books = pd.read_csv(path + 'Books.csv', **read_options)
ratings = pd.read_csv(path + 'Ratings.csv', **read_options)

# Report (rows, columns) for each table, one per line.
print(f"Books Dataset: {books.shape}", f"Ratings Dataset: {ratings.shape}", sep="\n")
# print(f'Users Dataset: {users.shape}')

Books Dataset: (271379, 8)
Ratings Dataset: (1149780, 3)


### Books Dataset

In [None]:
# Preview the first three rows of the books table.
books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [None]:
# Narrow the books table to six columns; of the image URL columns only
# 'Image-URL-M' is kept.
kept_columns = [
    'ISBN',
    'Book-Title',
    'Book-Author',
    'Year-Of-Publication',
    'Publisher',
    'Image-URL-M',
]
books = books.loc[:, kept_columns]

In [None]:
# Show column dtypes and non-null counts for the books table.
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271379 entries, 0 to 271378
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271379 non-null  object
 1   Book-Title           271379 non-null  object
 2   Book-Author          271378 non-null  object
 3   Year-Of-Publication  271379 non-null  int64 
 4   Publisher            271377 non-null  object
 5   Image-URL-M          271379 non-null  object
dtypes: int64(1), object(5)
memory usage: 12.4+ MB


In [None]:
# Count missing values per column.
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-M            0
dtype: int64

In [None]:
# Discard every row that has a missing value in any column.
books = books.dropna(axis=0, how='any')

books.shape

(271376, 6)

In [None]:
# List the distinct publication years in ascending order to spot implausible values.
np.sort(books['Year-Of-Publication'].unique())

array([   0, 1376, 1378, 1806, 1897, 1900, 1901, 1902, 1904, 1906, 1908,
       1909, 1910, 1911, 1914, 1917, 1919, 1920, 1921, 1922, 1923, 1924,
       1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935,
       1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946,
       1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957,
       1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968,
       1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979,
       1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2008, 2010, 2011, 2012, 2020, 2021,
       2024, 2026, 2030, 2037, 2038, 2050])

In [None]:
# temp = books.loc[(books['Year-Of-Publication'] >= 1960) & (books['Year-Of-Publication'] < 2020), 'Year-Of-Publication']
# sns.boxplot(temp)

In [None]:
# Remove publication-year outliers: rows whose year is 0 or later than 2021.
# NOTE(review): very old years that appear in the listing of distinct years
# (e.g. 1376, 1378) pass this filter -- confirm that is intended.
pub_year = books['Year-Of-Publication']
bad_year = pub_year.eq(0) | pub_year.gt(2021)
drop_year = books.index[bad_year.to_numpy()]
books = books.drop(index=drop_year).reset_index(drop=True)
books.shape

(266744, 6)

In [None]:
# Remove unusual book titles: any title containing 'Ã' or 'Â' is dropped
# (presumably these mark mis-decoded text -- confirm; a title that legitimately
# contains either character is removed as well).
garbled_title = books['Book-Title'].str.contains('Ã|Â')
drop_index = books.index[garbled_title.to_numpy()]

books = books.drop(index=drop_index).reset_index(drop=True)
books.shape

(260114, 6)

In [None]:
# Remove duplicates: keep only the first row seen for each distinct title.
# NOTE(review): rows with a different ISBN but the same title are removed here,
# so ratings recorded against those ISBNs will not match in the later inner
# merge on 'ISBN' -- confirm that loss is acceptable.
repeated_title = books.duplicated(subset='Book-Title', keep='first')
books = books.loc[~repeated_title]
books.shape

(232079, 6)

In [None]:
# Distinct ISBN count; compare with the row count to check that ISBN is unique per row.
books['ISBN'].nunique()

232079

In [None]:
# books.to_csv('/content/drive/MyDrive/Colab Notebooks/recommendation_system/books.csv', index=False)

### Ratings Dataset

In [None]:
# Preview the first three rows of the ratings table.
ratings.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [None]:
# Show column dtypes and non-null counts for the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [None]:
# List the distinct rating values in ascending order.
np.sort(ratings['Book-Rating'].unique())

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [None]:
# Check for missing values (count per column).
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [None]:
# Check for duplicates: count rows that repeat an earlier (User-ID, ISBN) pair.
ratings.duplicated(['User-ID', 'ISBN']).sum()

0

In [None]:
# Number of distinct ISBNs that appear in the ratings table.
ratings['ISBN'].nunique()

340556

In [None]:
# ratings.to_csv('/content/drive/MyDrive/Colab Notebooks/recommendation_system/ratings.csv', index=False)

### 데이터 합치기

In [None]:
# Report (rows, columns) of both tables before merging, one per line.
print(f"Books Dataset: {books.shape}", f"Ratings Dataset: {ratings.shape}", sep="\n")

Books Dataset: (232079, 6)
Ratings Dataset: (1149780, 3)


In [None]:
# Inner join on ISBN: a rating is kept only when its ISBN is present in the
# books table, and each kept rating row gains that book's columns.
ratings_books = ratings.merge(books, how='inner', on='ISBN')
ratings_books.shape

(860459, 8)

In [None]:
# Preview the first three rows of the merged ratings + books table.
ratings_books.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...


In [None]:
# Number of distinct ISBNs that have at least one row in the merged table.
ratings_books['ISBN'].nunique()

231082

In [None]:
# Number of distinct users that have at least one row in the merged table.
ratings_books['User-ID'].nunique()

81385

In [None]:
# Take the first 100,000 rows of the merged table by position.
# NOTE(review): this is a head slice, not a random sample -- confirm that the
# row order of the merge gives the subset you want.
rating_books = ratings_books.iloc[:100000]
rating_books.shape

(100000, 8)

In [None]:
# Write the 100,000-row subset (`rating_books`) to disk without the index column.
# Note the naming: the file is called 'ratings_books.csv' but holds the subset,
# not the full `ratings_books` frame.
output_csv = path + 'ratings_books.csv'
rating_books.to_csv(output_csv, index=False)

In [None]:
# df = pd.read_csv(path + 'ratings_books.csv')
# df.shape

(100000, 8)

### 캐글 코드 참조

In [None]:
# # Number of users who rated each book (rating rows counted per title)
# num_rating_df = ratings_books.groupby('Book-Title').count()['Book-Rating'].reset_index()
# num_rating_df.rename(columns={'Book-Rating': 'num_ratings'}, inplace=True)
# num_rating_df.head(10)

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
5,Clifford Visita El Hospital (Clifford El Gran...,1
6,Dark Justice,1
7,Deceived,2
8,Earth Prayers From around the World: 365 Pray...,10
9,Final Fantasy Anthology: Official Strategy Gu...,4


In [None]:
# df = num_rating_df[num_rating_df['num_ratings']>=50]
# df = df.merge(ratings_books, on='Book-Title').drop_duplicates('Book-Title')[['User-ID', 'Book-Title','Book-Author','Publisher','num_ratings','Book-Rating','Image-URL-M']].reset_index()

In [None]:
# df.head()

Unnamed: 0,index,User-ID,Book-Title,Book-Author,Publisher,num_ratings,Book-Rating,Image-URL-M
0,0,278188,16 Lighthouse Road,Debbie Macomber,Mira,65,0,http://images.amazon.com/images/P/1551668300.0...
1,65,278158,1984,George Orwell,Signet Book,192,0,http://images.amazon.com/images/P/0451524934.0...
2,257,277170,1st to Die: A Novel,James Patterson,Warner Vision,391,0,http://images.amazon.com/images/P/0446610038.0...
3,648,2276,2010: Odyssey Two,Arthur C. Clarke,Del Rey Books,79,0,http://images.amazon.com/images/P/0345303067.0...
4,727,278188,204 Rosewood Lane,Debbie Macomber,Mira,71,0,http://images.amazon.com/images/P/1551669293.0...


In [None]:
# df['Book-Title'].nunique()

1765

In [None]:
# df['User-ID'].nunique()

640

In [None]:
# df.to_csv(path + 'rating_books.csv', index=False)