<a href="https://colab.research.google.com/github/AbelKristanto/machine-learning/blob/main/class_9_recomended_builder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

In this notebook, we will learn how to build a recommender system in data science.

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the book catalogue (';'-separated, latin-1 encoded).
# Rows with the wrong number of fields are skipped with a warning instead of
# aborting the load. `on_bad_lines='warn'` is the replacement for
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in
# pandas 2.0 (where it raises TypeError).
df_book = pd.read_csv('BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines='warn')
# Create name columns (assigned positionally, so the order must match the file)
df_book.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication',
                   'namePublisher', 'imageURLs', 'imageURLm', 'imageURLl']

b'Skipping line 6451: expected 8 fields, saw 9\nSkipping line 43666: expected 8 fields, saw 10\nSkipping line 51750: expected 8 fields, saw 9\n'
b'Skipping line 92037: expected 8 fields, saw 9\nSkipping line 104318: expected 8 fields, saw 9\nSkipping line 121767: expected 8 fields, saw 9\n'
b'Skipping line 144057: expected 8 fields, saw 9\nSkipping line 150788: expected 8 fields, saw 9\nSkipping line 157127: expected 8 fields, saw 9\nSkipping line 180188: expected 8 fields, saw 9\nSkipping line 185737: expected 8 fields, saw 9\n'
b'Skipping line 209387: expected 8 fields, saw 9\nSkipping line 220625: expected 8 fields, saw 9\nSkipping line 227932: expected 8 fields, saw 11\nSkipping line 228956: expected 8 fields, saw 10\nSkipping line 245932: expected 8 fields, saw 9\nSkipping line 251295: expected 8 fields, saw 9\nSkipping line 259940: expected 8 fields, saw 9\nSkipping line 261528: expected 8 fields, saw 9\n'


In [4]:
# Preview the first five catalogue rows
df_book.head(n=5)

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,namePublisher,imageURLs,imageURLm,imageURLl
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
1,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
3,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
4,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...


In [5]:
# Load the user profiles (';'-separated, latin-1 encoded).
# `on_bad_lines='warn'` skips malformed rows with a warning; it replaces
# `error_bad_lines=False`, which was removed in pandas 2.0.
# NOTE(review): the saved preview for this cell starts at userID 2 and shows
# '\N' in the age column - confirm that the file really has a header row
# (otherwise the first record is consumed as the header) and whether '\N'
# should be parsed as missing via `na_values`.
df_user = pd.read_csv('BX-Users.csv', sep=';', encoding='latin-1', on_bad_lines='warn')
# Column names are assigned positionally, so the order must match the file
df_user.columns = ['userID', 'location', 'age']
df_user.head()

Unnamed: 0,userID,location,age
0,2,"stockton, california, usa",18
1,3,"moscow, yukon territory, russia",\N
2,4,"porto, v.n.gaia, portugal",17
3,5,"farnborough, hants, united kingdom",\N
4,6,"santa monica, california, usa",61


In [6]:
# Load the (user, book, rating) triples (';'-separated, latin-1 encoded).
# `on_bad_lines='warn'` skips malformed rows with a warning; it replaces
# `error_bad_lines=False`, which was removed in pandas 2.0.
df_ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', encoding='latin-1',
                         on_bad_lines='warn')
# Column names are assigned positionally, so the order must match the file
df_ratings.columns = ['userID', 'ISBN', 'bookRating']
df_ratings.head()

Unnamed: 0,userID,ISBN,bookRating
0,276726,0155061224,5
1,276727,0446520802,0
2,276729,052165615X,3
3,276729,0521795028,6
4,276733,2080674722,0


In [7]:
# MERGE ratings with the book catalogue (inner join on ISBN), then discard
# the publisher and cover-image columns.
drop_columns = ['namePublisher', 'imageURLs', 'imageURLm', 'imageURLl']
df_book_ratings = df_ratings.merge(df_book, on='ISBN').drop(columns=drop_columns)
df_book_ratings.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,bookAuthor,yearOfPublication
0,276726,155061224,5,Rites of Passage,Judith Rae,2001
1,159181,155061224,0,Rites of Passage,Judith Rae,2001
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996
4,638,446520802,0,The Notebook,Nicholas Sparks,1996


In [8]:
# POPULAR Books
# Keep only the rating rows that have a book title
has_title = df_book_ratings['bookTitle'].notna()
df_book_ratings = df_book_ratings[has_title]

In [9]:
# Number of ratings each title received, most-rated titles first
rating_counts = df_book_ratings.groupby('bookTitle')['bookRating'].count()
Top_rating_book = (
    rating_counts
    .rename('totalRatingCount')
    .reset_index()
    .sort_values(by='totalRatingCount', ascending=False)
)
Top_rating_book.head()

Unnamed: 0,bookTitle,totalRatingCount
234998,Wild Animus,2502
196392,The Lovely Bones: A Novel,1295
183639,The Da Vinci Code,898
5339,A Painted House,838
199303,The Nanny Diaries: A Novel,828


In [10]:
# Total Rating: attach each title's rating count to every individual rating
# row, ordered from the most-rated title down.
ratings_with_totalCount = (
    df_book_ratings
    .merge(Top_rating_book, on='bookTitle', how='left')
    .sort_values(by='totalRatingCount', ascending=False)
)
ratings_with_totalCount.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,bookAuthor,yearOfPublication,totalRatingCount
10047,54154,971880107,0,Wild Animus,Rich Shapero,2004,2502
10617,116993,971880107,0,Wild Animus,Rich Shapero,2004,2502
10615,116341,971880107,0,Wild Animus,Rich Shapero,2004,2502
10614,116325,971880107,0,Wild Animus,Rich Shapero,2004,2502
10613,116210,971880107,0,Wild Animus,Rich Shapero,2004,2502


In [11]:
# Display setting: render floats with three decimal places
pd.set_option('display.float_format', '{:.3f}'.format)
rating_count_summary = Top_rating_book['totalRatingCount'].describe()
print(rating_count_summary)

count   241071.000
mean         4.277
std         16.739
min          1.000
25%          1.000
50%          1.000
75%          3.000
max       2502.000
Name: totalRatingCount, dtype: float64


In [12]:
# Rating-count quantiles from 0.80 upward in steps of 0.01
upper_quantiles = np.arange(0.8, 1, .01)
print(Top_rating_book['totalRatingCount'].quantile(upper_quantiles))

0.800    4.000
0.810    4.000
0.820    4.000
0.830    4.000
0.840    5.000
0.850    5.000
0.860    5.000
0.870    6.000
0.880    6.000
0.890    6.000
0.900    7.000
0.910    8.000
0.920    9.000
0.930   10.000
0.940   11.000
0.950   13.000
0.960   16.000
0.970   20.000
0.980   29.000
0.990   50.000
Name: totalRatingCount, dtype: float64


In [13]:
# Keep only ratings of titles that received at least `popularity_threshold`
# ratings.
popularity_threshold = 50
is_popular = ratings_with_totalCount['totalRatingCount'] >= popularity_threshold
rating_popular = ratings_with_totalCount[is_popular]
rating_popular.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,bookAuthor,yearOfPublication,totalRatingCount
10047,54154,971880107,0,Wild Animus,Rich Shapero,2004,2502
10617,116993,971880107,0,Wild Animus,Rich Shapero,2004,2502
10615,116341,971880107,0,Wild Animus,Rich Shapero,2004,2502
10614,116325,971880107,0,Wild Animus,Rich Shapero,2004,2502
10613,116210,971880107,0,Wild Animus,Rich Shapero,2004,2502


## NOTE THIS STEP ##

In [23]:
# Filtering with exception criteria: restrict the popular-book ratings to
# users whose location mentions Russia or the United Kingdom.

# 1. Create merge dataset: attach each rater's profile (location, age).
#    A left join keeps every rating; raters absent from df_user get NaN.
df_user_rating_book = rating_popular.merge(df_user, on='userID', how='left')

# 2. Only select text contains Russia|United Kingdom.
#    - \b word boundaries stop 'russia' from matching inside another word
#      (e.g. a location such as 'king of prussia, pennsylvania, usa').
#    - na=False treats a missing location as "no match"; without it a single
#      NaN makes the boolean mask invalid and the indexing raises ValueError.
country_pattern = r'\b(?:russia|united kingdom)\b'
is_russ_or_uk = df_user_rating_book['location'].str.contains(country_pattern, na=False)
russ_uni_user_rating = df_user_rating_book[is_russ_or_uk]

# 3. Drop the columns not needed downstream. The keyword form is required:
#    passing the axis positionally (`drop([...], 1)`) was removed in pandas 2.0.
russ_uni_user_rating = russ_uni_user_rating.drop(columns=['age', 'yearOfPublication', 'bookAuthor'])
russ_uni_user_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount,location
6,115904,971880107,0,Wild Animus,2502,"bolton, lancashire, united kingdom"
13,115087,971880107,0,Wild Animus,2502,"rugby, warwickshire, united kingdom"
17,114893,971880107,0,Wild Animus,2502,"harrogate, england, united kingdom"
18,114446,971880107,0,Wild Animus,2502,"cardiff, wales, united kingdom"
22,114288,971880107,5,Wild Animus,2502,"winchester, england, united kingdom"


In [24]:
# DROP Duplicates dataset: keep the first row for each (userID, bookTitle)
# pair and report how many rows were removed. Nothing is printed when there
# are no duplicates.
dedup_keys = ['userID', 'bookTitle']
if russ_uni_user_rating.duplicated(dedup_keys).any():
  shape_before = russ_uni_user_rating.shape
  initial_rows = shape_before[0]
  print('Initial dataframe shape {}'.format(shape_before))
  russ_uni_user_rating = russ_uni_user_rating.drop_duplicates(dedup_keys)
  shape_after = russ_uni_user_rating.shape
  current_rows = shape_after[0]
  print('New dataframe shape {}'.format(shape_after))
  print('Number of removed {} rows'.format(initial_rows - current_rows))

Initial dataframe shape (6702, 6)
New dataframe shape (6679, 6)
Number of removed 23 rows


In [25]:
# Preview the first five de-duplicated rows
russ_uni_user_rating.head(n=5)

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount,location
6,115904,971880107,0,Wild Animus,2502,"bolton, lancashire, united kingdom"
13,115087,971880107,0,Wild Animus,2502,"rugby, warwickshire, united kingdom"
17,114893,971880107,0,Wild Animus,2502,"harrogate, england, united kingdom"
18,114446,971880107,0,Wild Animus,2502,"cardiff, wales, united kingdom"
22,114288,971880107,5,Wild Animus,2502,"winchester, england, united kingdom"


In [32]:
# PIVOT data: one row per bookTitle, one column per userID, cell = bookRating.
# (title, user) pairs without a rating are filled with 0.
russ_uni_user_rating_pivot = (
    russ_uni_user_rating
    .pivot(index='bookTitle', columns='userID', values='bookRating')
    .fillna(0)
)
# Sparse (CSR) copy of the same values for the nearest-neighbour model
russ_uni_user_rating_matrix = csr_matrix(russ_uni_user_rating_pivot.to_numpy())

In [34]:
# Create MODEL: brute-force nearest-neighbour search over the title rows of
# the sparse rating matrix, using cosine distance.
# (NearestNeighbors is imported in the notebook's top import cell.)
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(russ_uni_user_rating_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [41]:
# Pick one title at random and print its nearest neighbours.
N_RECOMMENDATIONS = 5
# Fixed seed so Restart & Run All shows the same query title every time;
# change the seed to sample a different title.
rng = np.random.default_rng(42)
query_index = int(rng.integers(russ_uni_user_rating_pivot.shape[0]))

# Ask for one extra neighbour because the query title itself is usually
# among the results.
query_vector = russ_uni_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=N_RECOMMENDATIONS + 1)

print('Recommendations for {0}:\n'.format(russ_uni_user_rating_pivot.index[query_index]))

# Exclude the query by index rather than by position: on ties (e.g. when every
# returned distance is 1.0, as in the saved output) the query is not
# guaranteed to be the first neighbour, so blindly skipping element 0 could
# drop a real recommendation and list the query title itself.
neighbours = [
  (idx, dist)
  for idx, dist in zip(indices.flatten(), distances.flatten())
  if idx != query_index
][:N_RECOMMENDATIONS]

for rank, (idx, dist) in enumerate(neighbours, start=1):
  print('{0}:{1}, with distance of {2}'.format(rank, russ_uni_user_rating_pivot.index[idx], dist))

Recommendations for Chasing Cezanne:

1:Tara Road, with distance of 1.0
2:Tailchaser's Song, with distance of 1.0
3:Tales of a Fourth Grade Nothing, with distance of 1.0
4:Sword of Shannara, with distance of 1.0
5:Talking God (Jim Chee Novels), with distance of 1.0


In [42]:
# User x title rating table: one row per userID, one column per bookTitle;
# (user, title) pairs without a rating are filled with 0.
pivot_spec = dict(index='userID', columns='bookTitle', values='bookRating')
russ_uni_user_rating_pivot2 = russ_uni_user_rating.pivot(**pivot_spec).fillna(0)
russ_uni_user_rating_pivot2.head()

bookTitle,10 Lb. Penalty,1984,1st to Die: A Novel,2nd Chance,3rd Degree,4 Blondes,84 Charing Cross Road,A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,A Bend in the Road,A Case of Need,"A Child Called \It\"": One Child's Courage to Survive""",A Clockwork Orange (Norton Paperback Fiction),A Confederacy of Dunces,A Confederacy of Dunces (Evergreen Book),A Cup of Tea (Ballantine Reader's Circle),A Dangerous Fortune,A Darkness More Than Night,A Density of Souls,A Fine Balance,"A Game of Thrones (A Song of Ice and Fire, Book 1)",A Great Deliverance,A Heartbreaking Work Of Staggering Genius : A Memoir Based on a True Story,A Heartbreaking Work of Staggering Genius,A Is for Alibi (Kinsey Millhone Mysteries (Paperback)),A Lesson Before Dying (Vintage Contemporaries (Paperback)),A Little Princess,A Maiden's Grave,A Man Named Dave: A Story of Triumph and Forgiveness,A Man in Full,A Map of the World,A Monk Swimming : A Memoir,A Natural History of the Senses,A New Song (Mitford Years (Paperback)),A Painted House,A Passage to India,A Patchwork Planet,A Patchwork Planet (Ballantine Reader's Circle),A Place of Execution,A Portrait of the Artist As a Young Man,A Prayer for Owen Meany,...,Where You Belong,Where or When : A Novel,Where the Heart Is (Oprah's Book Club (Paperback)),While I Was Gone,While My Pretty One Sleeps,Whirlwind,"Whirlwind (Tyler, Book 1)","Whisper of Evil (Hooper, Kay. Evil Trilogy.)",White Noise (Contemporary American Fiction),White Oleander : A Novel,White Oleander : A Novel (Oprah's Book Club),White Teeth: A Novel,Who Moved My Cheese? 
An Amazing Way to Deal with Change in Your Work and in Your Life,Wicked: The Life and Times of the Wicked Witch of the West,Wild Animus,Wild Horses,Wild Swans: Three Daughters of China,Windmills of the Gods,Winter Moon,Winter Solstice,Winter's Tale,Wish You Well,Without Remorse,Witness in Death (Eve Dallas Mysteries (Paperback)),"Wizard and Glass (The Dark Tower, Book 4)",Women Who Run with the Wolves,Women in His Life,"Word Freak: Heartbreak, Triumph, Genius, and Obsession in the World of Competitive Scrabble Players",Wouldn't Take Nothing for My Journey Now,Writ of Execution,Wuthering Heights,Wuthering Heights (Penguin Classics),Wuthering Heights (Wordsworth Classics),Year of Wonders,You Shall Know Our Velocity,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",stardust
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Report (number of users, number of titles)
print(f"Shape of dataset: {russ_uni_user_rating_pivot2.shape}")

Shape of dataset: (1587, 1502)


In [44]:
# MODELLING 2 step
X = russ_uni_user_rating_pivot2.values.T
X.shape

(1502, 1587)

In [45]:
# Reduce each row of X to 12 components with truncated SVD.
# random_state is fixed so the decomposition is reproducible across runs.
# (sklearn and TruncatedSVD are already imported in the notebook's top import
# cell, so they are not re-imported here.)
SVD = TruncatedSVD(n_components=12, random_state=42)
matrix = SVD.fit_transform(X)
matrix.shape

(1502, 12)

In [46]:
# Pearson correlation between every pair of rows of `matrix`
# (rowvar=True is the default: each row is treated as one variable).
corr = np.corrcoef(matrix, rowvar=True)
corr.shape

(1502, 1502)

In [51]:
# Find the position of the reference title among the pivot's columns;
# list.index raises ValueError if the title is absent.
russ_uni_book_title = russ_uni_user_rating_pivot2.columns
russ_uni_book_list = russ_uni_book_title.tolist()
target_title = "Wuthering Heights (Wordsworth Classics)"
wuthering = russ_uni_book_list.index(target_title)
print(wuthering)

1494


In [53]:
# Titles whose correlation with the reference title exceeds 0.9.
corr_wuthering = corr[wuthering]
is_similar = corr_wuthering > 0.9
# Exclude the reference title by position instead of testing `< 1.0`:
# floating-point rounding can leave the self-correlation marginally away from
# exactly 1.0, and the `< 1.0` test also discarded any other title that is
# perfectly correlated with the reference.
is_similar[wuthering] = False
list(russ_uni_book_title[is_similar])

["Einstein's Dreams",
 'Hunting Badger (Joe Leaphorn/Jim Chee Novels)',
 'Love in the Time of Cholera',
 'The Hours : A Novel',
 'White Oleander : A Novel']

In [None]:
# FINISH