#### All the Necessary Imports

In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module='pandas')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt

pd.set_option('mode.chained_assignment',None)

### 1: Read the dataset and explore it
###### Read dataset for Books and Explore
Use the "Latin-1" encoding to map byte values directly to the first 256 Unicode code points.
By specifying the data types of the columns in the "BX-Books.csv" file, handled the mixed data type warning effectively. 

In [2]:
# Reading the BX-Books dataset
books_df = pd.read_csv('BX-Books.csv', encoding="latin-1", dtype = {'isbn': str, 'book_title': str, 'book_author': str, 'year_of_publication': str, 'publisher': str})
books_df

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company
...,...,...,...,...,...
271374,440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271375,525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271377,192126040,Republic (World's Classics),Plato,1996,Oxford University Press


In [3]:
# Details of the books dataframe
num = books_df.shape
print("Number of rows:", num[0])
print("Number of columns:", num[1])
print("About the Book Dataframe: ")
books_df.describe()

Number of rows: 271379
Number of columns: 5
About the Book Dataframe: 


Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
count,271379,271379,271378,271379,271377
unique,271379,242150,102042,137,16823
top,195153448,Selected Poems,Agatha Christie,2002,Harlequin
freq,1,27,632,17627,7535


In [4]:
# Checking for Null Values in Books dataframe
books_df.isnull().sum()

isbn                   0
book_title             0
book_author            1
year_of_publication    0
publisher              2
dtype: int64

##### Read dataset for Users and Explore
Use the "Latin-1" encoding to map byte values directly to the first 256 Unicode code points.

In [5]:
user_df = pd.read_csv('BX-Users.csv', encoding='latin-1', low_memory=False)
user_df

Unnamed: 0,user_id,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",
...,...,...,...
278854,278854,"portland, oregon, usa",
278855,278855,"tacoma, washington, united kingdom",50.0
278856,278856,"brampton, ontario, canada",
278857,278857,"knoxville, tennessee, usa",


In [6]:
# Details of the users dataframe
num = user_df.shape
print("Number of rows:", num[0])
print("Number of columns:", num[1])
print("About the User Dataframe: ")
user_df.describe()

Number of rows: 278859
Number of columns: 3
About the User Dataframe: 


Unnamed: 0,Age
count,168096.0
mean,34.751434
std,14.428097
min,0.0
25%,24.0
50%,32.0
75%,44.0
max,244.0


In [7]:
# Checking for Null Values in User dataframe
user_df.isnull().sum()

user_id          0
Location         1
Age         110763
dtype: int64

### 2: Clean up NaN values
To clean up NaN values, I have drop all the NaN rows for both users and books dataframe.
##### For Books data

In [8]:
books_df.dropna(inplace=True)
books_df.reset_index(drop=True, inplace=True)
books_df

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company
...,...,...,...,...,...
271371,440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271372,525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271373,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271374,192126040,Republic (World's Classics),Plato,1996,Oxford University Press


In [9]:
books_df.isnull().sum()

isbn                   0
book_title             0
book_author            0
year_of_publication    0
publisher              0
dtype: int64

##### For Users data

In [10]:
# Dropping the Null Values.
user_df.dropna(inplace=True)
user_df.reset_index(drop=True, inplace=True)
user_df

Unnamed: 0,user_id,Location,Age
0,2,"stockton, california, usa",18.0
1,4,"porto, v.n.gaia, portugal",17.0
2,6,"santa monica, california, usa",61.0
3,10,"albacete, wisconsin, spain",26.0
4,11,"melbourne, victoria, australia",14.0
...,...,...,...
168091,278849,"georgetown, ontario, canada",23.0
168092,278851,"dallas, texas, usa",33.0
168093,278852,"brisbane, queensland, australia",32.0
168094,278853,"stranraer, n/a, united kingdom",17.0


In [11]:
user_df.isnull().sum()

user_id     0
Location    0
Age         0
dtype: int64

### 3: Read the data where ratings are given by users

Use the "Latin-1" encoding to prevent memory errors.
Use the describe() function to gain insights into the dataset.

In [12]:
# Reading the BX-Books-Ratings dataset
rating_df = pd.read_csv('BX-Book-Ratings.csv', encoding="latin-1")
rating_df

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6
...,...,...,...
1048570,250764,451410777,0
1048571,250764,452264464,8
1048572,250764,048623715X,0
1048573,250764,486256588,0


In [13]:
# Details of the rating dataframe
num = rating_df.shape
print("Number of rows:", num[0])
print("Number of columns:", num[1])
print("About the rating Dataframe: ")
rating_df.describe()

Number of rows: 1048575
Number of columns: 3
About the rating Dataframe: 


Unnamed: 0,user_id,rating
count,1048575.0,1048575.0
mean,128508.9,2.879907
std,74218.76,3.85787
min,2.0,0.0
25%,63394.0,0.0
50%,128835.0,0.0
75%,192779.0,7.0
max,278854.0,10.0


#### Merge the dataframes
For all practical purposes, User Master Data is not required. So, ignore dataframe df_user
We will take only random 12% rows otherwise, Out Of Memory error can occur.

In [14]:
df = pd.merge(rating_df, books_df, on='isbn')
df = df.sample(frac=0.012, random_state=42)
df.reset_index(drop = True, inplace = True)
df

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher
0,76352,449201651,0,Flower of Love (Fawcett Crest Book),Janet L. Roberts,1983,Fawcett Books
1,55736,1565122062,5,A Virtuous Woman,Kaye Gibbons,1997,Algonquin Books of Chapel Hill
2,192093,151004897,0,Scar Vegas: And Other Stories,Tom Paine,2000,Harcourt
3,185360,805063897,10,Nickel and Dimed: On (Not) Getting By in America,Barbara Ehrenreich,2002,Owl Books
4,87141,1565920317,6,!%@ (A Nutshell handbook),Donnalyn Frey,1993,O'Reilly
...,...,...,...,...,...,...,...
11289,230522,312169787,9,The Red Tent : A Novel,Anita Diamant,1997,St. Martin's Press
11290,278418,590489100,0,Around the World In a Hundred Years,Jean Fritz,0,Scholastic Inc
11291,219008,684196425,7,Domesticity: A Gastronomic Interpretation of Love,Bob Shacochis,1994,Simon &amp; Schuster
11292,235935,1885865309,8,Nymph,Francesca Lia Block,2000,Circlet Pr


### 4: Take a quick look at the number of unique users and books
we are useing the nunique() function to find the number of unique users and books in the dataset.

In [15]:
num_users = df['user_id'].nunique()
num_books = df['isbn'].nunique()

print("Number of unique users:", num_users)
print("Number of unique books:", num_books)

Number of unique users: 5497
Number of unique books: 9598


### 5: Convert ISBN variables to numeric numbers in the correct order

 This step ensures that the ISBN values are represented as numeric values.

In [16]:
isbn_list = df.isbn.unique()
def get_isbn_numeric_id(isbn):
    return np.where(isbn_list == isbn)[0][0]

### 6: Convert the user_id variable to numeric numbers in the correct order

This step ensures that the user_ids are represented as numeric values.

In [17]:
userid_list = df.user_id.unique()
def get_user_id_numeric_id(user_id):
    return np.where(userid_list == user_id)[0][0]

### 7: Convert both user_id and ISBN to the ordered list, i.e., from 0...n-1
Convert both 'user_id' and 'isbn' to ordered lists, i.e., from 0 to n-1.

In [18]:
df['user_id_order'] = df['user_id'].apply(get_user_id_numeric_id)
df['isbn_id'] = df['isbn'].apply(get_isbn_numeric_id)
df

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,user_id_order,isbn_id
0,76352,449201651,0,Flower of Love (Fawcett Crest Book),Janet L. Roberts,1983,Fawcett Books,0,0
1,55736,1565122062,5,A Virtuous Woman,Kaye Gibbons,1997,Algonquin Books of Chapel Hill,1,1
2,192093,151004897,0,Scar Vegas: And Other Stories,Tom Paine,2000,Harcourt,2,2
3,185360,805063897,10,Nickel and Dimed: On (Not) Getting By in America,Barbara Ehrenreich,2002,Owl Books,3,3
4,87141,1565920317,6,!%@ (A Nutshell handbook),Donnalyn Frey,1993,O'Reilly,4,4
...,...,...,...,...,...,...,...,...,...
11289,230522,312169787,9,The Red Tent : A Novel,Anita Diamant,1997,St. Martin's Press,203,6203
11290,278418,590489100,0,Around the World In a Hundred Years,Jean Fritz,0,Scholastic Inc,92,9594
11291,219008,684196425,7,Domesticity: A Gastronomic Interpretation of Love,Bob Shacochis,1994,Simon &amp; Schuster,1350,9595
11292,235935,1885865309,8,Nymph,Francesca Lia Block,2000,Circlet Pr,2381,9596


### 8. Re-index the columns to build a matrix

In [19]:
new_col_order = ['user_id_order', 'isbn_id', 'rating', 'book_title', 'book_author', 'year_of_publication', 'publisher', 'isbn', 'user_id']
df = df.reindex(columns=new_col_order)
df

Unnamed: 0,user_id_order,isbn_id,rating,book_title,book_author,year_of_publication,publisher,isbn,user_id
0,0,0,0,Flower of Love (Fawcett Crest Book),Janet L. Roberts,1983,Fawcett Books,449201651,76352
1,1,1,5,A Virtuous Woman,Kaye Gibbons,1997,Algonquin Books of Chapel Hill,1565122062,55736
2,2,2,0,Scar Vegas: And Other Stories,Tom Paine,2000,Harcourt,151004897,192093
3,3,3,10,Nickel and Dimed: On (Not) Getting By in America,Barbara Ehrenreich,2002,Owl Books,805063897,185360
4,4,4,6,!%@ (A Nutshell handbook),Donnalyn Frey,1993,O'Reilly,1565920317,87141
...,...,...,...,...,...,...,...,...,...
11289,203,6203,9,The Red Tent : A Novel,Anita Diamant,1997,St. Martin's Press,312169787,230522
11290,92,9594,0,Around the World In a Hundred Years,Jean Fritz,0,Scholastic Inc,590489100,278418
11291,1350,9595,7,Domesticity: A Gastronomic Interpretation of Love,Bob Shacochis,1994,Simon &amp; Schuster,684196425,219008
11292,2381,9596,8,Nymph,Francesca Lia Block,2000,Circlet Pr,1885865309,235935


### 9. Split your data into two sets (training and testing)
- Split the data into training and testing sets.
- Prepare the data for building user-book matrices for training and testing.
- Using sklearn train_test_split function

In [21]:
train_data, test_data = train_test_split(df, test_size=0.20)
train_data

Unnamed: 0,user_id_order,isbn_id,rating,book_title,book_author,year_of_publication,publisher,isbn,user_id
2946,376,2777,0,Shapes (Slide 'n Seek),Chuck Murphy,2001,Little Simon,689844778,16795
8341,4359,7316,7,Chasing the Dime,Michael Connelly,2002,"Little, Brown",316153915,245157
3431,876,3200,0,The One Tree (The Second Chronicles of Thomas ...,Stephen R. Donaldson,1993,Del Rey Books,345348699,21014
10512,3892,8994,0,High Hearts,Rita Mae Brown,1987,Bantam Books,553261258,116210
10959,2023,9351,5,Comeback,Dick Francis,1991,Putnam Publishing Group,399136703,106821
...,...,...,...,...,...,...,...,...,...
7433,2816,6603,0,No Place to Learn: Why Universities Aren't Wor...,Tom Pocklington,2002,Univ of Washington Pr,774808780,114868
9302,1972,8061,9,"Sir Gawain and the Green Knight, Pearl, Sir Orfeo",J. R. R. Tolkien,1988,Del Rey Books,345277600,107951
1240,970,1202,0,The Hitchhiker's Guide to the Galaxy,Douglas Adams,1989,Harmony,517542099,66942
11123,3406,9478,0,"Highland Heaven (Harlequin Historical, No 269)",Ruth Langan,1995,Harlequin,373288697,112001


In [22]:
test_data

Unnamed: 0,user_id_order,isbn_id,rating,book_title,book_author,year_of_publication,publisher,isbn,user_id
5229,92,4768,0,Pictures of Perfection: A Dalziel/Pascoe Myste...,Reginald Hill,1994,Bantam Dell Pub Group,385312709,278418
5570,482,5058,10,"Page (Protector of the Small, 2)",Tamora Pierce,2000,Random House Children's Books,679889159,110483
9876,269,8498,8,Herb Book (Herb Book),John Lust,1982,Bantam Doubleday Dell,553238272,158295
10215,5083,8763,9,Dance Hall of the Dead (Joe Leaphorn Novels),Tony Hillerman,1990,HarperTorch,61000027,136755
6515,2091,5835,0,A Bachelor and a Baby (The Mom Squad),Marie Ferrarella,2003,Harlequin,373765037,107021
...,...,...,...,...,...,...,...,...,...
4521,1698,4176,0,Words to Love by,Mother Teresa,1985,Walker Large Print,802724787,181176
7633,203,6763,0,Hide and Seek,Lorraine P. De Sosa,1992,Berkley Pub Group (Mm),1557736588,230522
10271,5104,8805,0,Land Girls: Film Tie-In,Angela Huth,1998,Little Brown and Company,349109931,190374
488,431,482,0,C Is for Corpse (Kinsey Millhone Mysteries (Pa...,Sue Grafton,1987,Bantam,553280368,127131


### 10. Make predictions based on user and item variables

#### Memory-Based Collaborative Filtering
- Use the pairwise_distances function from sklearn to calculate the cosine similarity.
- Build user-book matrices for training and testing.
- Make predictions using the user-based and item-based collaborative filtering approach.

##### Create user-book matrix for training

In [24]:
train_data_matrix = np.zeros((num_users, num_books))
for line in train_data.itertuples():
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

#####  Create user-book matrix for testing

In [25]:
test_data_matrix = np.zeros((num_users, num_books))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

In [26]:
# Calculate similarity using pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [27]:
# Make predictions
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

### 11. Use RMSE to evaluate the predictions

##### Root Mean Squared Error (RMSE)
- Use the mean_squared_error function and calculate RMSE for both user-based and item-based collaborative filtering.
- Print the RMSE values for both approaches.

In [28]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

# Print RMSE value for user-based and item-based collaborative filtering
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 7.875131892934845
Item-based CF RMSE: 7.875112528219741


#### Conclusion
Provide a concluding section summarizing the key findings and the effectiveness of the user-based and item-based collaborative filtering in generating book recommendations for users.