# Data Pre-Processing 

In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Book Dataset Cleaning 

In [2]:
books = pd.read_csv('books_data/books.csv' , sep=";", on_bad_lines="skip", encoding="latin-1",low_memory=False )

In [3]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
books.shape

(271360, 8)

In [5]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [6]:
books.duplicated().sum()

0

In [7]:
books.nunique()

ISBN                   271360
Book-Title             242135
Book-Author            102022
Year-Of-Publication       118
Publisher               16807
Image-URL-S            271044
Image-URL-M            271044
Image-URL-L            271041
dtype: int64

In [8]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271358 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [9]:
books['Year-Of-Publication'].unique()

array(['2002', '2001', '1991', '1999', '2000', '1993', '1996', '1988',
       '2004', '1998', '1994', '2003', '1997', '1983', '1979', '1995',
       '1982', '1985', '1992', '1986', '1978', '1980', '1952', '1987',
       '1990', '1981', '1989', '1984', '0', '1968', '1961', '1958',
       '1974', '1976', '1971', '1977', '1975', '1965', '1941', '1970',
       '1962', '1973', '1972', '1960', '1966', '1920', '1956', '1959',
       '1953', '1951', '1942', '1963', '1964', '1969', '1954', '1950',
       '1967', '2005', '1957', '1940', '1937', '1955', '1946', '1936',
       '1930', '2011', '1925', '1948', '1943', '1947', '1945', '1923',
       '2020', '1939', '1926', '1938', '2030', '1911', '1904', '1949',
       '1932', '1928', '1929', '1927', '1931', '1914', '2050', '1934',
       '1910', '1933', '1902', '1924', '1921', '1900', '2038', '2026',
       '1944', '1917', '1901', '2010', '1908', '1906', '1935', '1806',
       '2021', '2012', '2006', 'DK Publishing Inc', 'Gallimard', '1909',
       

## Age has unusual values --may affect later 

In [10]:
books['Year-Of-Publication'] = books['Year-Of-Publication'].replace({'DK Publishing Inc': np.nan, 
                                                                     'Gallimard': np.nan})

In [11]:
books.dropna(inplace=True)

In [12]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
Image-URL-S            0
Image-URL-M            0
Image-URL-L            0
dtype: int64

# Renaming columns for simplicity

In [13]:
books.rename(columns ={
    "Book-Title":"Title",
    "Book-Author":"Author",
    "Year-Of-Publication":"Year",
    "Image-URL-M":"Img_url"
}, inplace =True)
books.head(2)

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Image-URL-S,Img_url,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


In [14]:
df = books.drop(columns =['Image-URL-S','Image-URL-L'])

In [15]:
df.dtypes

ISBN         object
Title        object
Author       object
Year         object
Publisher    object
Img_url      object
dtype: object

In [16]:
df['Year']= df['Year'].astype(int)
df.dtypes

ISBN         object
Title        object
Author       object
Year          int32
Publisher    object
Img_url      object
dtype: object

In [58]:
df.columns

Index(['ISBN', 'Title', 'Author', 'Year', 'Publisher', 'Img_url'], dtype='object')

In [64]:
users =pd.read_csv('books_data/users.csv', sep=";", on_bad_lines="skip", encoding="latin-1",low_memory=False)

## Explore User's Dataset

In [65]:
users.head(2)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [66]:
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [67]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [68]:
users.describe()

Unnamed: 0,User-ID,Age
count,278858.0,168096.0
mean,139429.5,34.751434
std,80499.51502,14.428097
min,1.0,0.0
25%,69715.25,24.0
50%,139429.5,32.0
75%,209143.75,44.0
max,278858.0,244.0


# Age minimum=0 and Maximum=244 not possible --scale them 

In [69]:
users.loc[:, 'Age'] = users['Age'].fillna(users['Age'].median())

In [70]:
users.loc[(users['Age'] < 8) | (users['Age'] > 90), 'Age'] = np.nan
# users['Age'] = users['Age'].mask((users['Age'] < 8) | (users['Age'] > 90), np.nan)

In [71]:
users.isnull().sum()

User-ID        0
Location       0
Age         1383
dtype: int64

In [72]:
users.dropna(inplace=True)

In [73]:
users.isnull().sum()

User-ID     0
Location    0
Age         0
dtype: int64

In [74]:
users.describe()

Unnamed: 0,User-ID,Age
count,277475.0,277475.0
mean,139425.683391,33.643889
std,80491.785642,10.607856
min,1.0,8.0
25%,69713.5,29.0
50%,139415.0,32.0
75%,209127.5,35.0
max,278858.0,90.0


In [75]:
users.shape

(277475, 3)

In [76]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",32.0
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",32.0
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",32.0


In [77]:
users[['city', 'country']] = users['Location'].apply(lambda x: pd.Series(x.split(",")[-2:])).fillna("-1")
users['city'] = users['city'].str.strip().str.lower()
users['country'] = users['country'].str.strip().str.lower()

print(users[['Location', 'city', 'country']].head())

                             Location             city         country
0                  nyc, new york, usa         new york             usa
1           stockton, california, usa       california             usa
2     moscow, yukon territory, russia  yukon territory          russia
3           porto, v.n.gaia, portugal         v.n.gaia        portugal
4  farnborough, hants, united kingdom            hants  united kingdom


In [79]:
users=users.drop(columns=['Location'])

In [80]:
users.head(2)

Unnamed: 0,User-ID,Age,city,country
0,1,32.0,new york,usa
1,2,18.0,california,usa


In [81]:
users['country'].value_counts().head(15)

country
usa               139205
canada             21589
united kingdom     18421
germany            16940
spain              13060
australia          11685
italy              11209
                    4558
france              3431
portugal            3288
new zealand         3077
netherlands         3004
switzerland         1730
brazil              1655
china               1482
Name: count, dtype: int64

In [87]:
country_counts = users['country'].value_counts()  # Count occurrences of each country
users.loc[(users['country'].map(country_counts) < 10000) | (users['country'] == ''), 'country'] = "other"
users['country'].value_counts()

country
usa               139205
other              45366
canada             21589
united kingdom     18421
germany            16940
spain              13060
australia          11685
italy              11209
Name: count, dtype: int64

In [84]:
users['city'].value_counts().head(25)

city
california             19870
n/a                    12493
england                10762
ontario                 8732
texas                   8389
new york                7875
florida                 7019
pennsylvania            6089
illinois                5863
washington              5815
british columbia        5425
ohio                    4675
michigan                4586
oregon                  4326
virginia                4315
massachusetts           3964
missouri                3925
nordrhein-westfalen     3880
north carolina          3724
                        3647
new jersey              3587
victoria                3369
new south wales         3361
georgia                 3356
minnesota               3077
Name: count, dtype: int64

In [88]:
users.columns

Index(['User-ID', 'Age', 'city', 'country'], dtype='object')

In [90]:
df_users = users[['User-ID', 'Age', 'country']]

In [91]:
df_users.head(2)

Unnamed: 0,User-ID,Age,country
0,1,32.0,usa
1,2,18.0,usa


In [92]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
Index: 277475 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   User-ID  277475 non-null  int64  
 1   Age      277475 non-null  float64
 2   country  277475 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 8.5+ MB


# Ratings dataset Preprocessing

In [93]:
rating =pd.read_csv('books_data/ratings.csv',sep=";", on_bad_lines="skip", encoding="latin-1",low_memory=False)

In [94]:
rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [95]:
rating.duplicated().sum()

0

In [107]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1028948 entries, 10 to 1149775
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1028948 non-null  int64 
 1   ISBN         1028948 non-null  object
 2   Book-Rating  1028948 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 31.4+ MB


In [109]:
rating['Book-Rating'].max()

10

In [96]:
rating.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [99]:
rating['User-ID'].value_counts().nlargest(10).index

Index([11676, 198711, 153662, 98391, 35859, 212898, 278418, 76352, 110973,
       235105],
      dtype='int64', name='User-ID')

In [106]:
#we have to remove with very less ratings as they don't read much
user_counts = rating['User-ID'].value_counts()
active_users = user_counts[user_counts >= 5].index
rating = rating[rating['User-ID'].isin(active_users)]

In [111]:
num_ratings = rating['ISBN'].value_counts()
got_ratings = num_ratings[num_ratings >= 50].index
rating = rating[rating['ISBN'].isin(got_ratings)]

In [113]:
rating.shape

(191847, 3)

In [115]:
rating.to_csv("processed_ratings.csv", index=False)

In [117]:
df.to_csv("processed_books.csv", index=False)

In [118]:
df_users.to_csv("processed_users.csv", index=False)

# Model Building -- Hybrid Approach (CF+CBF)

In [121]:
ratings_matrix = rating.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating')
ratings_matrix = ratings_matrix.fillna(0)

In [122]:
from sklearn.metrics.pairwise import cosine_similarity
book_similarity = cosine_similarity(ratings_matrix.T)
book_similarity_df = pd.DataFrame(book_similarity, index=ratings_matrix.columns, columns=ratings_matrix.columns)

In [127]:
book_similarity_df

ISBN,000649840X,002026478X,0020442203,002542730X,0028604199,006000438X,0060008032,0060008776,006001203X,0060085444,...,157322930X,1573229326,1573229571,1573229725,1576737330,1592400876,1878424319,1885171080,1931561648,8873122933
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000649840X,1.000000,0.000000,0.034797,0.074882,0.000000,0.083950,0.035959,0.0,0.026934,0.0,...,0.000000,0.021980,0.024002,0.041393,0.000000,0.026454,0.022286,0.000000,0.038231,0.000000
002026478X,0.000000,1.000000,0.000000,0.000000,0.000000,0.051100,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.034441,0.000000,0.000000,0.000000,0.000000
0020442203,0.034797,0.000000,1.000000,0.075568,0.031304,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.023042,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
002542730X,0.074882,0.000000,0.075568,1.000000,0.247545,0.013034,0.017911,0.0,0.000000,0.0,...,0.026054,0.049007,0.031882,0.000000,0.034889,0.028794,0.000000,0.000000,0.019043,0.000000
0028604199,0.000000,0.000000,0.031304,0.247545,1.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.019465,0.000000,0.000000,0.021471,0.026773,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592400876,0.026454,0.034441,0.000000,0.028794,0.026773,0.020528,0.053286,0.0,0.025357,0.0,...,0.000000,0.036951,0.033177,0.061339,0.028179,1.000000,0.000000,0.000000,0.029993,0.000000
1878424319,0.022286,0.000000,0.000000,0.000000,0.000000,0.019764,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.059181,0.000000,0.057893
1885171080,0.000000,0.000000,0.000000,0.000000,0.000000,0.106399,0.032493,0.0,0.000000,0.0,...,0.000000,0.035751,0.000000,0.035908,0.000000,0.000000,0.059181,1.000000,0.000000,0.000000
1931561648,0.038231,0.000000,0.000000,0.019043,0.000000,0.029667,0.040770,0.0,0.000000,0.0,...,0.000000,0.056962,0.027214,0.000000,0.000000,0.029993,0.000000,0.000000,1.000000,0.000000


In [130]:
def recommend_books_cf(isbn, book_similarity_df, top_n=10):
    sim_scores = book_similarity_df[isbn].sort_values(ascending=False)
    top_similar_books = sim_scores[1:top_n+1]
    return df[df['ISBN'].isin(top_similar_books.index)]['Title']
recommend_books_cf('000649840X', book_similarity_df)

1263                                Memoirs of a Geisha Uk
1721     As Nature Made Him: The Boy Who Was Raised As ...
6396                                 Bridget Jones's Diary
6758                                 84 Charing Cross Road
10519                                             Stargirl
10622     Blood and Gold (Rice, Anne, Vampire Chronicles.)
26698                                  Honor Among Thieves
35059                                          Dead Famous
Name: Title, dtype: object

In [131]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
ratings_matrix_sparse = csr_matrix(ratings_matrix)
book_similarity_sparse = cosine_similarity(ratings_matrix_sparse.T, ratings_matrix_sparse.T)

In [132]:
# import pandas as pd
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.decomposition import TruncatedSVD
# from sklearn.feature_extraction.text import CountVectorizer
# from scipy.sparse import csr_matrix

# # Function to calculate content-based similarity using CountVectorizer
# def content_based_filtering(books, ratings):
#     # Preprocessing: Use book features (Author, Year) for content-based filtering
#     books = books[['ISBN', 'Author', 'Year']]  # You can add more features here
    
#     # Fill missing values with placeholders
#     books['Author'] = books['Author'].fillna('Unknown')
#     books['Year'] = books['Year'].fillna(0).astype(str)

#     # Combine 'Author' and 'Year' to form a content-based feature (you can add more features)
#     books['content'] = books['Author'] + ' ' + books['Year']
    
#     # Use CountVectorizer for feature extraction (more memory-efficient than TF-IDF)
#     count_vectorizer = CountVectorizer(stop_words='english', lowercase=True)
#     content_matrix = count_vectorizer.fit_transform(books['content'])
    
#     # Compute cosine similarity between books (using sparse matrix)
#     cosine_sim = cosine_similarity(content_matrix, content_matrix)
    
#     return cosine_sim, books

# # Function to calculate collaborative filtering using matrix factorization
# def collaborative_filtering(ratings, n_components=50):
#     # Create a user-item matrix (using ISBN and User-ID)
#     user_item_matrix = ratings.pivot(index='User-ID', columns='ISBN', values='Book-Rating')

#     # Fill NaN values with 0 (since no rating means 0 preference)
#     user_item_matrix = user_item_matrix.fillna(0)

#     # Convert to a sparse matrix for efficiency
#     sparse_matrix = csr_matrix(user_item_matrix)

#     # Apply matrix factorization (SVD)
#     svd = TruncatedSVD(n_components=n_components)
#     latent_matrix = svd.fit_transform(sparse_matrix)

#     # Compute similarity between items
#     item_similarity_matrix = cosine_similarity(latent_matrix.T)

#     return item_similarity_matrix, user_item_matrix

# # Function to generate hybrid recommendations
# def hybrid_recommendation(user_id, books, ratings, content_weight=0.5, collaborative_weight=0.5):
#     # Step 1: Calculate content-based similarity
#     content_sim, _ = content_based_filtering(books, ratings)
    
#     # Step 2: Calculate collaborative filtering similarity
#     collaborative_sim, user_item_matrix = collaborative_filtering(ratings)

#     # Get the books the user has already rated
#     user_ratings = user_item_matrix.loc[user_id]

#     # Step 3: Generate predictions using content-based and collaborative methods
#     content_based_scores = content_sim.dot(user_ratings) / np.array([np.abs(content_sim).sum(axis=1)]).T
#     collaborative_based_scores = collaborative_sim.dot(user_ratings) / np.array([np.abs(collaborative_sim).sum(axis=1)]).T

#     # Step 4: Combine both recommendations (content-based and collaborative)
#     hybrid_scores = content_weight * content_based_scores + collaborative_weight * collaborative_based_scores

#     # Step 5: Get the top N book recommendations
#     recommended_books_idx = np.argsort(hybrid_scores, axis=0)[::-1][:10]
    
#     recommended_books = books.iloc[recommended_books_idx.flatten()]
#     return recommended_books

# # Example Usage:
# # Assuming 'books', 'ratings' are your preprocessed datasets

# # Example: Get book recommendations for user with ID 1
# user_id = 1
# recommended_books = hybrid_recommendation(user_id, df , rating)
# print(recommended_books[['ISBN', 'Title', 'Author']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books['Author'] = books['Author'].fillna('Unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books['Year'] = books['Year'].fillna(0).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books['content'] = books['Author'] + ' ' + books['Year']


MemoryError: Unable to allocate 25.6 GiB for an array with shape (3439760526,) and data type int64

MemoryError: Unable to allocate 777. MiB for an array with shape (31000, 3286) and data type int64