# Hybrid Recommendation using
a) Collaborative Filtering Model

b) Content-based Model

In [1]:
import pandas as pd
from random import randint

def generate_data(n_books = 3000, n_genres = 10, n_authors = 450, n_publishers = 50, n_readers = 30000, dataset_size = 100000):
    """Build a synthetic table of book ratings filled with random integers.

    Every column is drawn independently with ``randint`` (both bounds
    inclusive); exact duplicate rows are removed before returning.

    Args:
        n_books: Largest ``book_id`` that can be drawn (ids start at 1).
        n_genres: Largest ``book_genre`` value.
        n_authors: Largest ``author_id`` value.
        n_publishers: Largest ``publisher_id`` value.
        n_readers: Largest ``reader_id`` value.
        dataset_size: Number of rows drawn before de-duplication.

    Returns:
        A ``pandas.DataFrame`` with at most ``dataset_size`` rows.
    """
    # (column name, lowest value, highest value). The tuple order determines
    # both the column order of the frame and the order of the random draws.
    column_bounds = (
        ('book_id', 1, n_books),
        ('author_id', 1, n_authors),
        ('book_genre', 1, n_genres),
        ('reader_id', 1, n_readers),
        ('num_pages', 75, 700),
        ('book_rating', 1, 10),
        ('publisher_id', 1, n_publishers),
        ('publish_year', 2000, 2021),
        ('book_price', 1, 200),
        ('text_lang', 1, 7),
    )

    columns = {}
    for name, low, high in column_bounds:
        columns[name] = [randint(low, high) for _ in range(dataset_size)]

    frame = pd.DataFrame(columns)
    return frame.drop_duplicates()

In [2]:
# Generate the synthetic ratings table (100000 rows before de-duplication),
# write it to 'data.csv' without the index column, and preview the first rows.
df = generate_data(dataset_size = 100000)
df.to_csv('data.csv', index = False)
df.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang
0,765,100,3,26266,304,2,49,2005,38,2
1,2055,380,4,10768,583,10,22,2021,194,2
2,2193,212,1,17427,566,4,49,2019,89,4
3,2356,312,6,23592,122,2,14,2003,163,1
4,791,231,7,8943,391,1,13,2010,125,7


In [3]:
# Reorder the rows by ascending book_id; the original index labels are kept.
df = df.sort_values(by=['book_id'], ascending = True)

In [4]:
# Preview the first rows of df.
df.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang
4631,1,219,7,20937,336,9,30,2015,61,3
85642,1,45,7,10847,229,7,9,2002,102,3
23550,1,33,8,11374,473,7,11,2010,44,5
39830,1,426,5,16215,194,5,18,2012,83,2
74496,1,201,10,2176,450,3,49,2007,170,2


In [5]:
import numpy as np
'''
FORMULA
X(NORMALIZED) = (X - Xminimum) / (Xmaximum - Xminimum)

Alternative (shift negative values up to zero, then scale by the maximum):

    min_val = min(data)
    if min_val < 0:
        data = [x + abs(min_val) for x in data]
    max_val = max(data)

    return [x / max_val for x in data]
'''
#normalizing the input between 0s and 1s
def normalize(data):
    """Min-max scale ``data`` into the range [0, 1].

    Applies ``(x - min) / (max - min)`` element-wise, where ``min`` and
    ``max`` are taken over all elements of ``data``.

    Args:
        data: Numeric array-like (for example a NumPy array).

    Returns:
        The rescaled values: the smallest element maps to 0 and the largest
        to 1. When every element is equal, all zeros are returned.

    Raises:
        ValueError: If ``data`` is empty (raised by ``np.min``).
    """
    lowest = np.min(data)
    spread = np.max(data) - lowest

    if spread == 0:
        # Constant input: there is no range to scale by, so return zeros
        # instead of dividing by zero.
        return (data - lowest) * 0.0

    # The subtraction must happen before the division; without the
    # parentheses only the minimum is divided by the range.
    return (data - lowest) / spread


In [6]:
''' 
CREATING FUNCTIONS FOR CONTENT BASED RECOMMENDATION MODELS

''' 

#performing one hot encoding
def encoding(df, column, prefix=None):
    """Append one-hot (dummy) columns for ``column`` to ``df``.

    Args:
        df: Source ``pandas.DataFrame``; it is not modified.
        column: Name of the column in ``df`` to one-hot encode.
        prefix: Optional prefix handed to ``pd.get_dummies``. With the
            default ``None`` the new columns are named after the raw values
            found in ``column``; pass a prefix to keep names distinct when
            several encoded columns share the same values.

    Returns:
        A new DataFrame holding every column of ``df`` followed by one
        indicator column per distinct value of ``column``, on ``df``'s index.
    """
    dummies = pd.get_dummies(df[column], prefix=prefix)

    # The dummies already carry df's index, so concatenating directly keeps
    # each indicator row attached to the row it was computed from. Resetting
    # only the dummies' index makes concat align on mismatched labels.
    return pd.concat([df, dummies], axis=1)

In [8]:
'''
CREATING FUNCTIONS FOR COLLABORATIVE FILTERING MODELLING
'''
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

def svd(mat, df, factors):
    """Rebuild ``mat`` from a truncated SVD and return it as a labelled frame.

    Args:
        mat: Matrix passed to ``scipy.sparse.linalg.svds``.
        df: DataFrame whose ``index`` and ``columns`` label the rows and
            columns of the reconstructed matrix.
        factors: Number of singular values/vectors to keep.

    Returns:
        The normalized reconstruction as a DataFrame, transposed so that
        ``df.columns`` become the row labels and ``df.index`` the columns.

    Raises:
        ValueError: If ``factors`` is not in ``[1, min(mat.shape))``.
    """
    if not (1 <= factors < min(mat.shape)):
        raise ValueError("Must be 1 <= factors < min(mat.shape)")

    # Truncated factorization: mat ~ left @ diag(singular_values) @ right
    left, singular_values, right = svds(mat, k=factors)
    sigma = np.diag(singular_values)

    # Reconstruct the predicted ratings, then rescale them with normalize()
    approx = normalize(left @ sigma @ right)

    labelled = pd.DataFrame(approx, index=list(df.index), columns=df.columns)
    return labelled.T

def similarity(v1, v2):
    """Return the cosine similarity of two 1-D numeric vectors.

    Computes ``dot(v1, v2) / (||v1|| * ||v2||)``. Dividing by the cross
    product, as this function used to, is not a similarity measure, and
    ``np.cross`` only accepts vectors with 2 or 3 components.

    Args:
        v1: First vector (1-D array-like of numbers).
        v2: Second vector, the same length as ``v1``.

    Returns:
        A float in ``[-1, 1]``, or ``0.0`` when either vector has zero
        length (the cosine is undefined there).
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)

    scale = np.linalg.norm(a) * np.linalg.norm(b)
    if scale == 0:
        # At least one vector is all zeros; avoid a division by zero.
        return 0.0

    return float(np.dot(a, b) / scale)


In [44]:
''' 
Creating a content-based recommendation system as the base system

class Recommender():
    def __init__(self,df):
        self.df = df
    
    
    
    def recommend(self, book_id, rec):
        ip = self.df.loc[book_id].values
        self.df['sim'] = self.df.apply(lambda x: self.similarity(ip, x.values), axis =1)
        
        return self.df.nlargest(columns = 'sim', n = rec)
        
'''

" \nCreating a Content based recommendation syatem as the base system\n\nclass Recommender():\n    def __init__(self,df):\n        self.df = df\n    \n    def similarity(self, v1, v2):\n        return np.sum(np.dot(v1,v2)/ np.cross(v1,v2)) \n    \n    def recommend(self, book_id, rec):\n        ip = self.df.loc[book_id].values\n        self.df['sim'] = self.df.apply(lambda x: self.similarity(ip, x.values), axis =1)\n        \n        return self.df.nlargest(columns = 'sim', n = rec)\n        \n"

In [45]:
''' 
df['num_pages_norm'] = normalize(df['num_pages'].values)
df['book_rating_norm'] = normalize(df['book_rating'].values)
df['book_price_norm'] = normalize(df['book_price'].values)


df = encoding(df = df, column = 'publish_year')
df = encoding(df = df, column = 'book_genre')
df = encoding(df = df, column = 'text_lang')

#dropping redundant columns
cols = ['publish_year', 'book_genre', 'num_pages', 'book_rating', 'book_price', 'text_lang']
df.drop(columns = cols, inplace = True)

''' 

" \ndf['num_pages_norm'] = normalize(df['num_pages'].values)\ndf['book_rating_norm'] = normalize(df['book_rating'].values)\ndf['book_price_norm'] = normalize(df['book_price'].values)\n\n\ndf = encoding(df = df, column = 'publish_year')\ndf = encoding(df = df, column = 'book_genre')\ndf = encoding(df = df, column = 'text_lang')\n\n#dropping redndant columns\ncols = ['publish_year', 'book_genre', 'num_pages', 'book_rating', 'book_price', 'text_lang']\ndf.drop(columns = cols, inplace = True)\n\n"

In [13]:
''' 
CREATING THE FINAL HYBRID MODEL

''' 

def hybrid(reader_id, book_id, data, n_recs, cosine, svd_model):
    """Recommend books by combining content similarity with predicted ratings.

    The rows of ``data`` are first ordered by their similarity to the query
    book, then scored with ``svd_model`` and re-ranked by that score.

    Args:
        reader_id: Reader identifier passed to ``svd_model.predict``.
        book_id: Row selector into ``cosine``; it is converted with ``int``
            and used as a position.
            NOTE(review): this treats ``book_id`` as a positional index, not
            as a value of the ``book_id`` column - confirm against the caller.
        data: DataFrame containing at least the columns ``book_id``,
            ``book_rating``, ``num_pages``, ``publish_year``, ``book_price``
            and ``reader_id``; rows are addressed by position.
        n_recs: Number of rows to return.
        cosine: 2-D similarity structure; ``cosine[i]`` holds the similarity
            of position ``i`` to every row position of ``data``.
        svd_model: Object exposing ``predict(reader, book, rating)`` whose
            result has an ``est`` attribute.

    Returns:
        The ``n_recs`` rows with the highest predicted score, with that score
        stored in a new ``predicted`` column.
    """
    # Similarity values: pair each position with its score, most similar first.
    ranked = sorted(
        enumerate(cosine[int(book_id)]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Metadata for the ranked positions. Copy so the new column below is
    # written to an independent frame rather than to a slice of `data`.
    positions = [position for position, _ in ranked]
    wanted = ['book_id', 'book_rating', 'num_pages', 'publish_year', 'book_price', 'reader_id']
    books = data.iloc[positions][wanted].copy()

    # Apply the model: estimated rating of each candidate for this reader.
    books['predicted'] = books.apply(
        lambda row: svd_model.predict(reader_id, row['book_id'], row['book_rating']).est,
        axis=1,
    )

    # Sort on the column created above. Sorting on 'est' raised KeyError
    # because no column of that name exists.
    books = books.sort_values('predicted', ascending=False)

    return books.head(n_recs)