# Hybrid Recommendation System
a) Collaborative Filtering Model

b) Content-based Model

In [60]:
import pandas as pd
from random import randint
import numpy as np
from numpy.linalg import norm 
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from numpy import dot

In [61]:
'''
FORMULA
X(NORMALIZED) = (X - Xminimum)/(Xmaximum - Xminimum)

 min_val = min(data)
    if min_val < 0:
        data = [x + abs(min_val) for x in data]
    max_val = max(data)
    
    return [x/max_val for x in data]
'''
#normalizing the input between 0s and 1s
def normalize(data):    
    return (data - np.min(data)/ (np.max(data)- np.min(data)))


In [62]:
''' 
CREATING FUNCTIONS FOR CONTENT BASED RECOMMENDATION MODELS

''' 

#performing one hot encoding
def encoding(df, column):
    new = pd.get_dummies(df[column])
    new.reset_index(drop = True, inplace = True)
    return pd.concat([df, new], axis = 1)

class CBRecommend():
    def __init__(self, df):
        self.df = df
        
    def cosine_sim(self, v1,v2):
        '''
        This function will calculate the cosine similarity between two vectors
        '''
        return sum(dot(v1,v2)/(norm(v1)*norm(v2)))
    
    def recommend(self, book_id, n_rec):
        """
        df (dataframe): The dataframe
        song_id (string): Representing the song name
        n_rec (int): amount of rec user wants
        """
        
        # calculate similarity of input book_id vector w.r.t all other vectors
        inputVec = self.df.loc[book_id].values
        self.df['sim']= self.df.apply(lambda x: self.cosine_sim(inputVec, x.values), axis=1)

        # returns top n user specified books
        return self.df.nlargest(columns='sim',n=n_rec)

In [63]:
'''
CREATING FRUNCTION FOR COLLABORATIVE FILTER MODELLING
'''

def svd(mat, df, factors):
    if not 1<= factors < min(mat.shape):
        raise ValueError("Must be 1 <= factors < min(mat.shape)")
    
    #matrix factorization
    u,s,v = svds(mat, k = factors)
    s = np.diag(s)

    #calculating the prediction ratings
    pred = np.dot(np.dot(u,s),v)
    pred = normalize(pred) #normalizing the predictions

    new_df = pd.DataFrame(pred, columns = df.columns, index = list(df.index) ).transpose()

    return new_df


In [70]:
#generating data for various books according to book_id
def generate_data(n_books = 3000, n_genres = 10, n_authors = 450, n_publishers = 50, n_readers = 30000, dataset_size = 100000):
    
    d = pd.DataFrame(
        {
            'book_id' : [randint(1, n_books) for _ in range(dataset_size)],
            'author_id' : [randint(1, n_authors) for _ in range(dataset_size)],
            'book_genre' : [randint(1, n_genres) for _ in range(dataset_size)],
            'reader_id' : [randint(1, n_readers) for _ in range(dataset_size)],
            'num_pages' : [randint(75, 700) for _ in range(dataset_size)],
            'book_rating' : [randint(1, 10) for _ in range(dataset_size)],
            'publisher_id' : [randint(1, n_publishers) for _ in range(dataset_size)],
            'publish_year' : [randint(2000, 2021) for _ in range(dataset_size)],
            'book_price' : [randint(1, 200) for _ in range(dataset_size)],
            'text_lang' : [randint(1,7) for _ in range(dataset_size)]
        }
    ).drop_duplicates()
    return d

In [67]:
if __name__ == '__main__':

    df = generate_data(dataset_size = 100000)
    df.to_csv('data.csv', index = False)
    df = df.sort_values(by=['book_id'], ascending = True)
    
    df = df

    # normalizing
    df['num_pages_norm'] = normalize(df['num_pages'].values)
    df['book_rating_norm'] = normalize(df['book_rating'].values)
    df['book_price_norm'] = normalize(df['book_price'].values)
    
    # One hot encoding
    df = encoding(df = df, column = 'publish_year')
    df = encoding(df = df, column = 'book_genre')
    df = encoding(df = df, column = 'text_lang')


    # generate a pivot table
    pivot = df.pivot_table(
        columns = 'book_id',
        index = 'reader_id',
        values = 'book_rating'
    ).fillna(0)

    # convert to a csr matrix
    mat = pivot.values
    mat = csr_matrix(mat)
    
    #applying SVD model
    pred_df = svd(mat, pivot, 10)

    '''
    pred_df[pred_df.select_dtypes(include=['number']).columns] *= 10
    
    pred_df = pred_df.astype(int)
    print(pred_df)  

    '''


    # drop redundant columns
    cols = ['publish_year', 'book_genre', 'num_pages', 'book_rating', 'book_price', 'text_lang']
    df.drop(columns = cols, inplace = True)
    df.set_index('book_id', inplace = True)

    
    # ran on a sample as an example
    t = df.copy()
    cbr = CBRecommend(df = t)


In [68]:
user_input = int(input("Enter the relevant book id: "))
data = cbr.recommend(book_id = t.index[0], n_rec = 5)
print_columns = data[['author_id', 'reader_id', 'publisher_id', 'num_pages_norm', 'book_rating_norm', 'book_price_norm']]
print_columns

Unnamed: 0_level_0,author_id,reader_id,publisher_id,num_pages_norm,book_rating_norm,book_price_norm,2000,2001,2002,2003,...,9,10,1,2,3,4,5,6,7,sim
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
913,312,19409,36,545.88,7.888889,132.994975,0,0,0,0,...,0,0,1,0,0,0,0,0,0,4.983116
1564,377,22866,32,635.88,8.888889,169.994975,0,0,0,0,...,0,0,0,1,0,0,0,0,0,4.983115
933,347,21159,35,596.88,0.888889,154.994975,0,0,0,0,...,0,0,0,1,0,0,0,0,0,4.983115
1695,360,22905,44,621.88,2.888889,165.994975,0,0,0,0,...,0,0,0,1,0,0,0,0,0,4.983114
1608,374,24378,38,675.88,8.888889,179.994975,0,0,0,0,...,0,0,0,1,0,0,0,0,0,4.983114
