In [96]:
import pandas as pd
from random import randint

def generate_data(n_books = 3000, n_genres = 10, n_authors = 450, n_publishers = 50, n_readers = 30000, dataset_size = 100000):
    
    d = pd.DataFrame(
        {
            'book_id' : [randint(1, n_books) for _ in range(dataset_size)],
            'author_id' : [randint(1, n_authors) for _ in range(dataset_size)],
            'book_genre' : [randint(1, n_genres) for _ in range(dataset_size)],
            'reader_id' : [randint(1, n_readers) for _ in range(dataset_size)],
            'num_pages' : [randint(75, 700) for _ in range(dataset_size)],
            'book_rating' : [randint(1, 10) for _ in range(dataset_size)],
            'publisher_id' : [randint(1, n_publishers) for _ in range(dataset_size)],
            'publish_year' : [randint(2000, 2021) for _ in range(dataset_size)],
            'book_price' : [randint(1, 200) for _ in range(dataset_size)],
            'text_lang' : [randint(1,7) for _ in range(dataset_size)]
        }
    ).drop_duplicates()
    return d
  
df = generate_data(dataset_size = 100000)
df.to_csv('data.csv', index = False)

In [97]:
df.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang
0,2231,208,1,3261,494,10,14,2003,80,5
1,583,311,4,6241,567,8,26,2006,8,7
2,2941,54,7,311,541,4,11,2019,84,2
3,2533,137,3,15749,261,5,23,2016,152,6
4,2617,282,6,27295,533,10,8,2018,62,3


In [98]:
df = df.sort_values(by=['book_id'], ascending = True)

In [99]:
df.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang
45890,1,52,10,4729,253,6,35,2021,85,5
35501,1,15,3,14280,296,5,12,2011,199,4
31644,1,313,3,3193,221,10,11,2016,13,2
51129,1,7,6,16408,330,3,48,2016,13,2
88707,1,67,10,3079,546,2,42,2003,147,5


In [100]:
# normalizing the data to 0s and 1s
import numpy as np
'''
FORMULA
X(NORMALIZED) = (X - Xminimum)/(Xmaximum - Xminimum)

 min_val = min(data)
    if min_val < 0:
        data = [x + abs(min_val) for x in data]
    max_val = max(data)
    
    return [x/max_val for x in data]

'''
def normalize(data):    
    return (data - np.min(data)/ (np.max(data)- np.min(data)))

In [101]:
#performing one hot encoding
def encoding(df, column):
    new = pd.get_dummies(df[column])
    new.reset_index(drop = True, inplace = True)
    return pd.concat([df, new], axis = 1)

In [102]:
''' 
Creating a Conten based recommendation syatem as the base system

'''

class Recommender():
    def __init__(self,df):
        self.df = df
    
    def similarity(self, v1, v2):
        return np.sum(np.dot(v1,v2)/ np.cross(v1,v2)) 
    
    def recommend(self, book_id, rec):
        ip = self.df.loc[book_id].values
        self.df['sim'] = self.df.apply(lambda x: self.similarity(ip, x.values), axis =1)
        
        return self.df.nlargest(columns = 'sim', n = rec)

In [103]:
df['num_pages_norm'] = normalize(df['num_pages'].values)
df['book_rating_norm'] = normalize(df['book_rating'].values)
df['book_price_norm'] = normalize(df['book_price'].values)
df.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang,num_pages_norm,book_rating_norm,book_price_norm
45890,1,52,10,4729,253,6,35,2021,85,5,252.88,5.888889,84.994975
35501,1,15,3,14280,296,5,12,2011,199,4,295.88,4.888889,198.994975
31644,1,313,3,3193,221,10,11,2016,13,2,220.88,9.888889,12.994975
51129,1,7,6,16408,330,3,48,2016,13,2,329.88,2.888889,12.994975
88707,1,67,10,3079,546,2,42,2003,147,5,545.88,1.888889,146.994975


In [104]:
df = encoding(df = df, column = 'publish_year')
df = encoding(df = df, column = 'book_genre')
df = encoding(df = df, column = 'text_lang')

#dropping redndant columns
cols = ['publish_year', 'book_genre', 'num_pages', 'book_rating', 'book_price', 'text_lang']
df.drop(columns = cols, inplace = True)

In [105]:
df.head()

Unnamed: 0,book_id,author_id,reader_id,publisher_id,num_pages_norm,book_rating_norm,book_price_norm,2000,2001,2002,...,8,9,10,1,2,3,4,5,6,7
45890,1,52,4729,35,252.88,5.888889,84.994975,0,0,0,...,0,0,0,0,0,0,0,0,1,0
35501,1,15,14280,12,295.88,4.888889,198.994975,0,0,0,...,1,0,0,0,0,1,0,0,0,0
31644,1,313,3193,11,220.88,9.888889,12.994975,0,0,0,...,0,0,0,0,0,0,0,1,0,0
51129,1,7,16408,48,329.88,2.888889,12.994975,0,0,0,...,0,0,0,1,0,0,0,0,0,0
88707,1,67,3079,42,545.88,1.888889,146.994975,0,0,0,...,0,0,0,0,0,0,0,1,0,0
