# Content_based Recommendation System

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
!pip install rank-bm25
from rank_bm25 import BM25Okapi
import math
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import re
import string
import random
import matplotlib.pyplot as plt
import scipy.sparse as sparse
import pickle
%matplotlib inline



# 1. Load Dataset

In [8]:
# read in dataset
df = pd.read_csv('../Annotated_dataset/item_inc_lang_age.csv')
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,language,age
0,21310,Princess Poppy: The Big Mix Up,Janey Louise Jones,Penguin Random House Children's UK,YFB,['5AH'],en,1.25
1,73018,Einfach zeichnen! Step by Step,Wiebke Krabbe,Schwager und Steinlein,AGZ,"['5AJ', 'AGZ', 'WFA', 'YBG', 'YBL', 'YNA', 'YPA']",de,1.0
2,19194,Red Queen 1,Victoria Aveyard,Orion Publishing Group,YFH,"['5AP', 'FBA']",en,1.75
3,40250,Meine Kindergarten-Freunde (Pirat),,Ars Edition GmbH,YB,"['5AC', '5AD', 'YBG', 'YBL', 'YF']",de,0.0
4,46107,Mein großes Schablonen-Buch - Wilde Tiere,Elizabeth Golding,Edition Michael Fischer,WFTM,"['WD', 'WFTM', 'YBG', 'YBL', 'YBLD', 'YBLN1']",de,0.0


# 2. Model Building

## 2.1 Define FUnction
- def searchquery: search the specific item's information
- def calculate: calculate the similaity between titles
- def linsimilarity: a measure for similarity of two books' main topics

In [9]:
def searchquery(df, itemID, num_recommendation = 5):
    #Find the query item
    query = df.loc[df['itemID'] == itemID]
    return query

In [10]:
#function required for title simililarity calculation
def calculate_bm25(query, corpus):
    tokenized_corpus = [doc.split(" ") for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = query.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    return doc_scores

In [11]:
#The Linsimilarityfunction gets two strings (two main topics), Based on Lin similarity formula, then a floating number will
#return as a measure for similarity of two books.
def linsimilarityfunction (maintopic1 , maintopic2):
    #if two topics are identical the Lin similarity measure should be 1:
    if (maintopic1 == maintopic2):
        return 1
    #if they are not identical, we should do further analysis.
    #we compare the first elements of both lists. If they differ from each other, the Lin similarity will be zero.
    elif (maintopic1[0] != maintopic2[0]):
        return 0
    #otherwise we need further steps!
    else:
        #here we simply find the frequency of each topic in our dataset as it is required by Lin similarity
        totalitems = df['main topic'].count()
        minlength=min(len(maintopic1),len(maintopic2))
        df1=df[df['main topic']==maintopic1]
        df2=df[df['main topic']==maintopic2]
        count1=df1['main topic'].count()
        count2=df2['main topic'].count()
        #Now, we can focus on finding the closest mutual ancestor of both topics. By close, we refer to number of
        #node traverse in the tree of all topics.
        i=minlength
        while (i>0):
            if (maintopic1[:i]==maintopic2[:i]):
                dfmutual=df[df['main topic'].str[:i]==maintopic1[:i]]
                countmutual=dfmutual['main topic'].count()
                sim = 2*math.log10(countmutual/totalitems)/(math.log10(count1/totalitems)+math.log10(count2/totalitems))
                return sim
            else:
                i=i-1
                
# example
print(linsimilarityfunction ('YB', 'YFB'))

0.10116165919446315


## 2.2 Data processing for age

In [12]:
#preprocessing for age
#if age = 'None', we assigned the item value 20 (as one of the member in Group others)
df['age'] = [20 if i == 'None' else i for i in df['age']]

#change data type from string to float
df['age'] = df['age'].astype(float)

## 2.3 Load the model that stores feature weights

In [13]:
from sklearn.linear_model import LogisticRegression
import pickle

def load_model():
    filename = "../Annotated_dataset/finalized_LR_model.sav"
    loaded_model = pickle.load(open(filename, 'rb'))
    return loaded_model

In [17]:
import warnings
warnings.filterwarnings("ignore")

# 3. Content-Based Recommendation System

In [25]:
def recommend(df, itemID, num_recommendation = 5):
    
    #Find the item index
    item_index = int(df[df['itemID'] == itemID].index.values)

    #metadata for the itemID
    item_metadata = df[df['itemID'] == itemID]
    item_metadata = item_metadata.squeeze(axis=0)
    
    #0. Language: Keep data with same langauge. 
    # + remove itemID (this is the query item) from df dataframe
    df = df.loc[df['itemID'] != itemID]
    df = df.loc[df['language'] == item_metadata.language]
    
    #1. Title
    item_title = item_metadata.title #this is the query sentence
    if pd.isnull(item_title)== False:
        df['title_bm25_scores'] = calculate_bm25(item_title, df.title)
    else: 
        df['title_bm25_scores'] = 0

    
    #2. Title-Author combination best selections
    item_author = item_metadata.author
    df1 = df.loc[df['author'] == item_author]
    df2 = df.loc[df['author'] != item_author]
    df1['author_scores'] = 1
    df2['author_scores'] = 0
       
 
    #3.combining all we have for final evaluation
    df2 = df2.sort_values(by=['title_bm25_scores'], ascending=False)
    if len(df2.index) >= 1000:
        df2= df2[:1000]
    frames = [df1, df2]
    df = pd.concat(frames)

 
    #4. Calculate the closeness of age between query item and candidate items. 
    # At the end, create an new column called "age_closeness"
    #age distance
    #candidate's age minus query item's age
    df['age_distance'] = df['age'] - item_metadata['age']
    #age Normalization: use Max-min Normalization
    df['age_distance'] = (df['age_distance'] - df['age_distance'].min()) / (df['age_distance'].max() - df['age_distance'].min())
    #How close the age distance is? (Closeness = 1 - age_distance)
    df['age_closeness'] = 1 - abs(df['age_distance']) #the larger value the closer age is
    
    
    #5. Main topic: use Linsimilarity
    item_maintopic = item_metadata['main topic']
    if pd.isnull(item_maintopic)==False:
        df1 = df[df['main topic'].notna()]
        df2 = df[df['main topic'].isna()]
        df1['maintopic_scores'] = [linsimilarityfunction (item_maintopic, topic) for topic in df1['main topic']]
        df2['maintopic_scores'] = 0
        #combining all we have for final evaluation
        frames = [df1, df2]
        df = pd.concat(frames)
    else:
        df['maintopic_scores'] = 0
    
    #normalize all scores.
    df['maintopic_scores'] = (df['maintopic_scores'] - df['maintopic_scores'].min()) / (df['maintopic_scores'].max() - df['maintopic_scores'].min())
    Total = df['title_bm25_scores'].sum()
    if (Total > 0):
        df['title_bm25_scores'] = (df['title_bm25_scores'] - df['title_bm25_scores'].min()) / (df['title_bm25_scores'].max() - df['title_bm25_scores'].min())   
  
    #6. final score and limit the number of recommendations to 10
    
    model = load_model()
    features = ['author_scores', 'maintopic_scores', 'age_closeness', 'title_bm25_scores']
    
    X = df[features].values.reshape(-1, len(features))
    df['average_score'] = model.predict(X)
    
    result = df.sort_values(by=['average_score'], ascending=False)
    return result.iloc[:num_recommendation]

# 4. Use Recommendation System to Recommend 5 Books

In [26]:
query_item = searchquery(df, 445, 5) #k = 5
query_item.head()

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,language,age
7663,445,DIE PHANTASTISCHE REISE,Isaac Asimov,epubli,FL,[''],de,20.0


In [27]:
rec_books = recommend(df, 445, 5) #k = 5
rec_books.head()

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,language,age,title_bm25_scores,author_scores,age_distance,age_closeness,maintopic_scores,average_score
18451,51683,DIE NANOREVOLUTION,Armin M. Kittl,Books on Demand,FL,[''],de,20.0,1.0,0,1.0,0.0,1.0,800
53704,31326,DIE PSI-DROGE,Mike Dolinsky,epubli,FL,[''],de,20.0,1.0,0,1.0,0.0,1.0,800
837,5553,Die Foundation-Trilogie,Isaac Asimov,Heyne Taschenbuch,FLC,['FLS'],de,20.0,0.0,1,1.0,0.0,0.560347,700
30312,38751,Sophie and Friends,Nancy N. Rue,Zonderkidz,YXHB,"['5PGM', 'YFK']",de,1.5,0.0,0,0.075,0.925,0.0,700
30311,18348,Das Haus Zamis 34. Sonst fressen dich die Raben!,"Catalina Corvo, Susanne Wilhelm",Zaubermond Verlag,FH,['FK'],de,20.0,0.0,0,1.0,0.0,0.221152,700


In [28]:
query_item = searchquery(df, 38751, 5) #k = 5
query_item.head()

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,language,age
30312,38751,Sophie and Friends,Nancy N. Rue,Zonderkidz,YXHB,"['5PGM', 'YFK']",de,1.5


In [29]:
rec_books = recommend(df, 38751, 5) #k = 5
display(rec_books)

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics,language,age,title_bm25_scores,author_scores,age_distance,age_closeness,maintopic_scores,average_score
76883,24549,Sophie Under Pressure,Nancy N. Rue,Zondervan,YFK,"['5PGM', 'YX']",de,1.5,0.672357,1,0.075,0.925,0.112469,800
76884,58520,Sophie Gets Real,Nancy N. Rue,Zondervan,YFK,"['5PGM', 'YX']",de,1.5,0.672357,1,0.075,0.925,0.112469,800
37078,49929,Herr der Welt,Jules Verne,TP Verone Publishing,FB,[''],de,20.0,0.0,0,1.0,0.0,0.0,700
33059,26876,Alfie George Und Die Schokoladendiebe,Martin Holt,Lulu.com,YFB,[''],de,1.5,0.0,0,0.075,0.925,0.141352,700
36140,57147,Das Weihnachtsgeheimnis,Jostein Gaarder,dtv Verlagsgesellschaft,YFB,"['5AG', '5HPD', '1DNN', 'FV', 'YFN', 'YNR']",de,0.75,0.0,0,0.0375,0.9625,0.141352,700
