In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import networkx as nx
from mlxtend.frequent_patterns import fpgrowth

In [2]:
#load the dataset
df_books = pd.read_csv('Books.csv', delimiter = ';')
df_books.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


In [3]:
df_ratings = pd.read_csv('Ratings.csv', delimiter = ';')
df_ratings.head()

Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
df_users = pd.read_csv('Users.csv', delimiter = ';')
df_users.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,User-ID,Age
0,1,
1,2,18.0
2,3,
3,4,17.0
4,5,


In [5]:
merged = pd.merge(df_books, df_ratings, on = "ISBN")
merged.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,User-ID,Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11400,0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,41385,0


In [6]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031187 entries, 0 to 1031186
Data columns (total 7 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   ISBN       1031187 non-null  object
 1   Title      1031187 non-null  object
 2   Author     1031186 non-null  object
 3   Year       1031187 non-null  int64 
 4   Publisher  1031185 non-null  object
 5   User-ID    1031187 non-null  int64 
 6   Rating     1031187 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 62.9+ MB


In [7]:
merged_drop = merged.drop(columns = ['ISBN', 'Publisher', 'Year', 'Author'])
merged_drop.head()

Unnamed: 0,Title,User-ID,Rating
0,Classical Mythology,2,0
1,Clara Callan,8,5
2,Clara Callan,11400,0
3,Clara Callan,11676,8
4,Clara Callan,41385,0


In [8]:
#get only the books user actually liked, which means the rate is more than 5.
filtered = merged_drop[(merged_drop['Rating'] >= 5)]

In [9]:
#For each user get a list of the books the user liked. 
tra_list = filtered.groupby(by = ["User-ID"])["Title"].apply(list).reset_index()

In [10]:
tra_list = tra_list["Title"].to_list()
print(tra_list)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [11]:
#convert to transactions, so the fpgrowth algorithm can deal with it.
te = TransactionEncoder()
test_size = int(len(tra_list) * 0.2)
te_ary = te.fit_transform(tra_list)
df = pd.DataFrame(te_ary, columns=te.columns_)

In [12]:
# split to 80% train and 20% test
df_test = df[0:test_size]
df = df[test_size:len(tra_list)]

In [15]:
# get the rules from dataset, in the from of A -> B
# that means if a user liked the set A of books, he will like the set B with some score. 
# we considered the lift to be the score
frequent_itemsets = fpgrowth(df, min_support=0.0001, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Beloved (Plume Contemporary Fiction)),(House of Sand and Fog),0.001124,0.003462,0.000121,0.108108,31.227596,0.000118,1.117331
1,(House of Sand and Fog),(Beloved (Plume Contemporary Fiction)),0.003462,0.001124,0.000121,0.035088,31.227596,0.000118,1.035199
2,(Beloved (Plume Contemporary Fiction)),(The Color of Water: A Black Man's Tribute to ...,0.001124,0.001640,0.000121,0.108108,65.924925,0.000120,1.119373
3,(The Color of Water: A Black Man's Tribute to ...,(Beloved (Plume Contemporary Fiction)),0.001640,0.001124,0.000121,0.074074,65.924925,0.000120,1.078786
4,(To Kill a Mockingbird),(Beloved (Plume Contemporary Fiction)),0.004009,0.001124,0.000106,0.026515,23.598127,0.000102,1.026083
...,...,...,...,...,...,...,...,...,...
402289,"(The Doll's House (Sandman, Book 2))","(Season of Mists (Sandman, Book 4), Preludes a...",0.000228,0.000106,0.000106,0.466667,4390.600000,0.000106,1.874801
402290,"(Season of Mists (Sandman, Book 4))","(The Doll's House (Sandman, Book 2), Preludes ...",0.000137,0.000182,0.000106,0.777778,4268.638889,0.000106,4.499180
402291,"(Preludes and Nocturnes (Sandman, Book 1))","(The Doll's House (Sandman, Book 2), Season of...",0.000304,0.000106,0.000106,0.350000,3292.950000,0.000106,1.538298
402292,"(Worlds' End (Sandman, Book 8))","(Brief Lives (Sandman, Book 7))",0.000121,0.000121,0.000106,0.875000,7203.328125,0.000106,7.999028


In [16]:
user_test = tra_list[0:test_size]

In [17]:
# sort rules by lift, sot the recommendations are sorted accordingly
rules.sort_values(by=['lift'], ascending =False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
390750,"(Castle of Wizardry (The Belgariad, Book 4), G...","(Enchanters' End Game (The Belgariad, Book 5),...",0.000106,0.000106,0.000106,1.000000,9408.428571,0.000106,inf
124098,"(The Dark Half, Pet Sematary, Dolores Claiborn...","(Insomnia, Carrie, Desperation, The Dead Zone)",0.000106,0.000106,0.000106,1.000000,9408.428571,0.000106,inf
358617,(A Is for Alibi (Kinsey Millhone Mysteries (Pa...,(B Is for Burglar (Kinsey Millhone Mysteries (...,0.000106,0.000106,0.000106,1.000000,9408.428571,0.000106,inf
124093,"(The Dark Half, Pet Sematary, Dolores Claiborn...","(Insomnia, Four Past Midnight, Carrie, Christine)",0.000106,0.000106,0.000106,1.000000,9408.428571,0.000106,inf
124094,"(The Dark Half, Pet Sematary, Dolores Claiborn...","(Four Past Midnight, Carrie, Christine, The De...",0.000106,0.000106,0.000106,1.000000,9408.428571,0.000106,inf
...,...,...,...,...,...,...,...,...,...
38225,(The Lovely Bones: A Novel),(Harry Potter and the Sorcerer's Stone (Harry ...,0.010507,0.004737,0.000197,0.018786,3.965499,0.000148,1.014318
40484,(The Lovely Bones: A Novel),(Interview with the Vampire),0.010507,0.003416,0.000137,0.013006,3.806879,0.000101,1.009716
40485,(Interview with the Vampire),(The Lovely Bones: A Novel),0.003416,0.010507,0.000137,0.040000,3.806879,0.000101,1.030722
33251,(The Da Vinci Code),(Wild Animus),0.007319,0.004252,0.000106,0.014523,3.415923,0.000075,1.010423


In [18]:
print(rules['antecedents'])

0                    (Beloved (Plume Contemporary Fiction))
1                                   (House of Sand and Fog)
2                    (Beloved (Plume Contemporary Fiction))
3         (The Color of Water: A Black Man's Tribute to ...
4                                   (To Kill a Mockingbird)
                                ...                        
402289                 (The Doll's House (Sandman, Book 2))
402290                  (Season of Mists (Sandman, Book 4))
402291           (Preludes and Nocturnes (Sandman, Book 1))
402292                      (Worlds' End (Sandman, Book 8))
402293                      (Brief Lives (Sandman, Book 7))
Name: antecedents, Length: 402294, dtype: object


In [19]:
conseqs = []
for i, cons in enumerate(rules['consequents']):
    conseqs.append(list(x for x in rules['consequents'][i]))

In [20]:
ants = []
for i, ant in enumerate(rules['antecedents']):
    ants.append(list(x for x in rules['antecedents'][i]))

In [23]:
def recommend(user, n = 10):
    """
    the first parameter is the user which is a list of the movies a user liked
    the second parameter is the number of wanted recommendations
    the output is a list of n movie recommendations sorted by the most probably liked.
    """
    recommendations = set()
    oredered_recommendations = []
    for i, ant in enumerate(ants):
        check =  all(item in ant for item in user)
        if check:
            for movie in conseqs[i]:
                if movie not in recommendations:
                    oredered_recommendations.append(movie)
                    recommendations.add(movie)
                    if len(oredered_recommendations) == n:
                        return oredered_recommendations
    return oredered_recommendations

In [44]:
user = user_test[9][0:-1]
print(recommend(user))

['House of Sand and Fog', 'Beloved (Plume Contemporary Fiction)', "The Color of Water: A Black Man's Tribute to His White Mother", 'To Kill a Mockingbird', "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))", 'The Secret Life of Bees', 'Jurassic Park', 'Airframe', 'A Time to Kill', 'The Chamber']


In [50]:
#calculate the hitrate
hits = 0
for user in user_test:
    movies = user[0:-1]
    recommendations = recommend(movies)
    if user[-1] in recommendations:
        hits += 1

In [51]:
print(hits/len(user_test))

0.007440589173183509
