In [17]:
import pandas as pd
import numpy as np

from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [18]:
# Read the fiction-review table from the CSV (path is relative to the working directory)
FICTION_CSV = 'df_fiction.csv'
df_fiction = pd.read_csv(FICTION_CSV)

# Preview the first rows
df_fiction.head()

Unnamed: 0,Title,authors,categories,Id,User_id,profileName,review/helpfulness,review/score,review/text
0,Tess and the Highlander,['May Mcgoldrick'],['Juvenile Fiction'],006000486X,A2VCGJLKGK2WJJ,Rebecca Herman,17/18,5.0,Tess was washed ashore on the Isle of May duri...
1,Tess and the Highlander,['May Mcgoldrick'],['Juvenile Fiction'],006000486X,AVWFMN5CELC8Q,sarah,6/6,4.0,This is an engaging a count of life of Tess a ...
2,Tess and the Highlander,['May Mcgoldrick'],['Juvenile Fiction'],006000486X,A37XYM3KSEIDLS,"jaina_solo ""jaina_solo""",6/6,5.0,This book was a perfect historical romance for...
3,Tess and the Highlander,['May Mcgoldrick'],['Juvenile Fiction'],006000486X,A1IQK80SXVPAWW,atlantis_girl,4/4,5.0,I loved everything about this book - the setti...
4,Tess and the Highlander,['May Mcgoldrick'],['Juvenile Fiction'],006000486X,A1PI0GN5N7M3GF,Patricia Ahern,5/6,5.0,This book was spellbinding! Filled with advent...


In [19]:
# Keep only the reviewer id and the book title, one row per distinct
# (User_id, Title) combination -- repeated reviews of the same title collapse.
pair_columns = ['User_id', 'Title']
user_book_df = df_fiction.loc[:, pair_columns].drop_duplicates()

# Show the first 20 user/title rows
print(user_book_df.head(20))

           User_id                               Title
0   A2VCGJLKGK2WJJ             Tess and the Highlander
1    AVWFMN5CELC8Q             Tess and the Highlander
2   A37XYM3KSEIDLS             Tess and the Highlander
3   A1IQK80SXVPAWW             Tess and the Highlander
4   A1PI0GN5N7M3GF             Tess and the Highlander
5   A1H72DULSEGDX7             Tess and the Highlander
6   A1PQJ6IPEQB3CD             Tess and the Highlander
7   A3MOFUL6BIMK5A             Tess and the Highlander
8   A1C9QFE7N8YCF4             Tess and the Highlander
9    ANQHSC4OSF4SA             Tess and the Highlander
10  A1BXZTM5IFHODP             Tess and the Highlander
11  A148K8LOPWE6RK             Tess and the Highlander
12  A1NZ6D6SD7B1AP             Tess and the Highlander
13  A3FTN134G68ZJ4             Tess and the Highlander
14  A1QY2X792MF802             Tess and the Highlander
15  A1DJMMJ4OQ2712             Tess and the Highlander
16   ADB0JID2XRFYR  Night World: Daughters Of Darkness
17  A1V0SF

In [20]:
# (rows, columns) of the de-duplicated User_id/Title table
user_book_df.shape

(91013, 2)

## some basic testing

In [21]:
from itertools import combinations
from collections import Counter

In [22]:
# Build a Series indexed by User_id whose values are the lists of titles
# that user reviewed (titles, not ids, identify the books here).
titles_by_user = user_book_df.groupby('User_id')['Title']
user_books = titles_by_user.apply(list)
user_books.head()

User_id
A00290423P2GEY37XWVAW                [The Adventures of Super Diaper Baby]
A00538832OF17R8Q8JHTB              [Harry Potter and The Sorcerer's Stone]
A00540411RKGTDNU543WS                                         [The Hobbit]
A0056746VLX0I5VJCFN6     [If You Give a Mouse a Cookie (Book and Audio ...
A00878773S2MNB00COHKV                                         [The Hobbit]
Name: Title, dtype: object

In [23]:
# Number of distinct titles per user, computed once and reused
titles_per_user = user_books.apply(len)

# (User_id of the most prolific reviewer, how many titles they reviewed)
titles_per_user.idxmax(), titles_per_user.max()

('A14OJS0VWMOSWO', np.int64(233))

In [24]:
# Look up the most prolific reviewer, then list every title they reviewed
top_reviewer_id = user_books.apply(len).idxmax()
user_books.loc[top_reviewer_id]

["The National Review Treasury of Classic Children's Literature",
 'Casey at the Bat',
 'Summer of My German Soldier',
 'The witch of blackbird pond (A Dell yearling book)',
 'The Burgess bird book for children',
 'Seaman: The Dog Who Explored the West With Lewis and Clark (Peachtree Junior Publication)',
 'Mio, My Son',
 'The Moon in My Room (Willowbe Woods Campfire Stories)',
 'The Not Me Monster (Willowbe Woods Campfire Stories, 2)',
 'The legend of Sleepy Hollow,',
 'Pig on a Swing',
 'The Night Before Christmas, The (Wee Books for Wee Folk)',
 'Snack Attack',
 'The Golden Egg Book (Big Little Golden Book)',
 'Four Things My Geeky-Jock-of-a-Best Friend Must Do in Europe',
 'Boswell wide-awake',
 'How I Spent My Summer Holidays',
 'Marky and the Seagull',
 'Peter Raven: Under Fire (SIGNED)',
 'Cuba 15',
 "Saffy's Angel (Costa Children's Book Award (Awards))",
 'The Shy Stegosaurus of Cricket Creek',
 'Revenge on Rairarubia',
 'Owen & Mzee: The True Story of a Remarkable Friendship',

In [25]:
# Enumerate every unordered pair of titles reviewed by the same user.
# Sorting each user's titles first gives a pair one canonical order, so (A, B)
# and (B, A) map to the same key. Users with a single title yield no pairs,
# because combinations() of fewer than two items is empty.
book_pairs = [
    pair
    for titles in user_books
    for pair in combinations(sorted(titles), 2)
]

# Tally how many users share each pair
pair_counts = Counter(book_pairs)

# One row per distinct pair, most frequently co-reviewed first.
# Titles are compared as raw strings, so differently worded editions of the
# same book show up as separate items.
book_pair_df = (
    pd.DataFrame(pair_counts.items(), columns=['book_pair', 'count'])
    .sort_values(by='count', ascending=False)
)

# Top 10 pairs of titles reviewed by the same users
book_pair_df.head(10)

Unnamed: 0,book_pair,count
35,"(Harry Potter and The Sorcerer's Stone, Harry ...",610
26,"(The Monster At The End Of This Book, The mons...",359
14,"(A visit from Santa Claus, The Night Before Ch...",327
0,"(The Velveteen Rabbit, Or, How Toys Become Rea...",240
3,"(The witch of Blackbird Pond (A Dell book), Th...",232
1,"(The Witch of Blackbird Pond, The witch of Bla...",230
2,"(The Witch of Blackbird Pond, The witch of bla...",230
823,"(Blueberries for Sal, Blueberries for Sal (Chi...",114
182,"(Mossflower, Mossflower (Redwall, Book 2))",106
15,"(Little House on the Prairie, Little House on ...",99


In [26]:
# (rows, columns): one row per distinct title pair counted in pair_counts
book_pair_df.shape

(134564, 2)

## Apply Association Rule Mining (Apriori or FP-Growth)

Convert Data into a Transaction Format
The Apriori algorithm requires data in a basket format, where each row represents a user’s basket of reviewed books.

In [27]:
# %pip install mlxtend


In [28]:
# One transaction per user: the list of titles that user reviewed
titles_per_user_id = user_book_df.groupby("User_id")["Title"].apply(list)
transactions = titles_per_user_id.tolist()

# Peek at the first five transactions
transactions[:5]

[['The Adventures of Super Diaper Baby'],
 ["Harry Potter and The Sorcerer's Stone"],
 ['The Hobbit'],
 ['If You Give a Mouse a Cookie (Book and Audio CD) (Paperback)'],
 ['The Hobbit']]

In [29]:
# One-hot encode the transactions: learn the set of distinct titles and build
# the boolean user x title array in a single fit_transform call.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)

# Wrap the array in a DataFrame whose columns are the learned titles
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Preview the boolean matrix
df_encoded.head()

Unnamed: 0,"""Could Be Worse!"" (Reading Rainbow Library)","""I'm Not Cute!""","""Miss Lou.""","""National Velvet""","""The Best Mistake Ever and Other Stories","""Thy servant a dog""",'Morning Has Broken',.Hack//Legend of the Twilight Vol. 3,1 2 3,"1, 2, I Love You",...,taran wanderer,the day the teacher went bananas,the fabulous flight,the golden key,the lion's paw,the little riders,the secret hide-out,tom swift in the city of gold,toning the sweep,ttyl
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [30]:
# (rows, columns) of the encoded matrix: one row per transaction, one column per distinct title
df_encoded.shape

(72391, 5737)

Each row is a user and each column is a book title; True = the user reviewed that title, False = not reviewed.

## Apply the Frequent-Itemset Algorithm (FP-Growth)

Now, let’s find frequent book sets (books that users tend to review together).

With min_support=0.0001 (the value used below), only itemsets (single books as well as groups of books) occurring in at least 0.01% of transactions, i.e. 8 or more of the 72,391 users, are considered.

In [32]:
from mlxtend.frequent_patterns import fpgrowth

In [33]:
# Mine frequent itemsets with FP-Growth. min_support is a fraction of all
# transactions, so 0.0001 keeps itemsets present in at least 0.01% of them.
# use_colnames=True reports itemsets by book title instead of column index.
frequent_itemsets = fpgrowth(
    df_encoded,
    use_colnames=True,
    min_support=0.0001,
)

In [34]:
# The itemsets were mined above with FP-Growth; the Apriori form of that step was
# apriori(df_encoded, min_support=0.0001, use_colnames=True).

# Order the itemsets from most to least frequent (highest support first)
frequent_itemsets = frequent_itemsets.sort_values("support", ascending=False)

# Print the 10 most frequent itemsets
top_itemsets = frequent_itemsets.head(10)
print(top_itemsets)

      support                                           itemsets
1    0.050600            (Harry Potter and The Sorcerer's Stone)
2    0.049398                                       (The Hobbit)
4    0.025901                                        (The Giver)
14   0.021798                                            (Night)
115  0.021315          (Harry Potter and the Chamber of Secrets)
146  0.017571                     (Eldest (Inheritance, Book 2))
16   0.017046                                            (Holes)
98   0.011396                                   (Goodnight Moon)
42   0.011314  (Me Talk Pretty One Day (Turtleback School & L...
7    0.009766                                (The Secret Garden)


Interpretation:
- 5.1% of users reviewed Harry Potter and The Sorcerer's Stone
- 4.9% reviewed The Hobbit
- 2.6% reviewed The Giver
- 2.2% reviewed Night

## Generate Association Rules

To make meaningful recommendations, let’s generate association rules from the frequent itemsets.

In [35]:
# Derive association rules from the frequent itemsets, keeping rules whose
# confidence is at least 0.1, ordered from highest to lowest confidence.
rules = (
    association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)
    .sort_values(by="confidence", ascending=False)
)

# Show the 10 highest-confidence rules with their key metrics
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules[rule_columns].head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
8,(The witch of Blackbird Pond (A Dell book)),(The witch of blackbird pond (A Dell yearling ...,0.003205,1.0,312.030172
9,(The witch of blackbird pond (A Dell yearling ...,(The witch of Blackbird Pond (A Dell book)),0.003205,1.0,312.030172
11,(The Witch of Blackbird Pond),(The witch of blackbird pond (A Dell yearling ...,0.003177,1.0,312.030172
13,(The Witch of Blackbird Pond),(The witch of Blackbird Pond (A Dell book)),0.003177,1.0,312.030172
15,"(The witch of Blackbird Pond (A Dell book), Th...",(The witch of blackbird pond (A Dell yearling ...,0.003177,1.0,312.030172
3,(The Monster At The End Of This Book),(The monster at the end of this book (A little...,0.004959,1.0,199.424242
4,(A visit from Santa Claus),"(The Night Before Christmas, The (Wee Books fo...",0.004517,1.0,220.704268
252,(The monster at the end of this book (A little...,(The Monster At The End Of This Book),0.000124,1.0,201.64624
253,"(Where The Wild Things Are, The Monster At The...",(The monster at the end of this book (A little...,0.000124,1.0,199.424242
257,"(By the Shores of Silver Lake, Little House on...",(Little House on the Prairie),0.000124,1.0,731.222222
