In [1]:
import pandas as pd
import pyfpgrowth as fp

In [2]:
# Load both input tables: the item-name -> item-id lookup first, then the
# per-user purchase history.
items, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("item_to_id.csv", "purchase_history.csv")
)

In [3]:
# Preview the first five rows of the item lookup table.
items.head(n=5)

Unnamed: 0,Item_name,Item_id
0,coffee,43
1,tea,23
2,juice,38
3,soda,9
4,sandwich loaves,39


In [5]:
# Count the distinct item names (a missing name, if any, counts as one value,
# exactly as it would in `unique()`).
items['Item_name'].nunique(dropna=False)

48

In [4]:
# Preview the first five rows of the raw purchase history.
transactions.head(n=5)

Unnamed: 0,user_id,id
0,222087,2726
1,1343649,64717
2,404134,1812232227433820351
3,1110200,923220264737
4,224107,"31,18,5,13,1,21,48,16,26,2,44,32,20,37,42,35,4..."


In [18]:
# Split each raw comma-separated purchase string into a list of item-id
# strings.
# The split is applied element-wise and only to values that are still `str`,
# so re-running this cell leaves already-split lists untouched. (Calling
# `.str.split` a second time on the list column would silently replace every
# row with NaN.)
transactions['id'] = transactions['id'].map(
    lambda raw: raw.split(',') if isinstance(raw, str) else raw
)

In [19]:
# Preview the first five rows again, now that `id` holds a list per order.
transactions.head(n=5)

Unnamed: 0,user_id,id
0,222087,"[27, 26]"
1,1343649,"[6, 47, 17]"
2,404134,"[18, 12, 23, 22, 27, 43, 38, 20, 35, 1]"
3,1110200,"[9, 23, 2, 20, 26, 47, 37]"
4,224107,"[31, 18, 5, 13, 1, 21, 48, 16, 26, 2, 44, 32, ..."


In [20]:
# How many transactions are there in total?
transactions.shape[0]

39474

In [35]:
# Summary statistics of the number of transactions recorded per customer;
# the mean is the average number of visits per customer.
transactions['user_id'].value_counts().describe()

count    24885.000000
mean         1.586257
std          0.822156
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          7.000000
Name: user_id, dtype: float64

In [22]:
# The ten customers with the most recorded transactions.
transactions['user_id'].value_counts().head(10)

696000     7
31625      7
653800     7
884172     7
315516     6
453019     6
38872      6
779061     6
1374100    6
813270     6
Name: user_id, dtype: int64

In [34]:
# Basket size: count the item ids in each order, then summarise the counts.
transactions['num_id'] = transactions['id'].map(len)
transactions['num_id'].describe()

count    39474.000000
mean         8.106475
std          4.058790
min          1.000000
25%          5.000000
50%          8.000000
75%         11.000000
max         27.000000
Name: num_id, dtype: float64

In [37]:
# Collect the per-transaction item lists into a plain Python list of lists.
ids = list(transactions['id'])

In [40]:
# Mine frequent itemsets from the baskets using an absolute support threshold
# of 2 transactions.
# NOTE(review): confirm whether the threshold is inclusive (>= 2) or strict.
patterns = fp.find_frequent_patterns(transactions=ids, support_threshold=2)

In [58]:
# Flatten the mined patterns into (itemset, support count, itemset size) rows.
pat = [
    (itemset, support, len(itemset))
    for itemset, support in patterns.items()
]

In [59]:
# Turn the flattened rows into a DataFrame, one row per mined pattern.
pat_df = pd.DataFrame(
    data=pat,
    columns=['pattern', 'count', 'num_items'],
)

In [65]:
# Order the patterns from most to least frequent (original row labels are kept).
pat_df = pat_df.sort_values(by='count', ascending=False)

In [82]:
# Show the 30 most frequent patterns.
pat_df.head(n=30)

Unnamed: 0,pattern,count,num_items
30385557,"(2,)",14473,1
49091876,"(1,)",9119,1
10405955,"(2, 42)",4501,2
19254272,"(2, 44)",4473,2
32442596,"(10, 2)",4450,2
16193105,"(2, 45)",4419,2
53480261,"(1, 2)",4106,2
56441726,"(2, 32)",3597,2
57971788,"(2, 20)",3562,2
53161452,"(2, 38)",3556,2


In [76]:
# Which items make up the pattern stored at row label 10405955?
# Pattern members come from splitting the purchase strings, so they are id
# strings, while `Item_id` is presumably numeric after `read_csv`. Cast
# explicitly instead of relying on `isin` to coerce str -> int, which is
# pandas-version dependent.
# NOTE(review): the row label is the pattern's position in the iteration order
# of `patterns`, so it is only valid for this exact mining run.
items[
    items['Item_id'].isin(
        [int(item_id) for item_id in pat_df.loc[10405955, 'pattern']]
    )
]

Unnamed: 0,Item_name,Item_id
34,cucumbers,42
35,lettuce,2


In [77]:
# Which items make up the pattern stored at row label 19254272?
# Pattern members come from splitting the purchase strings, so they are id
# strings, while `Item_id` is presumably numeric after `read_csv`. Cast
# explicitly instead of relying on `isin` to coerce str -> int, which is
# pandas-version dependent.
# NOTE(review): the row label is the pattern's position in the iteration order
# of `patterns`, so it is only valid for this exact mining run.
items[
    items['Item_id'].isin(
        [int(item_id) for item_id in pat_df.loc[19254272, 'pattern']]
    )
]

Unnamed: 0,Item_name,Item_id
31,broccoli,44
35,lettuce,2


In [84]:
# Which items make up the pattern stored at row label 24631807?
# Pattern members come from splitting the purchase strings, so they are id
# strings, while `Item_id` is presumably numeric after `read_csv`. Cast
# explicitly instead of relying on `isin` to coerce str -> int, which is
# pandas-version dependent.
# NOTE(review): the row label is the pattern's position in the iteration order
# of `patterns`, so it is only valid for this exact mining run.
items[
    items['Item_id'].isin(
        [int(item_id) for item_id in pat_df.loc[24631807, 'pattern']]
    )
]

Unnamed: 0,Item_name,Item_id
28,cherries,25
29,grapefruit,20


In [None]:
# Derive association rules from the mined patterns, using a confidence
# threshold of 0.7 to keep only strongly associated itemsets.
rules = fp.generate_association_rules(
    patterns=patterns,
    confidence_threshold=0.7,
)