In [8]:
# libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori as ap, association_rules as ap_rl
import numpy as np
from icecream import ic
import os
from joblib import Parallel,delayed

In [210]:
# importing data

# NOTE: replace the file path to the data for either goal 1 or goal 2,
# i.e. the data has to focus on either the first few steps of the journey or the last few steps

# import
#export = pd.read_csv("C:/Users/lavil/source/repos/LukVill/Misc Data/export_no_dup.csv")

## Change the file path in the next cell to point to your data; don't forget to remove `nrows` (if present) when you want the whole dataset

In [2]:
# source CSV with the raw event rows (edit this path for your own machine)
export_csv_path = "C:/Users/lavil/source/repos/LukVill/Misc Data/export_no_dup.csv"
export = pd.read_csv(export_csv_path)

In [4]:
# START PREP DATA FOR ALGO

# Build the per-(customer, account) event frequencies: one row per
# customer_id / account_id / ed_id combination plus how often it occurred.

event_keys = ["customer_id", "account_id", "ed_id"]
export = (
    # keep only the key columns
    export[event_keys]
    # attach the size of each key group to every row; doing this with
    # assign() on the selection avoids writing a new column into a
    # column-subset of the original frame (SettingWithCopyWarning)
    .assign(count=lambda events: events.groupby(by=event_keys)["ed_id"].transform("count"))
    # every row of a group now carries the same count, so drop the repeats
    .drop_duplicates()
)

### Check on dataset with counts:

In [213]:
# quick look at the first five rows of the count table
export.head(n=5)

Unnamed: 0,customer_id,account_id,ed_id,count
0,-784961211,1773350293,12,1
1,-784961211,1773350293,19,3
2,-784961211,1773350293,3,1
3,-784961211,1773350293,2,1
6,15849251,383997507,4,66


In [5]:
# reshape to wide form: one row per customer/account pair, one column per
# ed_id, cell value = count; combinations that never occurred become 0
pair_index = ["customer_id", "account_id"]
counts_wide = export.pivot(index=pair_index, columns="ed_id", values="count")
export = counts_wide.fillna(0)

### Check pivoted data:

In [215]:
# quick look at the first five rows of the pivoted table
export.head(n=5)

Unnamed: 0_level_0,ed_id,1,2,3,4,5,6,7,8,11,12,13,15,18,19,24,26,27,28,29
customer_id,account_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
-1697747935,-1016455199,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0
-1554036291,-1199609206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-1539774003,53327807,4.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0
-1522106248,-693958153,3.0,0.0,1.0,51.0,0.0,0.0,0.0,0.0,12.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
-1257367741,1076891306,1.0,1.0,1.0,4.0,6.0,3.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,17.0,0.0,0.0,1.0,1.0,1.0


## If you want the frequency data, run the following code

In [216]:
# CHANGE THIS FOR YOURSELF: directory the frequency CSV is written to
filepath = "C:/Users/lavil/source/repos/LukVill/Misc Data"

# write the current `export` table as export_freq.csv inside that directory
freq_csv_path = f"{filepath}/export_freq.csv"
export.to_csv(freq_csv_path)

In [6]:
# binarise the table values:
# any value >= 1 becomes 1, everything else becomes 0

# scalar helper that maps one cell value to its 0/1 flag
def cond(x):
    """Return 1 when ``x`` is at least 1, otherwise 0."""
    return 1 if x >= 1 else 0

# element-wise version of ``cond`` that can be applied to a whole array/column
v_cond = np.vectorize(cond)

# Binarise the whole frame in one vectorised comparison instead of calling a
# Python-level helper once per cell: values >= 1 become 1, everything else 0.
export = export.ge(1).astype(int)


### Check on data with edited values

In [218]:
# quick look at the first five rows of the 0/1 coded table
export.head(n=5)

Unnamed: 0_level_0,ed_id,1,2,3,4,5,6,7,8,11,12,13,15,18,19,24,26,27,28,29
customer_id,account_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
-1697747935,-1016455199,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
-1554036291,-1199609206,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
-1539774003,53327807,1,1,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
-1522106248,-693958153,1,0,1,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0
-1257367741,1076891306,1,1,1,1,1,1,0,0,1,1,0,0,1,1,0,0,1,1,1


## RUN THE NEXT CODE TO SAVE THE DATASET

In [9]:
# SAVE OUTPUT

# FOLDER the coded table is written to (change this for your own machine)
save_dir = "C:/Users/lavil/source/repos/LukVill/Misc Data"

# write the current `export` table as export_coded.csv inside that folder
coded_csv_path = f"{save_dir}/export_coded.csv"
export.to_csv(coded_csv_path)

KeyboardInterrupt: 

## Edit the following parameters to tune the apriori algorithm!

In [219]:
# NOTE TODO: PARAMETERS OF APRIORI ALGORITHM

# support - minimum share of rows in which an itemset has to appear
# (passed to `ap` as `min_support`)
supp_thresh = 0.05

# confidence - no confidence threshold is defined; rules are selected on lift only


# lift - minimum lift a rule needs in order to be kept
# (passed to `ap_rl` as `min_threshold` together with metric = "lift")
# NOTE(review): a lift of 1 usually means "no association", so 0.8 presumably keeps
# most candidate rules — confirm this is intended
lift_thresh = 0.8

# FILTER THRESHOLD FOR # OF ITEMS IN SET (threshold INCLUDES the given value)
n_item_thresh = 3

# column(s) the rule tables are sorted by (descending) before they are displayed
filter_cols = ["lift"]

In [220]:
# APRIORI ALGORITHM

# mlxtend expects a boolean one-hot table; casting the 0/1 codes to bool avoids
# its non-bool deprecation warning and the slower validation path.
# (assumes `export` already holds only 0/1 codes at this point)
ap_set = ap(export.astype(bool), min_support=supp_thresh, use_colnames=True)
rules = ap_rl(ap_set, metric="lift", min_threshold=lift_thresh)

# vectorize length func; `otypes` is given explicitly because np.vectorize
# otherwise raises ValueError on a size-0 input, i.e. when no rule passes the
# thresholds and `rules` is empty
v_len = np.vectorize(len, otypes=[int])

# boolean masks used for rule filtering:
#   cond_1 -> antecedent (left) itemset has at least `n_item_thresh` items
#   cond_2 -> consequent (right) itemset has at least `n_item_thresh` items
cond_1 = v_len(rules.antecedents.values) >= n_item_thresh
cond_2 = v_len(rules.consequents.values) >= n_item_thresh



KeyboardInterrupt: 

## Left itemset (antecedent) results:

In [None]:
# keep the rules whose antecedent is large enough, then show the ten
# highest-ranked ones according to `filter_cols`
left_rules = rules.loc[cond_1]
left_rules.sort_values(by=filter_cols, ascending=False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1026351,"(8, 27, 4, 5)","(1, 3, 7, 19, 29)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1838796,"(3, 5, 7, 12, 27, 28)","(1, 6, 8, 11, 29)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
889106,"(8, 11, 5)","(4, 7, 12, 19, 28)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1838794,"(3, 5, 7, 11, 27, 29)","(1, 6, 8, 12, 28)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1476057,"(1, 3, 4, 7, 11, 12, 19, 29)","(8, 5)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
889107,"(8, 12, 5)","(4, 7, 11, 19, 28)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
889108,"(8, 19, 5)","(4, 7, 11, 12, 28)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1838793,"(3, 5, 7, 11, 27, 28)","(1, 6, 8, 12, 29)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1551300,"(8, 1, 5, 6)","(3, 7, 12, 19, 27, 29)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1093507,"(1, 6, 7, 11, 28)","(8, 29, 3, 5)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0


## Right itemset (consequent) results:

In [None]:
# keep the rules whose consequent is large enough, then show the ten
# highest-ranked ones according to `filter_cols`
right_rules = rules.loc[cond_2]
right_rules.sort_values(by=filter_cols, ascending=False).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1652369,"(4, 6, 8, 11, 19)","(3, 5, 7, 12, 27)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1231315,"(8, 1, 27, 12)","(6, 7, 11, 28, 29)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1231309,"(1, 27, 29, 7)","(6, 8, 11, 12, 28)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1231310,"(1, 28, 29, 7)","(6, 8, 11, 12, 27)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1231311,"(8, 1, 11, 12)","(6, 7, 27, 28, 29)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1231312,"(8, 1, 11, 27)","(6, 7, 12, 28, 29)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1231313,"(8, 1, 11, 28)","(6, 7, 12, 27, 29)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1231314,"(8, 1, 11, 29)","(6, 7, 12, 27, 28)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1231316,"(8, 1, 12, 28)","(6, 7, 11, 27, 29)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
1231340,"(27, 12, 6, 7)","(1, 8, 11, 28, 29)",0.085714,0.085714,0.085714,1.0,11.666667,0.078367,inf,1.0
