In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')

# IMPORTANT: apyori and mlxtend libraries both have methods which are called apriori. 
# Be sure to import only one of the libraries in order to use apriori.

from apyori import apriori
# from mlxtend.frequent_patterns import apriori
# from mlxtend.frequent_patterns import association_rules

# Display numbers not in scientific notation:
pd.options.display.float_format = '{:.6f}'.format

# Load data

In [40]:
df_trans = pd.read_csv('../data/transactions_train.csv', dtype={'article_id':'string'})
df_trans.tail()


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1
31788323,2020-09-22,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,898573003,0.033881,2


In [3]:
# Train test split:
df_trans_train = df_trans.query('t_dat < "2020-09-16"').copy()
df_trans_train.tail()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
31548008,2020-09-15,ffe41634ff990908faacbb465063e027e7c39499f8dfc1...,850917001,0.025407,1
31548009,2020-09-15,ffef8aec5cf011fa1393b40337a5993ce0b7b81af6b322...,853316001,0.008458,1
31548010,2020-09-15,ffef8aec5cf011fa1393b40337a5993ce0b7b81af6b322...,296366006,0.000847,1
31548011,2020-09-15,ffef8aec5cf011fa1393b40337a5993ce0b7b81af6b322...,789769001,0.013542,1
31548012,2020-09-15,fff5bd112051feb2367276df143f79bc69126814c73e21...,728156001,0.043203,1


# 1.0 Model on wardrobes without 'None'

https://www.analyticsvidhya.com/blog/2021/10/a-comprehensive-guide-on-market-basket-analysis/

Create association rules based on wardrobes (not on baskets). 

Use apyori library.

In [5]:
# Generate wardrobe

df_trans_red = df_trans_train.drop(columns=['t_dat', 'price', 'sales_channel_id']).copy()
df_wardrobe = df_trans_red.groupby('customer_id')['article_id'].aggregate(lambda x: list(x)).reset_index().copy()


In [7]:
# Clear memory
del [[df_trans,df_trans_red, df_trans_train]]

NameError: name 'df_trans' is not defined

In [8]:
df_wardrobe.head()

Unnamed: 0,customer_id,article_id
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[0625548001, 0176209023, 0627759010, 069713800..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[0583558001, 0639677008, 0640244003, 052126900..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[0663713001, 0541518023, 0663713001, 057802000..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[0742079001, 0732413001]"
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[0634249005, 0677049001, 0698286003, 070770400..."


In [9]:
# Add 'number of articles' column (needed to generate smaller dataset):

df_wardrobe['no_articles'] = df_wardrobe.article_id.apply(lambda x: len(x))


In [10]:
df_wardrobe.sort_values('no_articles', ascending=False)

Unnamed: 0,customer_id,article_id,no_articles
1007560,be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee9...,"[0658506001, 0662980002, 0667709001, 068568700...",1895
958450,b4db5e5259234574edfff958e170fe3a5e13b6f146752c...,"[0673186001, 0717205001, 0669713004, 051413400...",1427
390238,49beaacac0c7801c2ce2d189efe525fe80b5d37e46ed05...,"[0568597012, 0588689005, 0573716033, 065578400...",1356
881514,a65f77281a528bf5c1e9f270141d601d116e1df33bf9df...,"[0543729003, 0610016001, 0639199001, 057365000...",1355
1086478,cd04ec2726dd58a8c753e0d6423e57716fd9ebcf2f14ed...,"[0671783004, 0711547001, 0631270001, 071187100...",1223
...,...,...,...
828773,9c7759773428ecaa1a50a5ecd75c87a83be8e4f997b9a6...,[0708274001],1
828771,9c773f1af4c1c25462987ba32feb295a6ff177a8785648...,[0685417006],1
828766,9c76e78d5ab678ef3fdf04bb7618df5a8c02a84c926077...,[0822022003],1
828765,9c76e2837e1e65e279adde5d754e123c811a2253f6fe14...,[0825811004],1


In [11]:
# Select only wardrobes with 2 - 39 articles:

df_wardrobe_small = df_wardrobe[(df_wardrobe['no_articles'] > 1) & (df_wardrobe['no_articles'] < 21)]
df_wardrobe_small.shape

(805100, 3)

In [12]:
df_wardrobe_small.head()

Unnamed: 0,customer_id,article_id,no_articles
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[0663713001, 0541518023, 0663713001, 057802000...",18
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[0742079001, 0732413001]",2
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[0634249005, 0677049001, 0698286003, 070770400...",13
5,000064249685c11552da43ef22a5030f35a147f723d5b0...,"[0738133005, 0680265002, 0740962001]",3
6,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,"[0735843004, 0726925001, 0715624008, 078338800...",6


In [13]:
# Create list of lists with all wardrobes (wardrobesizes 2-20):
l_new = df_wardrobe_small['article_id'].to_list()

In [14]:
# Clear memory
del [[df_wardrobe, df_wardrobe_small]]

In [17]:
gc.collect()

1047

In [18]:
l_new

[['0663713001',
  '0541518023',
  '0663713001',
  '0578020002',
  '0723529001',
  '0351484002',
  '0351484002',
  '0727808001',
  '0727808007',
  '0858883002',
  '0851400006',
  '0750424014',
  '0750424014',
  '0870304002',
  '0870304002',
  '0852643001',
  '0852643003',
  '0794321007'],
 ['0742079001', '0732413001'],
 ['0634249005',
  '0677049001',
  '0698286003',
  '0707704003',
  '0399061015',
  '0399061015',
  '0589440005',
  '0827971001',
  '0818320001',
  '0896152002',
  '0730683050',
  '0927530004',
  '0791587015'],
 ['0738133005', '0680265002', '0740962001'],
 ['0735843004',
  '0726925001',
  '0715624008',
  '0783388001',
  '0719530003',
  '0448509014'],
 ['0819423001', '0850614001'],
 ['0673677001', '0551080020', '0648414023', '0673677004'],
 ['0770321002', '0760084003', '0760084013'],
 ['0308154005',
  '0624690002',
  '0548110004',
  '0643215002',
  '0400456001',
  '0453239043',
  '0644007001',
  '0565200013',
  '0565200029',
  '0636475003',
  '0624121005',
  '0565200030',
  

Resource:
https://stackoverflow.com/questions/35491274/split-a-pandas-column-of-lists-into-multiple-columns


In [None]:
# NOT NEEDED ANYMORE
# # Generate columns where every article_id is in one column:
# df_only_articles = pd.DataFrame(df_wardrobe_small['article_id'].to_list(), index=df_wardrobe_small.index)

# # Join dataframe with only articles to wardrobe on index:

# df_wardrobe_small = df_wardrobe_small.join(df_only_articles)

# # Drop not needed columns:

# df_wardrobe_small_red = df_wardrobe_small.drop(columns=['customer_id', 'article_id', 'no_articles'])
# df_wardrobe_small_red


In [None]:
# NOT NEEDED ANYMORE
# Converting dataframe into list of lists:

# l=[]

# for i in range (0, len(df_wardrobe_small_red)):
#     l.append([str(df_wardrobe_small_red.values[i,j]) for j in range(0,df_wardrobe_small_red.shape[1])])

# l

In [None]:
# NOT NEEDED ANYMORE
# # Delete 'None'-strings from all lists:

# l_new = []
# for i in l:
#     l_new.append([ ele for ele in i if ele != 'None' ])

In [19]:
# Applying apriori algorithm

association_rules = apriori(l_new, min_support=0.0001, min_confidence=0.2, min_lift=1, min_length=2)
association_results = list(association_rules)

In [28]:
# Print association rules:

for i in range(0, len(association_results)):
    print(f"Rule = {association_results[i][0]} , len of rule= {len(association_results[i][0])}")

Rule = frozenset({'0108775015', '0108775044'}) , len of rule= 2
Rule = frozenset({'0700910001', '0111565001'}) , len of rule= 2
Rule = frozenset({'0158340001', '0111586001'}) , len of rule= 2
Rule = frozenset({'0234432001', '0111586001'}) , len of rule= 2
Rule = frozenset({'0417951005', '0111586001'}) , len of rule= 2
Rule = frozenset({'0111593001', '0234432001'}) , len of rule= 2
Rule = frozenset({'0111593001', '0240561001'}) , len of rule= 2
Rule = frozenset({'0120129001', '0120129014'}) , len of rule= 2
Rule = frozenset({'0156231001', '0156231002'}) , len of rule= 2
Rule = frozenset({'0160442007', '0160442010'}) , len of rule= 2
Rule = frozenset({'0160442007', '0160442043'}) , len of rule= 2
Rule = frozenset({'0160442010', '0160442043'}) , len of rule= 2
Rule = frozenset({'0324946001', '0179208001'}) , len of rule= 2
Rule = frozenset({'0189616006', '0189616008'}) , len of rule= 2
Rule = frozenset({'0201219003', '0201219012'}) , len of rule= 2
Rule = frozenset({'0201219011', '0201219

In [21]:
len(association_results)

779

In [39]:
association_results[500]

RelationRecord(items=frozenset({'0742925003', '0742924003'}), support=0.00033784623028195257, ordered_statistics=[OrderedStatistic(items_base=frozenset({'0742924003'}), items_add=frozenset({'0742925003'}), confidence=0.5811965811965812, lift=1021.6623745008025), OrderedStatistic(items_base=frozenset({'0742925003'}), items_add=frozenset({'0742924003'}), confidence=0.5938864628820961, lift=1021.6623745008027)])

In [22]:
association_results_df_plain = pd.DataFrame(association_results)
association_results_df_plain.to_csv('../data/20220504_plain_association_results_wardrobesize_2-20_wo_none.csv')

In [23]:
association_results_dict = {'antecedants': [], 'consequents':[], 'support': [], 'confidence': [], 'lift':[]}

for item in association_results:
    # first index of the inner list
    # Contains base item and add item
    pair = item[0]
    items = [x for x in pair]
    print("Rule: " + items[0] + " -> " + items[1])
    association_results_dict['antecedants'].append(items[0])
    association_results_dict['consequents'].append(items[1])
    # second index of the inner list
    print("Support: " + str(item[1]))
    association_results_dict['support'].append(item[1])
    # third index of the list located at 0th position
    # of the third index of the inner list
    print("Confidence: " + str(item[2][0][2]))
    association_results_dict['confidence'].append(item[2][0][2])
    print("Lift: " + str(item[2][0][3]))
    association_results_dict['lift'].append(item[2][0][3])
    print("-----------------------------------------------------")

Rule: 0108775015 -> 0108775044
Support: 0.0004372127686001739
Confidence: 0.23529411764705882
Lift: 166.31720291277176
-----------------------------------------------------
Rule: 0700910001 -> 0111565001
Support: 0.00013166066327164328
Confidence: 0.3897058823529412
Lift: 367.8220467553962
-----------------------------------------------------
Rule: 0158340001 -> 0111586001
Support: 0.00040119239845981866
Confidence: 0.20678617157490398
Lift: 95.51551734650326
-----------------------------------------------------
Rule: 0234432001 -> 0111586001
Support: 0.0001341448267295988
Confidence: 0.23736263736263735
Lift: 122.34357192103671
-----------------------------------------------------
Rule: 0417951005 -> 0111586001
Support: 0.00013290274500062104
Confidence: 0.22384937238493724
Lift: 115.378444114669
-----------------------------------------------------
Rule: 0111593001 -> 0234432001
Support: 0.00012048192771084337
Confidence: 0.21318681318681315
Lift: 109.32274095331418
-----------------

In [24]:
# Store association rules dict in dataframe:
association_results_df_table = pd.DataFrame.from_dict(association_results_dict)
association_results_df_table.head()

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,108775015,108775044,0.000437,0.235294,166.317203
1,700910001,111565001,0.000132,0.389706,367.822047
2,158340001,111586001,0.000401,0.206786,95.515517
3,234432001,111586001,0.000134,0.237363,122.343572
4,417951005,111586001,0.000133,0.223849,115.378444


In [25]:
association_results_df_table.shape

(779, 5)

In [29]:
# Store association results in csv.
# Change name before executing cell:

association_results_df_table.to_csv('../data/20220504_table_association_results_wardrobesize_2-20_wo_none.csv')