# Import Libraries 

In [1]:
import numpy as np # linear algebra
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import seaborn as sns

# EDA

In [6]:
# Read the learning-portfolio CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv(filepath_or_buffer='Learning portfolio.csv')

In [7]:
# List the column labels as loaded; note that several (e.g. 'Learner ', 'GitHub ')
# carry a trailing space, so later cells must match them exactly.
dataset.columns

Index(['Learner ', 'Azure', 'Automate Anywhere', 'AWS', 'Google Cloud',
       'GitHub ', 'Git', 'Statistics', 'Cloud Overview', 'Cloud Application',
       'How to Learn', 'Python Begineer', 'Python Medium ', 'Python Advanced',
       'Hacker Mindset', 'Equities Trading', 'Equities Risk ',
       'Equities Pricing', 'Currency Trading', 'Currency Risk',
       'Currency Pricing', 'Options', 'Futures', 'Portfolio Management',
       'Listed Derivatives', 'Electronic Trading Overview',
       'Electronic Trading History', 'FIX understodd', 'API Overview',
       'API Build', 'API Use ', 'Javascript Begineer', 'Javascript Medium',
       'Javascript Advanced', 'Regulatory Risk', 'MiFID II ',
       'Volatility Index', 'Thematic Research', 'Hedge Funds: Fundamental',
       'Hedge Funds: Credit', 'Wealth Management Principles',
       'How banks make money', 'Ruthless Efficiency',
       'Generational Wealth Management', 'Digitizing Processes ',
       'Stakeholder Management ', 'Time Mana

In [8]:
# Summarise the raw frame: row count, per-column non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 54 columns):
Learner                           54 non-null object
Azure                             54 non-null bool
Automate Anywhere                 54 non-null bool
AWS                               54 non-null bool
Google Cloud                      54 non-null bool
GitHub                            54 non-null bool
Git                               54 non-null bool
Statistics                        54 non-null bool
Cloud Overview                    54 non-null bool
Cloud Application                 54 non-null bool
How to Learn                      54 non-null bool
Python Begineer                   54 non-null bool
Python Medium                     54 non-null bool
Python Advanced                   54 non-null bool
Hacker Mindset                    54 non-null bool
Equities Trading                  54 non-null bool
Equities Risk                     54 non-null bool
Equities Pricing            

# Data Transformation 

In [9]:
# Remove the learner identifier column (an object column; the name keeps its
# trailing space to match the CSV header) before mining itemsets.
# Rebinding instead of using inplace=True, together with errors='ignore',
# makes this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
dataset = dataset.drop(columns=['Learner '], errors='ignore')

In [10]:
# Re-inspect the frame after the transformation to confirm the remaining columns and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 53 columns):
Azure                             54 non-null bool
Automate Anywhere                 54 non-null bool
AWS                               54 non-null bool
Google Cloud                      54 non-null bool
GitHub                            54 non-null bool
Git                               54 non-null bool
Statistics                        54 non-null bool
Cloud Overview                    54 non-null bool
Cloud Application                 54 non-null bool
How to Learn                      54 non-null bool
Python Begineer                   54 non-null bool
Python Medium                     54 non-null bool
Python Advanced                   54 non-null bool
Hacker Mindset                    54 non-null bool
Equities Trading                  54 non-null bool
Equities Risk                     54 non-null bool
Equities Pricing                  54 non-null bool
Currency Trading              

# Create Frequent item list

In [11]:
# Mine every itemset whose support is at least 0.2, labelled by column name.
frequent_itemsets = apriori(dataset, min_support=0.2, use_colnames=True)
# Attach each itemset's size as a 'length' column, then keep only the pairs.
frequent_itemsets = frequent_itemsets.assign(
    length=frequent_itemsets['itemsets'].map(len)
)
frequent_itemsets = frequent_itemsets.loc[frequent_itemsets['length'] == 2]
frequent_itemsets

Unnamed: 0,support,itemsets,length
53,0.222222,"(Azure, Automate Anywhere)",2
54,0.277778,"(AWS, Azure)",2
55,0.203704,"(Azure, Google Cloud)",2
56,0.277778,"(Azure, GitHub )",2
57,0.222222,"(Azure, Cloud Overview)",2
58,0.277778,"(Azure, Cloud Application)",2
59,0.277778,"(How to Learn, Azure)",2
60,0.240741,"(Azure, Python Begineer)",2
61,0.203704,"(Azure, Python Medium )",2
62,0.203704,"(Azure, Python Advanced)",2


# Create Association Rules

In [12]:
# Re-mine with a lower support floor (0.1) before deriving rules.
frequent_itemsets = apriori(dataset, min_support=0.1, use_colnames=True)
# Build rules, thresholding the confidence metric at 0.3.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
# Count the items on the antecedent side of each rule.
rules["antecedent_len"] = rules["antecedents"].map(len)
# Display only the rules whose antecedent holds two or more items.
rules.loc[rules["antecedent_len"] >= 2]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
2725,"(AWS, Azure)",(Automate Anywhere),0.277778,0.462963,0.129630,0.466667,1.008000,0.001029,1.006944,2
2726,"(AWS, Automate Anywhere)",(Azure),0.203704,0.444444,0.129630,0.636364,1.431818,0.039095,1.527778,2
2727,"(Azure, Automate Anywhere)",(AWS),0.222222,0.518519,0.129630,0.583333,1.125000,0.014403,1.155556,2
2728,"(Azure, Automate Anywhere)",(GitHub ),0.222222,0.481481,0.148148,0.666667,1.384615,0.041152,1.555556,2
2729,"(Azure, GitHub )",(Automate Anywhere),0.277778,0.462963,0.148148,0.533333,1.152000,0.019547,1.150794,2
2730,"(Automate Anywhere, GitHub )",(Azure),0.240741,0.444444,0.148148,0.615385,1.384615,0.041152,1.444444,2
2734,"(Azure, Automate Anywhere)",(Cloud Overview),0.222222,0.592593,0.129630,0.583333,0.984375,-0.002058,0.977778,2
2735,"(Azure, Cloud Overview)",(Automate Anywhere),0.222222,0.462963,0.129630,0.583333,1.260000,0.026749,1.288889,2
2736,"(Automate Anywhere, Cloud Overview)",(Azure),0.314815,0.444444,0.129630,0.411765,0.926471,-0.010288,0.944444,2
2737,"(Azure, Automate Anywhere)",(Cloud Application),0.222222,0.574074,0.148148,0.666667,1.161290,0.020576,1.277778,2


# Note: The rules table may look complicated, but these are the important columns

antecedents stands for "when people buy these products" (the "if" side of the rule)

consequents stands for "they are likely to buy this, too" (the "then" side of the rule)

support is how often the items in a rule appear together in the dataset; confidence is how often the rule is found to be true

Lift: if the rule had a lift of 1, it would imply that the probability of occurrence of the antecedent and that of the consequent are independent of each other.

If the lift is > 1, that lets us know the degree to which those two occurrences are dependent on one another, and makes those rules potentially useful for predicting the consequent in future data sets.

If the lift is < 1, that lets us know the items are mutually substitutable. This means that the presence of one item has a negative effect on the presence of the other item, and vice versa.

antecedent_len stands for the number of items in the antecedent item set

In [9]:
# Mine itemsets at a 0.2 support floor and derive rules thresholded at confidence 0.4.
frequent_itemsets = apriori(dataset, min_support=0.2, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4)
# Count the items on the antecedent side of each rule.
rules["antecedent_len"] = rules["antecedents"].map(len)
# Display rules with a multi-item antecedent and a lift above 2.
rules.query("antecedent_len >= 2 and lift > 2")

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
15326,"(Portfolio Management, Currency Risk)","(Azure, DevOps understood)",0.351852,0.277778,0.203704,0.578947,2.084211,0.105967,1.715278,2
15331,"(Azure, DevOps understood)","(Portfolio Management, Currency Risk)",0.277778,0.351852,0.203704,0.733333,2.084211,0.105967,2.430556,2
15401,"(Generational Wealth Management, Cloud Overvie...",(Automate Anywhere),0.203704,0.462963,0.203704,1.000000,2.160000,0.109396,inf,3
15406,"(Generational Wealth Management, Portfolio Man...","(Cloud Overview, Automate Anywhere)",0.259259,0.314815,0.203704,0.785714,2.495798,0.122085,3.197531,2
15409,"(Cloud Overview, Automate Anywhere)","(Generational Wealth Management, Portfolio Man...",0.314815,0.259259,0.203704,0.647059,2.495798,0.122085,2.098765,2
15430,"(FIX understodd, Generational Wealth Management)","(Cloud Overview, Automate Anywhere)",0.314815,0.314815,0.203704,0.647059,2.055363,0.104595,1.941358,2
15431,"(Cloud Overview, Automate Anywhere)","(FIX understodd, Generational Wealth Management)",0.314815,0.314815,0.203704,0.647059,2.055363,0.104595,1.941358,2
15447,"(How banks make money, Cloud Overview, Automat...",(Generational Wealth Management),0.203704,0.481481,0.203704,1.000000,2.076923,0.105624,inf,3
15452,"(How banks make money, Automate Anywhere)","(Generational Wealth Management, Cloud Overview)",0.259259,0.351852,0.203704,0.785714,2.233083,0.112483,3.024691,2
15453,"(How banks make money, Generational Wealth Man...","(Cloud Overview, Automate Anywhere)",0.296296,0.314815,0.203704,0.687500,2.183824,0.110425,2.192593,2


In [10]:
# Show the first rows of the rules ordered by confidence, highest first
# (head() with no argument; the captured output shows five rows).
rules.sort_values('confidence', ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
18597,"(MiFID II , Portfolio Management, Currency Risk)",(DevOps understood),0.203704,0.648148,0.203704,1.0,1.542857,0.071674,inf,3
18266,"(Futures, Equities Trading, DevOps understood)",(Currency Risk),0.203704,0.574074,0.203704,1.0,1.741935,0.086763,inf,3
18121,"(API Use , Hacker Mindset, Time Management)",(Portfolio Management),0.222222,0.648148,0.222222,1.0,1.542857,0.078189,inf,3
2512,"(Azure, DevOps understood)",(Portfolio Management),0.277778,0.648148,0.277778,1.0,1.542857,0.097737,inf,2
17487,"(Time Management, Currency Risk, Python Begineer)",(Portfolio Management),0.203704,0.648148,0.203704,1.0,1.542857,0.071674,inf,3


In [11]:
# Average lift across all rules currently held in the `rules` frame.

# Pull the "lift" column out as a Series (single-label selection gives a
# Series; a list of labels such as rules[["lift"]] would give a DataFrame).
lift_series = rules["lift"]

# Use the Series' own mean method to compute the average.
average_lift = lift_series.mean()

# Report the result with an f-string; background reading on f-strings:
# https://realpython.com/python-f-strings/
print(f"The average lift of the rules dataframe is: {average_lift}")

The average lift of the rules dataframe is: 1.2411917996788322
