In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
!pip install mlxtend
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
# Allow up to 500 characters per line of printed output.
pd.set_option('display.width', 500)
# Keep a printed frame on a single line instead of wrapping it across several lines.
pd.set_option('display.expand_frame_repr', False)
from mlxtend.frequent_patterns import apriori, association_rules

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/online-retail/Online_Retail.xlsx
/kaggle/input/online-retail/Online_Retail.csv


In [None]:
# Data pre-processing

def missing_values_analysis(data):
    """Summarise the missing values of ``data`` column by column.

    Only columns that contain at least one null value are reported.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, ordered by ascending number of nulls, with
        two columns: 'Total Missing Values' (null count) and 'Ratio'
        (nulls as a percentage of the row count, rounded to 2 decimals).
    """
    columns_with_nulls = [name for name in data.columns if data[name].isnull().any()]
    null_counts = data[columns_with_nulls].isnull().sum()

    missing_count = null_counts.sort_values(ascending=True)
    missing_pct = (null_counts / data.shape[0] * 100).sort_values(ascending=True)

    return pd.concat(
        [missing_count, np.round(missing_pct, 2)],
        axis=1,
        keys=['Total Missing Values', 'Ratio'],
    )

def check_df(data, row_num=5):
    """Print a quick exploratory overview of ``data``.

    The report contains, in order: shape, ``info()``, column dtypes, the
    first and last ``row_num`` rows, summary statistics at several
    percentiles and the missing-value summary produced by
    ``missing_values_analysis``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.
    row_num : int, default 5
        Number of rows shown in the head and tail sections.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("*************** Dataset Shape ***************")
    print("No. of Rows:", data.shape[0], "\nNo. of Columns:", data.shape[1])
    print("*************** Dataset Information ***************")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() only added a stray "None" line to the output.
    data.info()
    print("*************** Types of Columns ***************")
    print(data.dtypes)
    print(f"*************** First {row_num} Rows ***************")
    print(data.head(row_num))
    print(f"*************** Last {row_num} Rows ***************")
    print(data.tail(row_num))
    print("*************** Summary Statistics of The Dataset ***************")
    print(data.describe([0.10, 0.25, 0.50, 0.70, 0.80, 0.90, 0.95, 0.99]).T)
    print("*************** Dataset Missing Values Analysis ***************")
    print(missing_values_analysis(data))




# Load the raw Online Retail transactions from the input directory.
df_ = pd.read_csv("/kaggle/input/online-retail/Online_Retail.csv")
# Work on a copy so the untouched raw frame (df_) stays available for later cells.
df = df_.copy()
# Print the exploratory overview of the working copy.
check_df(df)

*************** Dataset Shape ***************
No. of Rows: 541909 
No. of Columns: 8
*************** Dataset Information ***************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB
None
*************** Types of Columns ***************
InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID     float64
Country         object
dtype: object
********

In [None]:
#Data cleaning

def outlier_thresholds(dataframe, variable, q1=0.01, q3=0.99):
    """Compute lower and upper outlier limits for one column.

    The limits are built like the classic IQR rule, but on the range
    between two configurable quantiles (by default the 1st and 99th
    percentiles rather than the quartiles):

        low_limit = Q_low  - 1.5 * (Q_high - Q_low)
        up_limit  = Q_high + 1.5 * (Q_high - Q_low)

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    variable : str
        Name of the numeric column to analyse.
    q1 : float, default 0.01
        Lower quantile, between 0 and 1.
    q3 : float, default 0.99
        Upper quantile, between 0 and 1.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    lower_quantile = dataframe[variable].quantile(q1)
    upper_quantile = dataframe[variable].quantile(q3)
    quantile_range = upper_quantile - lower_quantile
    up_limit = upper_quantile + 1.5 * quantile_range
    low_limit = lower_quantile - 1.5 * quantile_range
    return low_limit, up_limit

def replace_with_thresholds(dataframe, variable):
    """Cap the values of one column at its outlier limits, in place.

    Values below the lower limit returned by ``outlier_thresholds`` are
    raised to that limit and values above the upper limit are lowered to
    it. Missing values are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; the column ``variable`` is replaced on this object.
    variable : str
        Name of the numeric column to cap.

    Returns
    -------
    None
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    # clip() builds a new Series and picks a dtype that can hold the limits,
    # so a float limit applied to an integer column no longer relies on the
    # silent upcast of a .loc assignment, which newer pandas deprecates.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)

def retail_data_prep(dataframe):
    """Return a cleaned copy of the retail transactions.

    Steps, in order:
      1. drop every row that has a missing value;
      2. drop invoices whose number contains "C"
         (presumably cancellations - confirm against the dataset docs);
      3. keep only rows with a positive Quantity and a positive UnitPrice;
      4. cap Quantity and UnitPrice at their outlier limits.

    The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw transactions with at least the columns 'InvoiceNo', 'Quantity'
        and 'UnitPrice'.

    Returns
    -------
    pandas.DataFrame
        The cleaned transactions.
    """
    # Non-mutating dropna: the caller's frame must not lose rows as a side effect.
    dataframe = dataframe.dropna()

    contains_c = dataframe["InvoiceNo"].str.contains("C", na=False)
    keep = ~contains_c & (dataframe["Quantity"] > 0) & (dataframe["UnitPrice"] > 0)

    # Explicit copy so the capping below writes into an independent frame
    # instead of a filtered selection of another one.
    dataframe = dataframe[keep].copy()

    replace_with_thresholds(dataframe, "Quantity")
    replace_with_thresholds(dataframe, "UnitPrice")
    return dataframe

# Clean the working copy with retail_data_prep.
df = retail_data_prep(df)
# NOTE(review): this expression is neither assigned nor the last line of the cell,
# so its result is not shown anywhere.
df.isnull().sum()
# Summary statistics of the cleaned data (last expression, so it is displayed).
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,397884.0,11.830797,25.523078,1.0,2.0,6.0,12.0,298.5
UnitPrice,397884.0,2.893454,3.22709,0.001,1.25,1.95,3.75,37.06
CustomerID,397884.0,15294.423453,1713.14156,12346.0,13969.0,15159.0,16795.0,18287.0


In [None]:
# Preparing ARL data structures (invoice x product matrix)
# Restrict the analysis to the transactions from France.
df_fr = df[df['Country'] == "France"]

# Overview: total quantity per (invoice, product description) pair.
print(df_fr.groupby(['InvoiceNo', 'Description']).agg({"Quantity": "sum"}).head(20))

# Pivot the descriptions from rows into columns; products missing from an invoice appear as NaN.
print(df_fr.groupby(['InvoiceNo', 'Description']).agg({"Quantity": "sum"}).unstack().iloc[0:5, 0:5])

# Same pivot with the NaN cells filled with 0.
print(df_fr.groupby(['InvoiceNo', 'Description']).agg({"Quantity": "sum"}).unstack().fillna(0).iloc[0:5, 0:5])

# Same pivot keyed by StockCode and binarised: 1 if the summed quantity is greater than 0, otherwise 0.
print(df_fr.groupby(['InvoiceNo', 'StockCode']). \
    agg({"Quantity": "sum"}). \
    unstack(). \
    fillna(0). \
    applymap(lambda x: 1 if x > 0 else 0).iloc[0:5, 0:5])

                                               Quantity
InvoiceNo Description                                  
536370     SET 2 TEA TOWELS I LOVE LONDON            24
          ALARM CLOCK BAKELIKE GREEN                 12
          ALARM CLOCK BAKELIKE PINK                  24
          ALARM CLOCK BAKELIKE RED                   24
          CHARLOTTE BAG DOLLY GIRL DESIGN            20
          CIRCUS PARADE LUNCH BOX                    24
          INFLATABLE POLITICAL GLOBE                 48
          LUNCH BOX I LOVE LONDON                    24
          MINI JIGSAW CIRCUS PARADE                  24
          MINI JIGSAW SPACEBOY                       24
          MINI PAINT SET VINTAGE                     36
          PANDA AND BUNNIES STICKER SHEET            12
          POSTAGE                                     3
          RED TOADSTOOL LED NIGHT LIGHT              24
          ROUND SNACK BOXES SET OF4 WOODLAND         24
          SET/2 RED RETROSPOT TEA TOWELS        

In [None]:
# Wrap the invoice x product matrix construction in a reusable function.
def create_invoice_product_df(dataframe, id=False):
    """Build the boolean invoice x product basket matrix.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Transactions with the columns 'InvoiceNo', 'Quantity' and either
        'StockCode' or 'Description'.
    id : bool, default False
        If true, products are identified by 'StockCode'; otherwise by
        'Description'.

    Returns
    -------
    pandas.DataFrame
        One row per invoice and one column per product. A cell is True when
        the summed quantity of that product on that invoice is greater than
        0, and False otherwise (including products absent from the invoice).
    """
    product_key = "StockCode" if id else "Description"
    quantity_per_pair = dataframe.groupby(['InvoiceNo', product_key])['Quantity'].sum()
    # fill_value=0 marks products that are absent from an invoice; one
    # vectorized comparison then replaces the per-cell applymap and yields
    # the same boolean matrix for both branches.
    return quantity_per_pair.unstack(fill_value=0) > 0

# Boolean invoice x StockCode matrix for the French transactions.
fr_inv_pro_df = create_invoice_product_df(df_fr, id=True)

def check_id(dataframe, stock_code):
    """Print the description of a product given its stock code.

    The 'Description' of the first row whose 'StockCode' equals
    ``stock_code`` is printed as a one-element list.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Transactions with the columns 'StockCode' and 'Description'.
    stock_code : str
        Stock code to look up.

    Raises
    ------
    IndexError
        If no row has the requested stock code.
    """
    matching_rows = dataframe[dataframe["StockCode"] == stock_code]
    first_description = matching_rows[["Description"]].values[0]
    print(first_description.tolist())


# Look up the descriptions behind two stock codes of the French subset.
check_id(df_fr, '10002')
check_id(df_fr, '10120')

In [None]:
# Print the exploratory overview of the invoice x StockCode basket matrix.
check_df(fr_inv_pro_df)

*************** Dataset Shape ***************
No. of Rows: 461 
No. of Columns: 1543
*************** Dataset Information ***************
<class 'pandas.core.frame.DataFrame'>
Index: 461 entries, 536370 to C581316
Columns: 1543 entries, 10002 to POST
dtypes: bool(1543)
memory usage: 698.3+ KB
None
*************** Types of Columns ***************
StockCode
10002     bool
10120     bool
10125     bool
10135     bool
11001     bool
          ... 
90201B    bool
90201C    bool
C2        bool
M         bool
POST      bool
Length: 1543, dtype: object
*************** First 5 Rows ***************
StockCode  10002  10120  10125  10135  11001  15036  15039  15044C  15056BL  15056N  15056P  15058A  15058B  15058C  16012  16048  16156L  16156S  16161P  16161U  16169E  16218  16219  16225  16236  16237  16238  16258A  17011F  17012A  17012B  17174  20615  20617  20658  20665  20668  20674  20675  20676  20677  20679  20681  20682  20684  20685  20686  20702  20704  20711  20712  20713  20717  20718 

In [None]:
# Extract the association rules from the French basket matrix.
# Frequent itemsets: min_support=0.01 keeps itemsets found in at least 1% of the invoices;
# use_colnames=True labels the items with the matrix column names (stock codes).
frequent_itemsets = apriori(fr_inv_pro_df,
                            min_support=0.01,
                            use_colnames=True)

# NOTE(review): the sorted frame is neither assigned nor the last expression of the cell,
# so this line has no visible effect.
frequent_itemsets.sort_values("support", ascending=False)

rules = association_rules(frequent_itemsets,
                          metric="support",
                          min_threshold=0.01)
# How to read a rule "antecedents -> consequents" (both sides are sets of items):
# - antecedents: the item(s) already in the basket (the "if" side); consequents: the item(s) suggested (the "then" side)
# - antecedent support / consequent support: share of all invoices that contain the antecedents / the consequents
# - support: share of all invoices that contain both sides (invoices with X and Y / all invoices)
# - confidence: conditional probability of the consequents given the antecedents (invoices with X and Y / invoices with X)
# - lift: how much more likely Y is once X is known to be in the basket, compared with knowing nothing about X
# - lift = confidence / (share of invoices containing Y)
# NOTE(review): the filtered frame below is not assigned and is not the last expression of the cell, so it is not shown.
rules[(rules["support"]>0.05) & (rules["confidence"]>0.1) & (rules["lift"]>5)]

check_id(df_fr, '21086')

# Same filter, ordered by descending confidence (also discarded - see the note above).
rules[(rules["support"]>0.05) & (rules["confidence"]>0.1) & (rules["lift"]>5)]. \
sort_values("confidence", ascending=False)

# Wrap the whole pipeline (country filter -> basket matrix -> frequent itemsets -> rules) in one function.
def create_rules(dataframe, id=True, country="France", min_support=0.01, min_threshold=0.01):
    """Generate association rules for the transactions of one country.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Cleaned transactions with the columns 'Country', 'InvoiceNo',
        'Quantity' and 'StockCode' / 'Description'.
    id : bool, default True
        Passed to ``create_invoice_product_df``: identify products by
        'StockCode' when true, by 'Description' otherwise.
    country : str, default "France"
        Value of the 'Country' column to keep.
    min_support : float, default 0.01
        Minimum support passed to ``apriori``.
    min_threshold : float, default 0.01
        Minimum value of the "support" metric passed to
        ``association_rules``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``association_rules``.
    """
    # Separate names for each stage instead of rebinding the `dataframe`
    # parameter to a different kind of frame.
    country_transactions = dataframe[dataframe['Country'] == country]
    basket_matrix = create_invoice_product_df(country_transactions, id)
    frequent_itemsets = apriori(basket_matrix, min_support=min_support, use_colnames=True)
    return association_rules(frequent_itemsets, metric="support", min_threshold=min_threshold)

# Start again from the raw frame and run the full pipeline through the helper functions.
df = df_.copy()

df = retail_data_prep(df)
# Called with the defaults: products keyed by StockCode, country "France".
rules = create_rules(df)

In [None]:
# let's make recommender function

def arl_recommender(rules_df, product_id, rec_count=1):
    """Recommend products for ``product_id`` from association rules.

    Rules are scanned in order of descending lift. Every rule whose
    antecedents contain ``product_id`` contributes the first item of its
    consequents. Each product is recommended at most once, so asking for
    more recommendations never repeats an earlier one.

    Parameters
    ----------
    rules_df : pandas.DataFrame
        Rules with the columns 'antecedents' and 'consequents' (collections
        of product ids) and 'lift'.
    product_id : hashable
        Product to find recommendations for.
    rec_count : int, default 1
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``rec_count`` distinct product ids, strongest rule first.
        Empty when no rule matches or ``rec_count`` is not positive.
    """
    if rec_count <= 0:
        return []

    sorted_rules = rules_df.sort_values("lift", ascending=False)
    recommendation_list = []
    already_recommended = set()

    # Iterate the two columns together instead of an iloc lookup per match.
    for antecedents, consequents in zip(sorted_rules["antecedents"], sorted_rules["consequents"]):
        if product_id not in antecedents:
            continue
        # First item of the consequents, same choice as list(consequents)[0].
        candidate = next(iter(consequents), None)
        if candidate is None or candidate in already_recommended:
            continue
        already_recommended.add(candidate)
        recommendation_list.append(candidate)
        if len(recommendation_list) == rec_count:
            break

    return recommendation_list


# Ask for one, two and three recommendations for the same stock code.
print(arl_recommender(rules, '22492', 1))
print(arl_recommender(rules, '22492', 2))
print(arl_recommender(rules, '22492', 3))

['22554']
['22554', '22382']
['22554', '22382', '22382']
