<a href="https://colab.research.google.com/github/AnnaK8090/CIND-820_Big-Data-Analytics-Project/blob/main/CIND_820_Big_Data_Analytics_Project_ASSOCIATION_RULES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



Finding association rules between sets of co-purchased items is one of the most commonly used data mining techniques in e-commerce.
To find frequent item-sets, an algorithm known as “Apriori” is used. Apriori is the classic and most widely used
algorithm of this kind, and many variants have been developed from it [20-23].
For each item in a frequent item-set, the other items in that set are called its associate items.


In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# installing the apyori package
!pip install apyori

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
# Read every CSV from the working directory. The target names on the left are
# paired positionally with the file names on the right.
(
    customersDF,
    geolocationDF,
    order_itemsDF,
    order_paymentsDF,
    order_reviewsDF,
    ordersDF,
    productsDF,
    sellersDF,
    product_category_name_translation,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        "olist_customers_dataset.csv",
        "olist_geolocation_dataset.csv",
        "olist_order_items_dataset.csv",
        "olist_order_payments_dataset.csv",
        "olist_order_reviews_dataset.csv",
        "olist_orders_dataset.csv",
        "olist_products_dataset.csv",
        "olist_sellers_dataset.csv",
        "product_category_name_translation.csv",
    )
)

In [None]:
# Assemble one wide table by chaining joins outward from the orders table
# (merge defaults to an inner join, so unmatched rows are dropped at each step).
# NOTE(review): reviews, payments and items may each hold several rows per order,
# so these joins presumably repeat rows — confirm that is intended.
masterDF = (
    ordersDF
    .merge(customersDF, on='customer_id')
    .merge(order_reviewsDF, on='order_id')
    .merge(order_paymentsDF, on='order_id')
    .merge(order_itemsDF, on='order_id')
    .merge(productsDF, on='product_id')
    .merge(sellersDF, on='seller_id')
    .merge(product_category_name_translation, on='product_category_name')
)


In [None]:
# Preview the first rows of the merged frame
masterDF.head()

In [None]:
# (rows, columns) of the merged frame
masterDF.shape

(115609, 40)

In [None]:
# Keep only the orders whose rows cover more than one distinct English category name.
s = masterDF.groupby('order_id')['product_category_name_english'].nunique()
multi_category_orders = s.index[s > 1]
# .copy() detaches the filtered rows from the unfiltered frame, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
masterDF = masterDF[masterDF['order_id'].isin(multi_category_orders)].copy()
masterDF.shape

(1943, 40)

In [None]:
# Shape of masterDF again; the recorded output matches the previous cell's
masterDF.shape

(1943, 40)

In [None]:
# Column names, non-null counts and dtypes of masterDF
masterDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1943 entries, 53 to 115599
Data columns (total 40 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   order_id                       1943 non-null   object 
 1   customer_id                    1943 non-null   object 
 2   order_status                   1943 non-null   object 
 3   order_purchase_timestamp       1943 non-null   object 
 4   order_approved_at              1943 non-null   object 
 5   order_delivered_carrier_date   1943 non-null   object 
 6   order_delivered_customer_date  1937 non-null   object 
 7   order_estimated_delivery_date  1943 non-null   object 
 8   customer_unique_id             1943 non-null   object 
 9   customer_zip_code_prefix       1943 non-null   int64  
 10  customer_city                  1943 non-null   object 
 11  customer_state                 1943 non-null   object 
 12  review_id                      1943 non-null 

In [None]:
# Add a working copy of product_id under a new column name
masterDF = masterDF.assign(ItemID_NEW_StringType=masterDF["product_id"])

In [None]:
# Cast the copied product id column to str
masterDF = masterDF.astype({"ItemID_NEW_StringType": str})

In [None]:
# Build the combined item label: product id immediately followed by its English
# category name. Vectorised concatenation replaces the Python-level zip/join loop;
# a missing category now yields NaN for that row instead of raising TypeError.
masterDF['ItemID_Category'] = (
    masterDF['ItemID_NEW_StringType'] + masterDF['product_category_name_english']
)

In [None]:
# Every row counts as one unit; the column is summed per order and item later on
masterDF = masterDF.assign(Quantity=1)

In [None]:
# Clean-up pass, written as one chain that returns a new frame (no inplace=True,
# which would mutate a filtered slice and is not idempotent on re-run):
#   1. trim leading/trailing whitespace from the combined item labels
#   2. drop rows that have no order id
#   3. store the order ids as strings
masterDF = (
    masterDF
    .assign(ItemID_Category=masterDF['ItemID_Category'].str.strip())
    .dropna(axis=0, subset=['order_id'])
    .assign(order_id=lambda frame: frame['order_id'].astype('str'))
)
  

In [None]:
# Group the rows by English category name, then show which row labels belong to each group
masterDF_groups = masterDF.groupby(by='product_category_name_english')
masterDF_groups.groups


In [None]:
# First rows (five by default) of every category group
masterDF_groups.head()

In [None]:
# Mapping from each category name to the row labels in that group
masterDF_groups.groups

In [None]:
#options = ["bed_bath_table","baby","furniture_decor","pet_shop","sports_leisure","auto","fashion_bags_accessories"]

In [None]:
#masterDF = masterDF[masterDF['product_category_name_english'].isin(options)]


In [None]:
# Total Quantity for every (order, item label) pair
item_counts = masterDF.groupby(['order_id', 'ItemID_Category'])['Quantity'].sum()

# Pivot the item labels into columns: one row per order_id, one column per item,
# with 0 wherever the order does not contain the item.
basket_bed_bath_table = item_counts.unstack().fillna(0)


In [None]:
# Preview the first rows of the order-by-item matrix
basket_bed_bath_table.head()

ItemID_Category,0042f1a9a7e0edd1400c6cd0fda065f8health_beauty,007c63ae4b346920756b5adcad8095dehousewares,00ba6d766f0b1d7b78a5ce3e1e033263housewares,011377a7487fef47fc9e73fa5f7322a6baby,014a8a503291921f7b004a5215bb3c36baby,01b660ebc1a0c293ccf9b117fa6dd8bbhousewares,01c2e91674406ebaca6a1bbf7f61c3f0computers_accessories,01cf7c4cffff8db0a1cbe612bd2d50a4furniture_decor,01e20e6604216c8adb31d463214ba00chome_construction,01fc56750f0d3444c4a1746ecf19dee0sports_leisure,...,fe9c4b2cf9c3adbaf3644880fb72254dfurniture_decor,fe9dfbe7f974621789683b7b78be2a16health_beauty,feb4ade62e32b8d74c6f69f635057964furniture_living_room,fec2d939a171210847d8f2d102f0dba5garden_tools,fec3b45dc09b257690a09a742870b149garden_tools,fef0296e6442db59700c92c692c90e90costruction_tools_garden,fef7934cc233ee5b1dc13094d98a1465cool_stuff,ff7263dfb3cfff5421ada48c3899c313construction_tools_construction,ff85ff517698c3fe8b200afddda7fb3ahousewares,ffbc83054b3741a8d67fc59d9cf9d42dhousewares
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002f98c0f7efd42638ed6100ca699b42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
005d9a5423d47281ac463a968b3936fb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
014405982914c2cde2796ddcf0b8703d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01b1a7fdae9ad1837d6ab861705a1fa5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01cce1175ac3c4a450e3a0f856d02734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Hot encoding to make the data suitable for Apriori library
def hot_encode(x):
    """Collapse a numeric cell value into a 0/1 presence flag.

    Args:
        x: Numeric value to encode.

    Returns:
        1 when ``x`` is at least 1, otherwise 0. Values strictly between 0 and 1
        and NaN also map to 0, so every input yields an integer flag instead of
        falling through to an implicit ``None``.
    """
    return 1 if x >= 1 else 0
  
# Encoding the datasets: map every cell to a 0/1 flag, then cast to bool, which is
# the input format mlxtend's apriori documents as preferred over integer frames.
masterDF_encoded = basket_bed_bath_table.applymap(hot_encode).astype(bool)
basket_bed_bath_table = masterDF_encoded

In [None]:
# Building the model
# Thresholds are named so they can be found and tuned in one place.
# NOTE(review): the recorded output shows rules with support ~0.0014 and lift 722,
# which looks like itemsets seen in a single order passing the filter — consider
# raising MIN_SUPPORT.
MIN_SUPPORT = 0.001  # passed to apriori as min_support
MIN_LIFT = 1         # passed to association_rules as the lift threshold

frq_items = apriori(basket_bed_bath_table, min_support=MIN_SUPPORT, use_colnames=True)

# Collecting the inferred rules in a dataframe, highest confidence first and
# ties broken by highest lift
rules = association_rules(frq_items, metric="lift", min_threshold=MIN_LIFT)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])

# Leave the frame as the last expression so the notebook renders it as a table;
# print() falls back to wrapped, truncated plain text.
rules.head()

                                         antecedents  \
0    (0042f1a9a7e0edd1400c6cd0fda065f8health_beauty)   
1        (3e45fc327c9740f1ae03383a8e201968perfumery)   
2       (007c63ae4b346920756b5adcad8095dehousewares)   
3  (38add59349dc5b9c3effc6b93925cb97furniture_decor)   
4     (b931645cdc2d9868f01544e8db63f5abgarden_tools)   

                                         consequents  antecedent support  \
0        (3e45fc327c9740f1ae03383a8e201968perfumery)            0.001385   
1    (0042f1a9a7e0edd1400c6cd0fda065f8health_beauty)            0.001385   
2  (38add59349dc5b9c3effc6b93925cb97furniture_decor)            0.001385   
3       (007c63ae4b346920756b5adcad8095dehousewares)            0.001385   
4       (00ba6d766f0b1d7b78a5ce3e1e033263housewares)            0.001385   

   consequent support   support  confidence   lift  leverage  conviction  
0            0.001385  0.001385         1.0  722.0  0.001383         inf  
1            0.001385  0.001385         1.0  722.0  0.00

In [None]:
import os  

In [None]:
# Make sure the export directory exists, then write the rules table into it as CSV
out_dir = 'folder/subfolder'
os.makedirs(out_dir, exist_ok=True)
rules.to_csv(out_dir + '/out.csv')