In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns

In [None]:
pip install mlxtend

In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
# Load the raw event log. The file carries no header row, so the column
# names are supplied explicitly.
colname = [
    'date',
    'unique_id',
    'event',
    'price',
    'product_id',
    'category',
]
data = pd.read_csv("000.txt", header=None, names=colname)

# Keep an untouched snapshot of the raw load next to the working frame.
df = data.copy()
df.head(n=10)

In [None]:
# Shape of the freshly loaded dataset as (rows, columns), before any cleaning.
data.shape

In [None]:
# Summary of the raw load: column names, non-null counts and dtypes.
data.info()

In [None]:
# Per column: does it contain at least one missing value?
data.isnull().any(axis=0)

In [None]:
# Per column: how many values are missing?
data.isnull().sum(axis=0)

In [None]:
# Show the rows whose unique identifier is missing.
data.loc[data['unique_id'].isna()]

# This record has NaN in every feature and carries no information. It's safe to remove.

In [None]:
# Discard every row that has no unique identifier.
data = data.dropna(axis='index', subset=['unique_id'])

In [None]:
# Parse the date strings into datetimes, derive the day-of-year as an integer
# column, and order the rows by that day number.
# NOTE(review): day-of-year ignores the year, so the ordering is only
# chronological if the data covers a single calendar year - confirm.
data = (
    data
    .assign(datetim=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
    .assign(days=lambda frame: frame['datetim'].dt.dayofyear.astype('int'))
    .sort_values(by=['days'])
)

# There are a huge number of products with no product id. Let's look at their prices.

In [None]:
# Summary statistics of the product identifier column.
data.loc[:, 'product_id'].describe()

# Let's look at their prices

In [None]:
# Price distribution of the rows whose product id is the '-' placeholder
# (row filter and column pick done in a single .loc lookup).
data.loc[data['product_id'] == '-', 'price'].value_counts()

# All these products have a price of -1. They are probably offers or gift cards rather than actual products that contribute directly to revenue, so we remove them.

In [None]:
# Keep only rows with a real product id ('-' marks the placeholder rows).
# .copy() makes the result an independent frame, so the column assignment in
# the following cell does not raise SettingWithCopyWarning.
data = data[data['product_id'] != '-'].copy()

# As there is no way to determine the category of a product whose category is NaN, we create a new category, "Unknown", and assign all such products to it.

In [None]:
# Replace missing categories with the explicit label 'Unknown'.
# assign() builds a new frame instead of writing a column into the filtered
# slice produced by the previous cell, which avoids SettingWithCopyWarning
# and keeps the cell safe to re-run.
data = data.assign(category=data['category'].fillna('Unknown'))

In [None]:
# Remove rows that are exact repeats of an earlier row (first occurrence kept).
data = data.drop_duplicates(subset=None, keep='first')

In [None]:
# Shape (rows, columns) after the cleaning cells above have run.
data.shape

In [None]:
# Assumption: all products recorded for the same customer on the same date
# belong to one transaction (basket).
# Start from the cleaned `data` rather than the raw snapshot taken at load
# time; otherwise the rows removed above (missing ids, '-' placeholder
# products, duplicates) would flow straight back into the basket analysis.
df = data.copy()
# Cast both parts to str so the concatenation works whatever dtype pandas
# inferred for the columns; the separator keeps the key readable.
df['trnxn'] = df['date'].astype(str) + '_' + df['unique_id'].astype(str)

# To keep the data set small, I'm only considering the first 100,000 (1 lakh) records.
# I could have chosen the records with the top-selling products to come up with better rules, but I ran out of time.

In [None]:
# Work on the first 100,000 rows only.
df1 = df.iloc[:100000]

# Consolidating the items into one transaction per row, with each product one-hot encoded

In [None]:
# Pivot to one row per transaction and one column per product; each cell holds
# the number of rows recorded for that (transaction, product) pair, 0 if none.
# size() counts each pair once. Calling count() on the whole frame instead
# counts every remaining column separately, so unstack() yields one copy of
# the product columns per source column and apriori then reports trivial
# "rules" between copies of the same product.
basket = (
    df1.groupby(['trnxn', 'product_id'])
    .size()
    .unstack(fill_value=0)
)

In [None]:
def encode_units(x):
    """Collapse a per-product count into a 0/1 presence flag.

    Parameters
    ----------
    x : number
        Count of a product within one transaction.

    Returns
    -------
    int
        1 when ``x`` is at least 1, otherwise 0. Values strictly between 0
        and 1, and NaN, also map to 0 instead of falling through and
        returning ``None``.
    """
    # NaN compares False against any number, so it lands in the 0 branch.
    return 1 if x >= 1 else 0

# One-hot encode the basket: True where the product occurs at least once in
# the transaction. The vectorised comparison replaces the element-wise
# DataFrame.applymap call (deprecated in recent pandas) and yields the boolean
# frame that mlxtend's apriori prefers; missing values compare False.
basket_sets = basket.ge(1)

# Generating frequent item sets that have a support of at least 7% (this number was chosen so that I could get enough examples)

In [None]:
# Mine the item sets present in at least 7% of the transactions, labelled
# with the basket's column names rather than column positions.
frequent_itemsets = apriori(
    basket_sets,
    use_colnames=True,
    min_support=0.07,
)

In [None]:
# Display the frequent item sets returned by apriori.
frequent_itemsets

# Generating the rules with their corresponding support, confidence and lift

In [None]:
# Derive association rules from the frequent item sets, keeping those whose
# lift is at least 1, then preview the first five.
rules = association_rules(
    frequent_itemsets,
    min_threshold=1,
    metric="lift",
)
rules.head(5)

# Picking rules with large lift (>= 2) and high confidence (>= 0.8)

In [None]:
# Keep only the rules with lift >= 2 and confidence >= 0.8.
rules.loc[rules['lift'].ge(2) & rules['confidence'].ge(0.8)]