In [2]:
# set up my import statements


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Imports from the mlxtend library: mlxtend provides the classes and functions needed to run the Apriori algorithm
# and to extract association rules from the market baskets I will be analyzing

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [4]:
# Load the per-invoice item table that was analyzed, gathered, and organized earlier (DataExploration notebook).
# Each row pairs an invoice number with the items sold together on that invoice.
items_per_transaction = pd.read_csv(filepath_or_buffer="./datasets/Items_per_Transaction.csv")
items_per_transaction

Unnamed: 0,InvoiceNo,Description
0,536370,"ALARM CLOCK BAKELIKE PINK,ALARM CLOCK BAKELIKE..."
1,536403,HAND WARMER BIRD DESIGN
2,536527,"SET OF 6 T-LIGHTS SANTA,ROTATING SILVER ANGELS..."
3,536840,"JAM MAKING SET PRINTED,JAM JAR WITH PINK LID,J..."
4,536852,"PICTURE DOMINOES,MINI JIGSAW SPACEBOY,MINI JIG..."
...,...,...
1111,581494,"RABBIT NIGHT LIGHT,STRAWBERRY LUNCH BOX WITH C..."
1112,581570,"CHRISTMAS CRAFT TREE TOP ANGEL,PINK OWL SOFT T..."
1113,581574,"ROUND SNACK BOXES SET OF4 WOODLAND,ROUND SNACK..."
1114,581578,"SET/10 BLUE POLKADOT PARTY CANDLES,SET/10 PINK..."


In [5]:
# Only the grouping of items matters from here on, so the invoice number is not used.
# Splitting every Description string on commas produces a master list of sublists:
# each sublist is one itemset, i.e. the products sold together on a single invoice.

item_list = [
    description.split(",")
    for description in items_per_transaction["Description"]
]
item_list

[['ALARM CLOCK BAKELIKE PINK',
  'ALARM CLOCK BAKELIKE RED',
  'ALARM CLOCK BAKELIKE GREEN',
  'PANDA AND BUNNIES STICKER SHEET',
  'STARS GIFT TAPE',
  'INFLATABLE POLITICAL GLOBE',
  'VINTAGE HEADS AND TAILS CARD GAME',
  'SET/2 RED RETROSPOT TEA TOWELS',
  'ROUND SNACK BOXES SET OF4 WOODLAND',
  'SPACEBOY LUNCH BOX',
  'LUNCH BOX I LOVE LONDON',
  'CIRCUS PARADE LUNCH BOX',
  'CHARLOTTE BAG DOLLY GIRL DESIGN',
  'RED TOADSTOOL LED NIGHT LIGHT',
  'SET 2 TEA TOWELS I LOVE LONDON',
  'VINTAGE SEASIDE JIGSAW PUZZLES',
  'MINI JIGSAW CIRCUS PARADE',
  'MINI JIGSAW SPACEBOY',
  'MINI PAINT SET VINTAGE'],
 ['HAND WARMER BIRD DESIGN'],
 ['SET OF 6 T-LIGHTS SANTA',
  'ROTATING SILVER ANGELS T-LIGHT HLDR',
  'MULTI COLOUR SILVER T-LIGHT HOLDER',
  '5 HOOK HANGER MAGIC TOADSTOOL',
  '3 HOOK HANGER MAGIC GARDEN',
  '5 HOOK HANGER RED MAGIC TOADSTOOL',
  'ASSORTED COLOUR LIZARD SUCTION HOOK',
  'JUMBO BAG WOODLAND ANIMALS',
  'JUMBO BAG OWLS',
  'HOT WATER BOTTLE BABUSHKA',
  'HOMEMADE JAM SCEN

In [6]:
# Association-rule learning cannot be applied to the raw itemsets; they have to be encoded first.
# mlxtend's TransactionEncoder takes the data as a Python list of itemsets, learns the item
# labels from it (fit), and then encodes every transaction as a NumPy array (transform).

te = TransactionEncoder()
te.fit(item_list)
te_array = te.transform(item_list)
te_array

# The result is a two-dimensional array of True/False values:
# True when an item is present in an invoice, False when it is not.

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [7]:
# Present the encoded array from the previous cell as a DataFrame,
# labelling each column with the item name the encoder learned.

item_df = pd.DataFrame(data=te_array, columns=te.columns_)
item_df

# Columns are the individual items sold in the retail shop; rows are the invoices.
# One column has an empty string as its name: it is not a product but an artifact
# of the item.split step.
# Every True/False entry indicates whether that item was part of that particular invoice.

Unnamed: 0,Unnamed: 1,1 HANGER,BACK DOOR,BILLBOARD FONTS DESIGN,BIRTHDAY CARD,BREAKFAST IN BED,CHOCOLATE SPOTS,DOUGHNUTS,NEW ENGLAND,OVERCROWDED POOL.,...,ZINC HEART T-LIGHT HOLDER,ZINC STAR T-LIGHT HOLDER,ZINC BOX SIGN HOME,ZINC FOLKART SLEIGH BELLS,ZINC HEART LATTICE T-LIGHT HOLDER,ZINC HERB GARDEN CONTAINER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC WILLIE WINKIE CANDLE STICK
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1112,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1113,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1114,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# The empty-named artifact column does not belong in the DataFrame and will be removed.
# Before dropping it, count how many invoices are flagged True/False for it.

item_df.loc[:, ""].value_counts()


False    1092
True       24
Name: count, dtype: int64

In [9]:
# Drop the empty-named artifact column from the encoded DataFrame.
# errors="ignore" keeps this cell safe to re-run: item_df is reassigned here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.

item_df = item_df.drop(columns=[""], errors="ignore")

# Show (rows, columns) to confirm the artifact column was removed.
# (The original line was a dangling `item_df.`, which is a SyntaxError.)
item_df.shape

# the data I am now left with are actual transactions

(1116, 2368)

In [10]:
# With the transaction data encoded, it is straightforward to look up which invoices
# contain a particular product: keep only the rows where that product's column is True.

spaceboy_box_df = item_df.loc[item_df["SPACEBOY LUNCH BOX"].eq(True)]
spaceboy_box_df.loc[:, ["SPACEBOY LUNCH BOX"]]

# The rows shown are the transactions that include the SPACEBOY LUNCH BOX
# (the author's recorded run reported 159 such records).

Unnamed: 0,SPACEBOY LUNCH BOX
0,True
14,True
48,True
57,True
71,True
...,...
1052,True
1083,True
1106,True
1107,True


In [13]:
# A more complex query: which transactions contain BOTH the
# SPACEBOY LUNCH BOX and the DOLLY GIRL LUNCH BOX?

spaceboy_dolly = item_df.loc[
    item_df["SPACEBOY LUNCH BOX"].eq(True)
    & item_df["DOLLY GIRL LUNCH BOX"].eq(True)
]

spaceboy_dolly.loc[:, ["SPACEBOY LUNCH BOX", "DOLLY GIRL LUNCH BOX"]]

# The rows returned are the invoices where these two lunch boxes were purchased together.



Unnamed: 0,SPACEBOY LUNCH BOX,DOLLY GIRL LUNCH BOX
48,True,True
74,True,True
87,True,True
96,True,True
105,True,True
...,...,...
993,True,True
997,True,True
1017,True,True
1029,True,True
