### Importing Libraries

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

### Loading and Analyse dataset

In [2]:
df = pd.read_csv('Groceries_dataset.csv')

In [3]:
df

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
...,...,...,...
38760,4471,08-10-2014,sliced cheese
38761,2022,23-02-2014,candy
38762,1097,16-04-2014,cake bar
38763,1510,03-12-2014,fruit/vegetable juice


In [4]:
df.shape

(38765, 3)

#### setting Date as index column

In [5]:
df.set_index('Date',inplace = True)

In [6]:
df

Unnamed: 0_level_0,Member_number,itemDescription
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
21-07-2015,1808,tropical fruit
05-01-2015,2552,whole milk
19-09-2015,2300,pip fruit
12-12-2015,1187,other vegetables
01-02-2015,3037,whole milk
...,...,...
08-10-2014,4471,sliced cheese
23-02-2014,2022,candy
16-04-2014,1097,cake bar
03-12-2014,1510,fruit/vegetable juice


### applying date and time format  to our date column 

In [7]:
df.index=pd.to_datetime(df.index)

### Checking length of dataset

In [8]:
total_item = len(df)
total_days = len(np.unique(df.index.date))
total_year = len(np.unique(df.index.year))
print("Total items",total_item)
print("Total days",total_days)
print("Total year",total_year)

Total items 38765
Total days 728
Total year 2


### grouping dataset to form a list of products bought by same customer on same date

In [9]:
df=df.groupby(['Member_number','Date'])['itemDescription'].apply(lambda x: list(x))

In [10]:
df

Member_number  Date      
1000           2014-06-24                    [whole milk, pastry, salty snack]
               2015-03-15    [sausage, whole milk, semi-finished bread, yog...
               2015-05-27                           [soda, pickled vegetables]
               2015-07-24                       [canned beer, misc. beverages]
               2015-11-25                          [sausage, hygiene articles]
                                                   ...                        
4999           2015-05-16                    [butter milk, whipped/sour cream]
               2015-12-26                               [bottled water, herbs]
5000           2014-09-03                      [fruit/vegetable juice, onions]
               2014-11-16                     [bottled beer, other vegetables]
               2015-10-02         [soda, root vegetables, semi-finished bread]
Name: itemDescription, Length: 14963, dtype: object

In [11]:
df.shape

(14963,)

#### Now we have only rows 

### apriori takes list as an input, hence converting dataset to a list

In [12]:
transactions = df.values.tolist()
transactions

[['whole milk', 'pastry', 'salty snack'],
 ['sausage', 'whole milk', 'semi-finished bread', 'yogurt'],
 ['soda', 'pickled vegetables'],
 ['canned beer', 'misc. beverages'],
 ['sausage', 'hygiene articles'],
 ['sausage', 'whole milk', 'rolls/buns'],
 ['whole milk', 'soda'],
 ['frankfurter', 'soda', 'whipped/sour cream'],
 ['frankfurter', 'curd'],
 ['beef', 'white bread'],
 ['butter', 'whole milk'],
 ['frozen vegetables', 'other vegetables'],
 ['tropical fruit', 'sugar'],
 ['butter milk', 'specialty chocolate'],
 ['frozen meals', 'dental care'],
 ['rolls/buns', 'rolls/buns'],
 ['root vegetables', 'detergent'],
 ['sausage', 'rolls/buns'],
 ['dish cleaner', 'cling film/bags'],
 ['canned beer', 'frozen fish'],
 ['pip fruit', 'whole milk', 'tropical fruit'],
 ['root vegetables', 'whole milk', 'pastry'],
 ['rolls/buns', 'red/blush wine', 'chocolate'],
 ['other vegetables', 'shopping bags'],
 ['whole milk', 'chocolate', 'packaged fruit/vegetables', 'rolls/buns'],
 ['other vegetables', 'hygiene

##### let's take 3 item bought daily in seven days a week, so  our support 3*7/38765=0.0005

### Training the Apriori model on the dataset

In [13]:
from apyori import apriori
rules =apriori(transactions=transactions, min_support=0.0005, min_confidence = 0.05, min_lift=2, min_length=2, max_length=2)
results = list(rules)
results



[RelationRecord(items=frozenset({'whole milk', 'brandy'}), support=0.0008688097306689834, ordered_statistics=[OrderedStatistic(items_base=frozenset({'brandy'}), items_add=frozenset({'whole milk'}), confidence=0.34210526315789475, lift=2.1662805978127717)]),
 RelationRecord(items=frozenset({'mustard', 'frankfurter'}), support=0.0005346521419501437, ordered_statistics=[OrderedStatistic(items_base=frozenset({'mustard'}), items_add=frozenset({'frankfurter'}), confidence=0.08695652173913045, lift=2.302885725278954)]),
 RelationRecord(items=frozenset({'pickled vegetables', 'ham'}), support=0.0005346521419501437, ordered_statistics=[OrderedStatistic(items_base=frozenset({'pickled vegetables'}), items_add=frozenset({'ham'}), confidence=0.05970149253731344, lift=3.4895055970149254)]),
 RelationRecord(items=frozenset({'soft cheese', 'hamburger meat'}), support=0.0006014836596939117, ordered_statistics=[OrderedStatistic(items_base=frozenset({'soft cheese'}), items_add=frozenset({'hamburger meat'}

### Putting the results well organised into a Pandas DataFrame

In [14]:
def inspect(results):
    lhs         = [tuple(result[2][0][0])[0] for result in results]
    rhs         = [tuple(result[2][0][1])[0] for result in results]
    supports    = [result[1] for result in results]
    confidences = [result[2][0][2] for result in results]
    lifts       = [result[2][0][3] for result in results]
    return list(zip(lhs, rhs, supports, confidences, lifts))
ordered_results = pd.DataFrame(inspect(results), columns = ['Left Hand Side', 'Right Hand Side', 'Support', 'Confidence', 'Lift'])

In [15]:
ordered_results.head(10)


Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift
0,brandy,whole milk,0.000869,0.342105,2.166281
1,mustard,frankfurter,0.000535,0.086957,2.302886
2,pickled vegetables,ham,0.000535,0.059701,3.489506
3,soft cheese,hamburger meat,0.000601,0.06,2.745505
4,sweet spreads,pip fruit,0.000535,0.117647,2.398301
5,spices,soda,0.000601,0.225,2.317051
6,sweet spreads,tropical fruit,0.000735,0.161765,2.387066


### Displaying the results sorted by descending lifts

In [16]:
ordered_results.nlargest(n=10,columns='Lift')

Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift
2,pickled vegetables,ham,0.000535,0.059701,3.489506
3,soft cheese,hamburger meat,0.000601,0.06,2.745505
4,sweet spreads,pip fruit,0.000535,0.117647,2.398301
6,sweet spreads,tropical fruit,0.000735,0.161765,2.387066
5,spices,soda,0.000601,0.225,2.317051
1,mustard,frankfurter,0.000535,0.086957,2.302886
0,brandy,whole milk,0.000869,0.342105,2.166281
