## Import libraries ##

In [None]:
# importing the required libraries
%matplotlib inline
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
# Mount Google Drive at /content/gdrive so files stored there can be read by path
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
# Read the transactions CSV from the mounted Drive folder into a DataFrame.
# pandas is already imported in the first cell; the re-import here is harmless.
import pandas as pd
bread = pd.read_csv('/content/gdrive/My Drive/BreadBasket_DMS.csv')

## Data Information ##

In [None]:
# Display the first 10 rows of the raw data
bread.head(10)

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes)
bread.info()

***
## Check if any values are missing ##

In [None]:
# Count null (NaN/None) entries in each column
bread.isnull().sum()

In [None]:
# Placeholder values that are treated as "missing" in the Item column
missing_value = ["NaN", "NONE", "None", "Nil", "nan", "none", "nil", 0]
# Select the matching rows once and reuse them for the count and the preview
placeholder_rows = bread[bread.Item.isin(missing_value)]
print("There are {0} missing values in the dataframe.".format(len(placeholder_rows)))
placeholder_rows.head(10)

In [None]:
# Keep only rows whose item is not the "NONE" placeholder and whose
# transaction number is positive
keep_row = (bread.Item != "NONE") & (bread.Transaction > 0)
bread = bread[keep_row]
print("Number of rows: {0}".format(len(bread)))
bread.head(10)

***
## Merge Date and Time into a DatetimeIndex ##

In [None]:
# Join the Date and Time text columns and parse the result as one timestamp
timestamp_text = bread['Date'] + ' ' + bread['Time']
bread['Datetime'] = pd.to_datetime(timestamp_text)
# Index the rows by timestamp and keep only the transaction id and item name
bread = bread.set_index("Datetime")[["Transaction", "Item"]]
bread.head(10)

In [None]:
# For each transaction, count the distinct values in every remaining column,
# then summarise those per-transaction counts (mean, quartiles, max, ...)
Ser = bread.groupby('Transaction').nunique()
Ser.describe()

### Brief summary of the data ###

In [None]:
# Headline figures for the timestamp-indexed data
total_items = len(bread)
total_days = len(np.unique(bread.index.date))
# Count distinct (year, month) periods rather than distinct month numbers, so
# the same calendar month in two different years is counted twice
total_months = bread.index.to_period("M").nunique()
# Guard against an empty frame so the summary still prints
average_items = total_items / total_days if total_days else 0
unique_items = bread.Item.unique().size

print("Bakery sells {} unique items ".format(unique_items))
print("Total {} items sold in {} days within {} months".format(total_items, total_days, total_months))
print("About an average of {} items are sold daily".format(average_items))


***
## Visualization ##

In [None]:
# Pie chart of each item's percentage share of all rows
temp_series = bread['Item'].value_counts()
labels = np.array(temp_series.index)
share_of_total = temp_series / temp_series.sum()
sizes = np.array(share_of_total * 100)

plt.figure(figsize=(10, 10))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=200)
plt.title("Departments distribution", fontsize=15)
# write the image to disk before show() displays the figure
plt.savefig('Pie-chart')
plt.show()

In [None]:
# Bar chart of how many item rows fall in each calendar month
from collections import Counter  # imported here so this cell runs on its own

# `bread` is indexed by timestamp at this point (the Date column was merged
# into the index), so the month is read from the index
counter = Counter(bread.index.month)

month_numbers = list(counter.keys())
month_counts = list(counter.values())
print(len(counter))

indexes = np.arange(len(month_numbers))
width = 0.7
plt.bar(indexes, month_counts, width)
# bars are centre-aligned on `indexes` by default, so the tick labels go there too
plt.xticks(indexes, month_numbers)
# label the figure before saving so the saved image includes the labels
plt.title('Items Sold per Month')
plt.xlabel('Month')
plt.ylabel('Number of Items Sold')
plt.savefig('ss')
plt.show()

In [None]:
# Top 10 items by frequency; normalize=True returns each item's fraction of
# all rows (value_counts sorts in descending order)
bread.Item.value_counts(normalize=True)[:10]

In [None]:
# Bar chart of the 10 most frequent items, ranked by share of all rows.
# NOTE: normalize=True yields fractions (0-1), although the axis label says "Percentage"
bread.Item.value_counts(normalize=True)[:10].plot(kind="bar", title="Percentage of Sales by Item").set(xlabel="Item", ylabel="Percentage")

In [None]:
# Bar chart of the 10 most frequent items, ranked by absolute row count
bread.Item.value_counts()[:10].plot(kind="bar", title="Total Number of Sales by Item").set(xlabel="Item", ylabel="Total Number")

In [None]:
# Time-series chart: resample the timestamp index into daily bins and count
# the item rows in each bin
bread["Item"].resample("D").count().plot(figsize=(12,5), grid=True, title="Total Number of Items Sold by Date").set(xlabel="Date", ylabel="Total Number of Items Sold")

In [None]:
# Number of item rows in each monthly bin of the timestamp index
bread["Item"].resample("M").count()

In [None]:
# Time-series chart: resample the timestamp index into monthly bins and count
# the item rows in each bin
bread["Item"].resample("M").count().plot(figsize=(12,5), grid=True, title="Total Number by Items Sold by Month").set(xlabel="Date", ylabel="Total Number of Items Sold")

In [None]:
# Derive two helper columns from the timestamp index:
#   Hour    - hour of the day
#   Weekday - DatetimeIndex.weekday is Monday=0..Sunday=6, so adding 1 gives
#             Monday=1..Sunday=7
timestamps = bread.index
bread["Hour"] = timestamps.hour
bread["Weekday"] = timestamps.weekday + 1

bread.head(10)

In [None]:
# Average number of item rows per hour of the day: rows recorded in each hour
# divided by the number of distinct days in the data
items_per_hour = bread.groupby("Hour")[["Item"]].count()
bread_groupby_hour = items_per_hour / total_days
bread_groupby_hour

In [None]:
# Line chart of the per-hour averages held in bread_groupby_hour
bread_groupby_hour.plot(y="Item", figsize=(12,5), title="Average Number by Items Sold by Hour of the Day").set(xlabel="Hour of the Day (24 hour time)", ylabel="Average Number of Items Sold")

In [None]:
# Total number of item rows recorded on each weekday (1=Monday .. 7=Sunday)
bread_groupby_weekday = bread.groupby("Weekday")[["Item"]].count()
bread_groupby_weekday

In [None]:
# To turn weekday totals into per-weekday averages we first need to know how
# many distinct dates in the data fall on each weekday.

import datetime 
daterange = pd.date_range(datetime.date(2016, 10, 30), datetime.date(2017, 4, 9))

# Slot i holds the number of distinct dates whose ISO weekday is i
# (1=Monday .. 7=Sunday); slot 0 is never used.
days_per_isoweekday = [0] * 8
for day in np.unique(bread.index.date):
    days_per_isoweekday[day.isoweekday()] += 1

(monday, tuesday, wednesday, thursday,
 friday, saturday, sunday) = days_per_isoweekday[1:]

all_weekdays = sum(days_per_isoweekday)

print("monday = {0}, tuesday = {1}, wednesday = {2}, thursday = {3}, friday = {4}, saturday = {5}, sunday = {6}, total = {7}".format(monday, tuesday, wednesday, thursday, friday, saturday, sunday, all_weekdays))

In [None]:
# Average items sold per weekday = items recorded on that weekday divided by
# the number of distinct dates in the data that fall on that weekday.
# The divisors are derived from the timestamp index rather than typed in by
# hand, so they stay correct if the data or the cleaning steps change.
distinct_days = bread.index.normalize().unique()
# weekday is Monday=0..Sunday=6; +1 matches the Weekday column (1..7)
days_per_weekday = pd.Series(distinct_days.weekday + 1).value_counts()

divisors = days_per_weekday.reindex(bread_groupby_weekday.index)
# a weekday without any matching date gets 0
bread_groupby_weekday["Average"] = (bread_groupby_weekday.Item / divisors).fillna(0)
bread_groupby_weekday

In [None]:
# Line chart of the per-weekday averages stored in the "Average" column
bread_groupby_weekday.plot(y="Average", figsize=(12,5), title="Average Number by Items Sold by Day of the Week").set(xlabel="Day of the Week (1=Monday, 7=Sunday)", ylabel="Average Number of Items Sold")

***
## Performing encoding of the data (one-hot encoding) ##

In [None]:
# Count how many rows each (transaction, item) pair has
rows_per_pair = bread.groupby(["Transaction", "Item"]).size()
df = rows_per_pair.reset_index(name="Count")
df.head()

In [None]:
# Pivot to a wide table: one row per transaction, one column per item, each
# cell holding that item's count in the transaction (0 when absent)
count_per_pair = df.groupby(['Transaction', 'Item'])['Count'].sum()
wide_counts = count_per_pair.unstack().reset_index()
basket = wide_counts.fillna(0).set_index('Transaction')
basket.head()

In [None]:
# Spot-check: transactions whose Coffee count is exactly 4, showing the item
# columns at positions 14-27
basket[basket.Coffee == 4].iloc[:,14:28]

In [None]:
# the encoding function
def encode_units(x):
    """Map an item count to a 0/1 presence flag.

    Returns 1 when ``x`` is at least 1 and 0 for everything else, including
    values between 0 and 1 and NaN, so the result is always an int and never
    falls through to ``None``.
    """
    return 1 if x >= 1 else 0


In [None]:
# One-hot encode the basket: 1 where the item appears in the transaction at
# least once, 0 otherwise. The comparison is vectorized over the whole frame,
# which gives the same 0/1 table as applying encode_units cell by cell while
# avoiding the slow per-cell DataFrame.applymap (deprecated in recent pandas).
basket_sets = (basket >= 1).astype(int)
basket_sets.head()

In [None]:
# Spot-check the encoded table: among transactions containing Coffee, show the
# rows at positions 3142-3144 and the item columns at positions 14-27
basket_sets[basket_sets.Coffee == 1].iloc[3142:3145,14:28]

***
## Displaying the frequent itemsets ##
We assume a minimum-support threshold of 1%; itemsets are also mined at 0.5% and 0.1% for comparison.


In [None]:
# Mine frequent itemsets at three minimum-support levels (1%, 0.5% and 0.1%);
# use_colnames=True reports itemsets by item name
frequent_itemsets = apriori(basket_sets, min_support=0.01, use_colnames=True)
frequent_itemsets1 = apriori(basket_sets, min_support=0.005, use_colnames=True)
frequent_itemsets2 = apriori(basket_sets, min_support=0.001, use_colnames=True)

## Displaying association rules ##
Generate the rules with their corresponding support, confidence and lift, using a minimum lift threshold of 1, sorted by descending confidence.

In [None]:
# Build association rules from the 1%-support itemsets using a lift threshold
# of 1, ordered from highest to lowest confidence; show the first 50
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules = rules.sort_values("confidence", ascending=False)
rules[:50]

In [None]:
# Compare, for the three minimum-support settings, how many rules fall at
# each confidence level (rounded to one decimal place).
from collections import Counter
import itertools 


def _rounded_confidence_counts(rule_frame):
    """Round the ``confidence`` column to one decimal and count rules per value.

    The column is rounded in place with a whole-column assignment (no chained
    indexing), and the Counter is built from the rounded values so rules with
    nearly equal confidence land in the same bucket.
    """
    rule_frame["confidence"] = rule_frame["confidence"].round(1)
    return Counter(rule_frame["confidence"])


def _sorted_points(counts):
    """Return (confidences, rule_counts) tuples sorted by confidence.

    Yields two empty tuples when there are no rules, so unpacking never fails.
    """
    pairs = sorted(counts.items())  # sorted by key, a list of (value, count)
    if not pairs:
        return (), ()
    return tuple(zip(*pairs))


# rules1 / rules2 are derived here from the 0.5% and 0.1% itemsets, with the
# same lift threshold that is used for `rules`
rules1 = association_rules(frequent_itemsets1, metric="lift", min_threshold=1)
rules2 = association_rules(frequent_itemsets2, metric="lift", min_threshold=1)

counter = _rounded_confidence_counts(rules)
print(counter)
x, y = _sorted_points(counter)
#-------------------
counter1 = _rounded_confidence_counts(rules1)
print(counter1)
x1, y1 = _sorted_points(counter1)
#-------------------------
counter2 = _rounded_confidence_counts(rules2)
print(counter2)
x2, y2 = _sorted_points(counter2)

plt.plot(x, y,'bo-')
plt.plot(x1, y1,'ko-')
plt.plot(x2, y2,'ro-')

plt.legend(["min-support:1%", "min-support:0.5%","min-support:0.1%"])
plt.savefig('1%')
plt.show()
