Importing & reading in data

In [1]:
from requests import get
from pandas import DataFrame, concat, to_datetime, to_timedelta, json_normalize, notna, isna
from time import time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
from math import ceil
from plotly.graph_objects import Scatter
import plotly.figure_factory as ff
from plotly.express import colors
from dotenv import load_dotenv
from datetime import datetime
from time import time
import json, os

ModuleNotFoundError: No module named 'matplotlib_venn'

In [None]:
# Initial values for the fetch switches. The cache check that follows assigns
# both of them again on every path, so these only make sure the names exist.
fetchSaleData = fetchReturnData = True

# Maximum age, in days, of a locally saved JSON file before it is fetched again.
lastUpdatedInDays = 1

In [None]:
def _needsRefetch(path, label, cutoff, maxAgeDays):
    """Report whether the locally cached JSON at `path` has to be fetched again.

    Prints the same progress messages for either dataset.

    Parameters:
        path: location of the cached JSON file.
        label: dataset name used in the messages ('sale' or 'return').
        cutoff: datetime; a file last modified at or before it counts as stale.
        maxAgeDays: allowed age in days, only used for the "older than" message.

    Returns:
        True when the file is missing or stale, False when it can be reused.
    """
    print('Checking for {} data...'.format(label))

    if not os.path.exists(path):
        print('No local data found!')
        print('Have to fetch again...')
        return True

    savedAt = datetime.fromtimestamp(os.path.getmtime(path))
    print('Found {} data from:'.format(label), savedAt.strftime('%c'))

    # if older than 'lastUpdatedInDays' days, fetch again
    if savedAt <= cutoff:
        print('Locally saved data is older than {} hours!'.format(maxAgeDays*24))
        print('Have to fetch again...')
        return True

    print('Will be using this locally saved data...')
    return False


# current time, and the oldest modification time that still counts as fresh
currentTime = time()
timeToCheckAgainst = datetime.fromtimestamp(currentTime - lastUpdatedInDays*24*60*60)

# exist_ok avoids the separate existence check (and the race between check and create)
os.makedirs('../data files', exist_ok=True)

fetchSaleData = _needsRefetch('../data files/rawSaleData.json', 'sale',
                              timeToCheckAgainst, lastUpdatedInDays)
fetchReturnData = _needsRefetch('../data files/rawReturnData.json', 'return',
                                timeToCheckAgainst, lastUpdatedInDays)

In [None]:
load_dotenv()

# API host taken from the URL variable (environment or .env file).
# NOTE(review): it is concatenated directly with 'sale' / 'return', so it
# presumably has to end with '/' - confirm against the .env file.
domain = os.getenv("URL")

if fetchSaleData:
    # Only a real fetch needs the host; cached runs keep working without it.
    if not domain:
        raise RuntimeError('Environment variable "URL" is not set; cannot fetch sale data.')

    print('Getting Sale data... Please wait!')
    startTime = time()

    # get sale data; fail loudly on an HTTP error instead of caching an error body
    response = get('https://' + domain + 'sale')
    response.raise_for_status()
    saleData = response.json()

    print('Took', round(time() - startTime, 2), 'seconds to get Sale data.')

    # save locally for later reuse
    with open('../data files/rawSaleData.json', 'w') as fp:
        json.dump(saleData, fp, indent=4)
else:
    # use locally saved data
    with open('../data files/rawSaleData.json', 'r') as fp:
        saleData = json.load(fp)

In [None]:
if fetchReturnData:
    # `domain` is read from the URL environment variable in the sale-data cell.
    if not domain:
        raise RuntimeError('Environment variable "URL" is not set; cannot fetch return data.')

    print('Getting Return data... Please wait!')
    startTime = time()

    # get return data; fail loudly on an HTTP error instead of caching an error body
    response = get('https://' + domain + 'return')
    response.raise_for_status()
    returnData = response.json()

    print('Took', round(time() - startTime, 2), 'seconds to get Return data.')

    # save locally for later reuse
    with open('../data files/rawReturnData.json', 'w') as fp:
        json.dump(returnData, fp, indent=4)
else:
    # use locally saved data
    with open('../data files/rawReturnData.json', 'r') as fp:
        returnData = json.load(fp)

# Reading in & cleaning sales data

In [None]:
# Load the raw sale records, discard the fields this analysis does not use
# and give the remaining ones sale-specific names.
saleDF = (
    DataFrame.from_dict(saleData, orient='columns')
    .drop(columns=['scanned_at', 'scanned_at_melbourne_date_time', 'return_id'])
    .rename(columns={
        "scanned_at_melbourne_date": "sale_date",
        "scanned_at_melbourne_time": "sale_time",
        "id": "sale_id",
    })
)

# Join the date and time strings and parse them (day first) into a single
# datetime64 column; the string parts are not needed afterwards.
saleDF['date_time'] = to_datetime(saleDF['sale_date'] + ' ' + saleDF['sale_time'], dayfirst=True)
saleDF = saleDF.drop(columns=['sale_date', 'sale_time'])

# Keep records from 22nd Jan 2020 onwards, oldest first, with a fresh 0..n-1 index.
fromStartDate = saleDF['date_time'] >= '2020/01/22 00:00:00'
saleDF = (
    saleDF.loc[fromStartDate]
    .sort_values(by='date_time', ascending=True)
    .reset_index(drop=True)
)

saleDF.head()

In [None]:
# Week key used for the per-week grouping below. ISO week numbers restart every
# year and the date filter above has no upper bound, so the key combines the ISO
# year with the ISO week (e.g. 202004) to keep weeks of different years apart.
saleIso = saleDF.date_time.dt.isocalendar()
saleDF['week'] = saleIso.year * 100 + saleIso.week
saleDF.head()

In [None]:
# Number of sale records for every (cup, week) pair.
salesPerCupWeek = saleDF.groupby(by=['cup_id', 'week']).size()
groupDF = salesPerCupWeek.reset_index(name='occurrence_per_week')
groupDF.head()

In [None]:
# One row per weekly count, holding the array of distinct cups that reached
# that count in at least one week.
cupIdsByCount = groupDF.groupby('occurrence_per_week')['cup_id']
cupsDF = cupIdsByCount.unique().reset_index()
cupsDF

In [None]:
# Cups sold once / twice within some week. `.iloc[0]` takes the single matching
# row by position, so the lookup no longer depends on which index label that
# row happens to carry in cupsDF.
soldOnce = cupsDF.loc[cupsDF.occurrence_per_week == 1, 'cup_id'].iloc[0]
soldTwice = cupsDF.loc[cupsDF.occurrence_per_week == 2, 'cup_id'].iloc[0]

v = venn2([set(soldOnce.tolist()), set(soldTwice.tolist())],
          set_labels=('Once', 'Twice'),
          set_colors=('cyan', 'green'),
          alpha=0.5
          )
v.get_label_by_id('A').set_color('black')
v.get_label_by_id('B').set_color('black')

for text in v.set_labels:
    text.set_fontsize(16)
for text in v.subset_labels:
    # NOTE(review): matplotlib-venn appears to leave the label of an empty
    # region as None - skipping those avoids an AttributeError; confirm.
    if text is not None:
        text.set_fontsize(16)

plt.show()

In [None]:
# For every cup: in how many weeks it was sold once, twice, ... - one column per
# weekly count, NaN where the cup never reached that count.
occurDF = (
    groupDF
    .groupby(by=['cup_id', 'occurrence_per_week'])
    .size()
    .reset_index(name='num_weeks')
    .pivot(index='cup_id', columns='occurrence_per_week', values='num_weeks')
    .rename_axis(columns=None)
    .reset_index()
    .rename(columns={1: 'once', 2: 'twice'})
)
occurDF.head()

In [None]:
# Which cups have at least one week with exactly one / exactly two sales.
hasOnceWeek = occurDF.once.notna()
hasTwiceWeek = occurDF.twice.notna()

# cups that never had a twice-week
once_only = int((~hasTwiceWeek).sum())
# cups that never had a once-week
twice_only = int((~hasOnceWeek).sum())
# cups that had both a once-week and a twice-week (the overlap of the two sets)
once_or_twice = int((hasOnceWeek & hasTwiceWeek).sum())
print(once_only, twice_only, once_or_twice)

In [None]:
# 'size' is the full size of each set or intersection, while 'label' carries the
# count that belongs to that region alone, so the d3.js Venn diagram can show
# distinct counts per region.
onceTotal = once_only + once_or_twice
twiceTotal = twice_only + once_or_twice

jsonObject = {
    'fromSale': [
        {'sets': ['Once'], 'size': onceTotal, 'label': str(once_only)},
        {'sets': ['Twice'], 'size': twiceTotal, 'label': str(twice_only)},
        {'sets': ['Once','Twice'], 'size': once_or_twice, 'label': str(once_or_twice)},
    ],
    # number of distinct cups across both sets
    'saleUnion': once_only + twice_only + once_or_twice,
    'saleCategories': ['Once', 'Twice'],
}
jsonObject

In [None]:
# Reuse durations: for every cup, the gap in days between each sale and the
# next sale of the same cup. saleDF is already sorted by date_time, so rows
# within a cup are in chronological order.
# The date_time column is addressed by name (not by position) and each cup is
# visited once through groupby instead of re-filtering the whole frame per cup.
days = []
for _, saleTimes in saleDF.groupby('cup_id', sort=False)['date_time']:
    gaps = saleTimes.diff().dropna() / to_timedelta(1, unit='D')
    days.extend(gaps.tolist())

In [None]:
# name of the dataset
group_labels = ['Cups reuse time']

# Largest duration rounded up to a whole day; drives axis range, bin edges and label positions.
max_day = ceil(max(days))

fig = ff.create_distplot([np.array(days)], group_labels, show_rug=False, bin_size=1)

fig.update_layout(
    title="Histogram with probability density curve for cups reuse time (Bin size: 1 day)",
    xaxis_title="Duration in days",
    yaxis_title="Probability density",
    xaxis=dict(range=[-1, max_day + 1], tickmode='linear')
    )

fig.update_traces(marker=dict(line=dict(width=1,
                                        color='Black')),
                  nbinsx=max_day,
                  autobinx=True,
                  selector={'type':'histogram'}
                  )

# One-day bins with edges 0, 1, ..., max_day. The edge at max_day is required:
# without it the last day's bin is missing, its durations are not counted and
# the label arrays below end up with different lengths.
counts, bins = np.histogram(days, bins=range(0, max_day + 1, 1))
# Share of durations in the fullest bin; used as the height of the total-count note.
height = round(max(counts)/sum(counts),2)
percents = [int(round(count/len(days)*100)) for count in counts]

# Total number of durations, written near the right edge of the plot.
fig.add_trace(Scatter(
    x=[max_day],
    y=[height],
    mode="text",
    text='Total durations: {}'.format(len(days)),
    textposition="top left",
    showlegend=False
))

# Percentage of durations per bin, placed at the bin centres slightly above the bar height.
fig.add_trace(Scatter(
    x=np.arange(0.5, max_day, 1),
    y=[round(count/len(days),4)+0.005 for count in counts],
    mode="text",
    text=[str(percent) for percent in percents],
    textposition="top center",
    showlegend=False,
    hoverinfo='skip'
))


fig.show()

In [None]:
bin_days = 2

# name of the dataset
group_labels = ['Cups reuse time']

# Largest duration rounded up to a whole day.
max_day = ceil(max(days))

# Bin edges 0, bin_days, 2*bin_days, ... where the last edge is the first
# multiple of bin_days that reaches max_day. Stopping at max_day + 1 put the
# last edge below the maximum whenever max_day was not a multiple of bin_days,
# which silently dropped the longest durations from the counts.
edges = np.arange(0, max_day + bin_days, bin_days)
centres = edges[:-1] + bin_days / 2

fig = ff.create_distplot([np.array(days)], group_labels, show_rug=False, bin_size=bin_days)

fig.update_layout(
    title="Histogram with probability density curve for cups reuse time (Bin size: 2 days)",
    xaxis_title="Duration in days",
    yaxis_title="Probability density",
    xaxis=dict(range=[-1, max_day + 1], tickmode='linear')
    )

fig.update_traces(marker=dict(line=dict(width=1,
                                        color='Black')),
                  nbinsx=ceil(max(days)/bin_days),
                  autobinx=True,
                  selector={'type':'histogram'}
                  )

counts, bins = np.histogram(days, bins=edges)
# Share of durations in the fullest bin, scaled by the bin width; used as the
# height of the total-count note.
height = round(max(counts)/sum(counts),2)/bin_days
percents = [int(round(count/len(days)*100)) for count in counts]

# Total number of durations, written near the right edge of the plot.
fig.add_trace(Scatter(
    x=[max_day],
    y=[height],
    mode="text",
    text='Total durations: {}'.format(len(days)),
    textposition="top left",
    showlegend=False
))

# Percentage of durations per bin, placed at the bin centres slightly above the bar height.
fig.add_trace(Scatter(
    x=centres,
    y=[round(count/len(days),4)/bin_days +0.005 for count in counts],
    mode="text",
    text=[str(percent) + '%' for percent in percents],
    textposition="top center",
    showlegend=False,
    hoverinfo='skip'
))


fig.show()

In [None]:
# Collapse every duration longer than 30 days onto the single value 30.1, so
# they all land in one final bin of the plots that follow.
days = [30.1 if duration > 30 else duration for duration in days]

In [None]:
# name of the dataset
group_labels = ['Cups reuse time']

# Largest duration rounded up to a whole day; drives axis range, bin edges and label positions.
max_day = ceil(max(days))

fig = ff.create_distplot([np.array(days)], group_labels, show_rug=False, bin_size=1)

fig.update_layout(
    title="Histogram with probability density curve for cups reuse time (Bin size: 1 day)",
    xaxis_title="Duration in days",
    yaxis_title="Probability density",
    xaxis=dict(range=[-1, max_day + 1], tickmode='linear')
    )

fig.update_traces(marker=dict(line=dict(width=1,
                                        color='Black')),
                  nbinsx=max_day,
                  autobinx=True,
                  selector={'type':'histogram'}
                  )

# One-day bins with edges 0, 1, ..., max_day. The edge at max_day is required:
# without it the last bin - which holds every duration that was clipped to
# 30.1 - is missing from the counts and gets no percentage label.
counts, bins = np.histogram(days, bins=range(0, max_day + 1, 1))
# Share of durations in the fullest bin; used as the height of the total-count note.
height = round(max(counts)/sum(counts),2)
percents = [int(round(count/len(days)*100)) for count in counts]

# Total number of durations, written near the right edge of the plot.
fig.add_trace(Scatter(
    x=[max_day],
    y=[height],
    mode="text",
    text='Total durations: {}'.format(len(days)),
    textposition="top left",
    showlegend=False
))

# Percentage of durations per bin, placed at the bin centres slightly above the bar height.
fig.add_trace(Scatter(
    x=np.arange(0.5, max_day, 1),
    y=[round(count/len(days),4)+0.005 for count in counts],
    mode="text",
    text=[str(percent) for percent in percents],
    textposition="top center",
    showlegend=False,
    hoverinfo='skip'
))


fig.show()

In [None]:
bin_days = 2

# name of the dataset
group_labels = ['Cups reuse time']

# Largest duration rounded up to a whole day.
max_day = ceil(max(days))

# Bin edges 0, bin_days, 2*bin_days, ... where the last edge is the first
# multiple of bin_days that reaches max_day. Stopping at max_day + 1 put the
# last edge below the maximum whenever max_day was not a multiple of bin_days
# (e.g. after clipping to 30.1), which dropped those durations from the counts.
edges = np.arange(0, max_day + bin_days, bin_days)
centres = edges[:-1] + bin_days / 2

fig = ff.create_distplot([np.array(days)], group_labels, show_rug=False, bin_size=bin_days)

fig.update_layout(
    title="Histogram with probability density curve for cups reuse time (Bin size: 2 days)",
    xaxis_title="Duration in days",
    yaxis_title="Probability density",
    xaxis=dict(range=[-1, max_day + 1], tickmode='linear')
    )

fig.update_traces(marker=dict(line=dict(width=1,
                                        color='Black')),
                  nbinsx=ceil(max(days)/bin_days),
                  autobinx=True,
                  selector={'type':'histogram'}
                  )

counts, bins = np.histogram(days, bins=edges)
# Share of durations in the fullest bin, scaled by the bin width; used as the
# height of the total-count note.
height = round(max(counts)/sum(counts),2)/bin_days
percents = [int(round(count/len(days)*100)) for count in counts]

# Total number of durations, written near the right edge of the plot.
fig.add_trace(Scatter(
    x=[max_day],
    y=[height],
    mode="text",
    text='Total durations: {}'.format(len(days)),
    textposition="top left",
    showlegend=False
))

# Percentage of durations per bin, placed at the bin centres slightly above the bar height.
fig.add_trace(Scatter(
    x=centres,
    y=[round(count/len(days),4)/bin_days +0.005 for count in counts],
    mode="text",
    text=[str(percent) + '%' for percent in percents],
    textposition="top center",
    showlegend=False,
    hoverinfo='skip'
))


fig.show()

In [None]:
# Export the sale-based reuse durations in ascending order.
# NOTE(review): at this point durations above 30 days have already been
# replaced by 30.1, so the CSV holds the clipped values - confirm this is intended.
days = sorted(days)
saleFrame = DataFrame({'duration': days})
saleFrame.to_csv('../data files/reuseDurationsFromSale.csv', index=False)
saleFrame.head()

# Reading and cleaning Return Data

In [None]:
# build dataframe: json_normalize flattens the nested records and already
# returns a DataFrame, so no further constructor call is needed
returnDF = json_normalize(returnData)

# dropping columns
returnDF = returnDF.drop(['scanned_at', 'bin_id', 'cafe_id', 'dishwasher_id', 'id'], axis=1)

# renaming columns (the dotted names come from flattening the nested date/time object)
returnDF = returnDF.rename(columns={"scanned_at_melbourne_date_time.date": "return_date", "scanned_at_melbourne_date_time.time": "return_time"})

# merging date and time, converting new column to datetime64 datatype
returnDF['date_time_concat'] = returnDF['return_date'] + ' ' + returnDF['return_time']
returnDF['date_time'] = to_datetime(returnDF['date_time_concat'], dayfirst=True)
returnDF = returnDF.drop(['return_date', 'return_time', 'date_time_concat'], axis=1)

# filter to get records post 22nd Jan
returnDF = returnDF[returnDF['date_time'] >= '2020/01/22 00:00:00']
# oldest first, with a fresh 0..n-1 index (same treatment as the sale frame)
returnDF = returnDF.sort_values(by='date_time', ascending = True).reset_index(drop=True)

returnDF.head()

In [None]:
# Week key used for the per-week grouping below. ISO week numbers restart every
# year and the date filter above has no upper bound, so the key combines the ISO
# year with the ISO week (e.g. 202004) to keep weeks of different years apart.
returnIso = returnDF.date_time.dt.isocalendar()
returnDF['week'] = returnIso.year * 100 + returnIso.week
returnDF.head()

In [None]:
# Number of return records for every (cup, week) pair.
returnsPerCupWeek = returnDF.groupby(by=['cup_id', 'week']).size()
groupDF = returnsPerCupWeek.reset_index(name='occurrence_per_week')
groupDF.head()

In [None]:
# One row per weekly count, holding the array of distinct cups that reached
# that count in at least one week.
cupIdsByCount = groupDF.groupby('occurrence_per_week')['cup_id']
cupsDF = cupIdsByCount.unique().reset_index()
cupsDF

In [None]:
def _cupsReturned(timesPerWeek):
    """Return the set of cups returned exactly `timesPerWeek` times in some week.

    The matching cupsDF row is taken by position, so the lookup does not depend
    on the row's index label; a count that never occurs yields an empty set.
    """
    match = cupsDF.loc[cupsDF.occurrence_per_week == timesPerWeek, 'cup_id']
    if len(match) == 0:
        return set()
    return set(match.iloc[0].tolist())


# NOTE(review): weekly counts above 3, if any exist, are not part of this breakdown.
a = _cupsReturned(1)
b = _cupsReturned(2)
c = _cupsReturned(3)

# full size of each set
once = len(a)
twice = len(b)
thrice = len(c)

# cups that belong to exactly one of the sets
once_only = len(a - b - c)
twice_only = len(b - a - c)
thrice_only = len(c - a - b)

# pairwise intersections (each still includes the cups found in all three sets)
once_twice = len(a & b)
twice_thrice = len(b & c)
once_thrice = len(a & c)

# cups found in all three sets
once_twice_thrice = len(a & b & c)

In [None]:
# venn3 takes the sizes of the seven disjoint regions, in the order
# (A only, B only, A&B only, C only, A&C only, B&C only, A&B&C).
# The pairwise counts computed earlier still contain the cups present in all
# three sets, so that overlap is removed here to avoid counting it twice.
once_twice_excl = once_twice - once_twice_thrice
once_thrice_excl = once_thrice - once_twice_thrice
twice_thrice_excl = twice_thrice - once_twice_thrice

v = venn3(subsets=(once_only, twice_only, once_twice_excl, thrice_only,
                   once_thrice_excl, twice_thrice_excl, once_twice_thrice),
      set_labels=('Once', 'Twice', 'Thrice'),
      alpha=0.5
      )
v.get_label_by_id('A').set_color('black')
v.get_label_by_id('B').set_color('black')
v.get_label_by_id('C').set_color('black')

# Only the set labels are enlarged; the region (subset) labels keep their default size.
for text in v.set_labels:
    text.set_fontsize(16)

plt.show()

In [None]:
# 'size' is the full size of each set or intersection, while 'label' is the text
# shown for that region in the d3.js Venn diagram.
# NOTE(review): the single-set labels are exclusive counts, but the pairwise
# labels still include the cups present in all three sets - confirm whether
# they should be exclusive as well.
returnRegions = [
    (['Once'], once, once_only),
    (['Twice'], twice, twice_only),
    (['Thrice'], thrice, thrice_only),
    (['Once','Twice'], once_twice, once_twice),
    (['Twice', 'Thrice'], twice_thrice, twice_thrice),
    (['Once','Thrice'], once_thrice, once_thrice),
    (['Once','Twice', 'Thrice'], once_twice_thrice, once_twice_thrice),
]

jsonObject.update({
    'fromReturn': [
        {'sets': sets, 'size': size, 'label': str(label)}
        for sets, size, label in returnRegions
    ],
    # inclusion-exclusion: number of distinct cups across the three sets
    'returnUnion': (once + twice + thrice) - (once_twice + twice_thrice + once_thrice) + once_twice_thrice,
    'returnCategories': ['Once', 'Twice', 'Thrice'],
})

# Persist the sale and return numbers together for the front end.
with open('../data files/reusePerWeekNumbers.json', 'w') as fp:
    json.dump(jsonObject, fp, indent=4)

jsonObject

In [None]:
# Reuse durations: for every cup, the gap in days between each return and the
# next return of the same cup. returnDF is already sorted by date_time, so rows
# within a cup are in chronological order.
# The date_time column is addressed by name (not by position) and each cup is
# visited once through groupby instead of re-filtering the whole frame per cup.
days = []
for _, returnTimes in returnDF.groupby('cup_id', sort=False)['date_time']:
    gaps = returnTimes.diff().dropna() / to_timedelta(1, unit='D')
    days.extend(gaps.tolist())

In [None]:
# name of the dataset
group_labels = ['Cups reuse time']

# Largest duration rounded up to a whole day; drives axis range, bin edges and label positions.
max_day = ceil(max(days))

fig = ff.create_distplot([np.array(days)], group_labels, show_rug=False, bin_size=1)

fig.update_layout(
    title="Histogram with probability density curve for cups reuse time",
    xaxis_title="Duration in days",
    yaxis_title="Probability density",
    xaxis=dict(range=[-1, max_day + 1], tickmode='linear')
    )

fig.update_traces(marker=dict(line=dict(width=1,
                                        color='Black')),
                  nbinsx=max_day,
                  autobinx=True,
                  selector={'type':'histogram'}
                  )

# One-day bins with edges 0, 1, ..., max_day. The edge at max_day is required:
# without it the last day's bin is missing, its durations are not counted and
# the label arrays below end up with different lengths.
counts, bins = np.histogram(days, bins=range(0, max_day + 1, 1))
# Share of durations in the fullest bin; used as the height of the total-count note.
height = round(max(counts)/sum(counts),2)
percents = [int(round(count/len(days)*100)) for count in counts]

# Total number of durations, written near the right edge of the plot.
fig.add_trace(Scatter(
    x=[max_day],
    y=[height],
    mode="text",
    text='Total durations: {}'.format(len(days)),
    textposition="top left",
    showlegend=False
))

# Percentage of durations per bin, placed at the bin centres slightly above the bar height.
fig.add_trace(Scatter(
    x=np.arange(0.5, max_day, 1),
    y=[round(count/len(days),4)+0.005 for count in counts],
    mode="text",
    text=[str(percent) for percent in percents],
    textposition="top center",
    showlegend=False,
    hoverinfo='skip'
))


fig.show()

In [None]:
# Collapse every duration longer than 30 days onto the single value 30.1, so
# they all land in one final bin of the plot that follows.
days = [30.1 if duration > 30 else duration for duration in days]

In [None]:
# name of the dataset
group_labels = ['Cups reuse time']

# Largest duration rounded up to a whole day; drives axis range, bin edges and label positions.
max_day = ceil(max(days))

fig = ff.create_distplot([np.array(days)], group_labels, show_rug=False, bin_size=1)

fig.update_layout(
    title="Histogram with probability density curve for cups reuse time",
    xaxis_title="Duration in days",
    yaxis_title="Probability density",
    xaxis=dict(range=[-1, max_day + 1], tickmode='linear')
    )

fig.update_traces(marker=dict(line=dict(width=1,
                                        color='Black')),
                  nbinsx=max_day,
                  autobinx=True,
                  selector={'type':'histogram'}
                  )

# One-day bins with edges 0, 1, ..., max_day. The edge at max_day is required:
# without it the last bin - which holds every duration that was clipped to
# 30.1 - is missing from the counts and gets no percentage label.
counts, bins = np.histogram(days, bins=range(0, max_day + 1, 1))
# Share of durations in the fullest bin; used as the height of the total-count note.
height = round(max(counts)/sum(counts),2)
percents = [int(round(count/len(days)*100)) for count in counts]

# Total number of durations, written near the right edge of the plot.
fig.add_trace(Scatter(
    x=[max_day],
    y=[height],
    mode="text",
    text='Total durations: {}'.format(len(days)),
    textposition="top left",
    showlegend=False
))

# Percentage of durations per bin, placed at the bin centres slightly above the bar height.
fig.add_trace(Scatter(
    x=np.arange(0.5, max_day, 1),
    y=[round(count/len(days),4)+0.005 for count in counts],
    mode="text",
    text=[str(percent) for percent in percents],
    textposition="top center",
    showlegend=False,
    hoverinfo='skip'
))


fig.show()

In [None]:
# Export the return-based reuse durations in ascending order.
# NOTE(review): at this point durations above 30 days have already been
# replaced by 30.1, so the CSV holds the clipped values - confirm this is intended.
days = sorted(days)
returnFrame = DataFrame({'duration': days})
returnFrame.to_csv('../data files/reuseDurationsFromReturn.csv', index=False)
returnFrame.head()