In [None]:
import pandas as pd
import numpy as np
import calendar
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as spc

import warnings
import itertools
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')
import pandas as pd
import statsmodels.api as sm
import matplotlib

from pylab import rcParams

In [None]:
# Seed NumPy's global random generator so the random product sample drawn
# further down with np.random.randint is the same on every fresh run.
np.random.seed(7)

In [None]:
def _load_sales_frame(csv_path, column_to_date):
    """Read a sales CSV and return it with one row per date.

    The CSV's first column becomes the index, the remaining column labels are
    renamed through ``column_to_date``, the frame is transposed, and the
    resulting row index is converted to datetimes.
    """
    frame = pd.read_csv(csv_path, index_col=0).rename(columns=column_to_date).T
    frame.index = pd.to_datetime(frame.index)
    return frame


store_calendar = pd.read_csv("calendar_afcs2021.csv", index_col=0)
# Lookup from the values of the calendar's 'd' column to the calendar's index
# labels; used to relabel the sales columns before transposing.
date_converter = dict(zip(store_calendar['d'], store_calendar.index))

sell_prices = pd.read_csv("sell_prices_afcs2021.csv", index_col=0)

sample_submission = pd.read_csv("sample_submission_afcs2021.csv", index_col=0)

train_data = _load_sales_frame("sales_train_validation_afcs2021.csv", date_converter)

test_data = _load_sales_frame("sales_test_validation_afcs2021.csv", date_converter)

# Sum across all columns for every row (date) of the training frame.
total_sales = train_data.sum(axis=1)

In [None]:
# Display the calendar table that was read from calendar_afcs2021.csv.
store_calendar

# What does the data look like?

In [None]:
# Display the training frame (rows are dates, columns are the series ids after
# the transpose done at load time).
train_data

In [None]:
# Per-column summary statistics of the training frame; kept in `desc` because
# the next cell looks up its 'max' row.
desc = train_data.describe()
desc

In [None]:
# Label of the column whose 'max' entry in the describe() summary is the largest.
desc.loc['max'].idxmax()

In [None]:
# Plot the square root of one product's series over time; the square root
# compresses large values so the rest of the series stays readable.
np.sqrt(train_data['FOODS_3_230_TX_3_validation']).plot()

In [None]:
# Histogram of the values in a single product column.
train_data['FOODS_3_021_TX_3_validation'].hist()



In [None]:
# Draw four random product numbers in [100, 800) and show the distribution of
# each product's column in a 2x2 grid.
# NOTE(review): assumes every drawn number corresponds to an existing
# 'FOODS_3_<n>_TX_3_validation' column - a missing one raises KeyError.
sample = np.random.randint(100, 800, 4)
fig, axes = plt.subplots(2, 2)
# axes.flat walks the grid row by row, pairing one axis with one sampled number.
for ax, product_number in zip(axes.flat, sample):
    ax.set_title(f'Sample of product: {product_number}')
    column_name = f'FOODS_3_{product_number}_TX_3_validation'
    train_data[column_name].hist(ax=ax)
    ax.set(xlabel='n products sold in a day', ylabel='n occurrences in data')
fig.tight_layout(pad=1.0)
plt.savefig('figures/sale_dist.png')
plt.show()

# Total sales

In [None]:
# Line plot of the summed sales per date, saved to disk and shown inline.
fig, ax = plt.subplots()
total_sales.plot(ax=ax)
ax.set_xlabel('date (day)')
ax.set_ylabel('amount of products sold')
plt.savefig('figures/total_sales.png')
plt.show()

We see some days where the total sales are almost zero, and one very high peak. Let's explore that.

In [None]:
# Dates on which fewer than 100 units were sold in total.
is_very_low = total_sales < 100
very_low_days = total_sales.loc[is_very_low].index
print(very_low_days)

Ah, so the store seems to be closed on Christmas Day (25th of December).

In [None]:
# Dates on which more than 3000 units were sold in total.
is_very_high = total_sales > 3000
very_high_day = total_sales.loc[is_very_high].index
print(very_high_day)

Hmm, it doesn't seem to have been a particularly special day. Maybe there was a massive sale or something like that? We'll look at that some other time.

In [None]:
# Use a wide default figure size from here on (this changes the global rcParams).
rcParams['figure.figsize'] = (18, 8)

# Narrow the series in two steps: first everything before 2014, then, of
# that, everything after 2012 - which leaves the calendar year 2013.
before = total_sales.loc[total_sales.index.year < 2014]
after = before.loc[before.index.year > 2012]

# Additive decomposition of the remaining series; the figure with its
# components is saved to disk and shown inline.
decomposition = sm.tsa.seasonal_decompose(after, model='additive')
fig = decomposition.plot()
plt.savefig('figures/total_sales_decomp.png')
plt.show()

In [None]:
# Inspect the seasonal component of the decomposition computed above.
decomposition.seasonal

# What are some special days in the year?

Let's split the data set by year.

In [None]:
# One entry per calendar year 2011..2015, each holding the slice of the total
# sales series whose dates fall in that year.
total_sales_by_year = {
    year: total_sales[total_sales.index.year == year]
    for year in range(2011, 2016)
}

In [None]:
# Spot-check the slice stored for 2015.
total_sales_by_year[2015]

In [None]:
def get_sales_by_month(df):
    """Split a date-indexed pandas object into its twelve calendar months.

    Parameters
    ----------
    df : pandas Series or DataFrame
        Must have an index that exposes ``.month`` (e.g. a DatetimeIndex).

    Returns
    -------
    dict
        Keys are the month numbers 1..12 (in that order); each value is the
        subset of ``df`` whose index falls in that month. Months without any
        rows map to an empty subset.
    """
    row_months = df.index.month
    return {month: df[row_months == month] for month in range(1, 13)}

def _months_and_totals(sales_by_month):
    """Return the mapping's month keys and the matching array of summed sales."""
    months = list(sales_by_month.keys())
    totals = np.array([sales_by_month[month].values.sum() for month in months])
    return months, totals


def get_highest_month(sales_by_month):
    """Return the abbreviated name of the month with the largest total sales.

    Parameters
    ----------
    sales_by_month : dict
        Maps a month number (1..12) to a pandas object holding that month's
        sales, as produced by ``get_sales_by_month``.

    Returns
    -------
    str
        Entry of ``calendar.month_abbr`` for the winning month. On ties the
        first month in the mapping's order wins.
    """
    months, totals = _months_and_totals(sales_by_month)
    # Translate the winning position back to its month key instead of assuming
    # the keys are exactly 1..12 in order.
    return calendar.month_abbr[months[totals.argmax()]]


def get_lowest_month(sales_by_month):
    """Return the abbreviated name of the month with the smallest total sales.

    Parameters
    ----------
    sales_by_month : dict
        Maps a month number (1..12) to a pandas object holding that month's
        sales, as produced by ``get_sales_by_month``.

    Returns
    -------
    str
        Entry of ``calendar.month_abbr`` for the losing month. On ties the
        first month in the mapping's order wins. A month with no rows sums to
        zero and can therefore be reported as the lowest.
    """
    months, totals = _months_and_totals(sales_by_month)
    # Same key lookup as in get_highest_month, using the smallest total.
    return calendar.month_abbr[months[totals.argmin()]]

In [None]:
# For every year, report which month sold the most and which the least.
for year, year_sales in total_sales_by_year.items():
    total_sales_by_month = get_sales_by_month(year_sales)
    sales = total_sales_by_month  # second name for the same dict, as before
    hm = get_highest_month(total_sales_by_month)
    lm = get_lowest_month(total_sales_by_month)
    print(f"In the year {year}, the highest sale month was: {hm}. and the lowest was: {lm}")

# Preprocessing the days

In [None]:
daysinweek = 7
# Place the 'wday' value on a circle with period `daysinweek` and store its
# sine and cosine, so the last and first day of the week end up adjacent.
wday_angle = 2 * np.pi * store_calendar['wday'] / daysinweek
store_calendar['sin_wday'] = np.sin(wday_angle)
store_calendar['cos_wday'] = np.cos(wday_angle)

In [None]:
monthsinyear = 12
# Same circular encoding for the 'month' value, with a period of
# `monthsinyear`.
month_angle = 2 * np.pi * store_calendar['month'] / monthsinyear
store_calendar['sin_month'] = np.sin(month_angle)
store_calendar['cos_month'] = np.cos(month_angle)

In [None]:
# Scatter each sine/cosine pair side by side; the points of a cyclical
# encoding lie on the unit circle.
fig, axes = plt.subplots(1, 2)
panels = [
    ('sin_wday', 'cos_wday', 'Transformed days of the week'),
    ('sin_month', 'cos_month', 'Transformed months of the year'),
]
for ax, (sin_col, cos_col, title) in zip(axes, panels):
    ax.scatter(store_calendar[sin_col], store_calendar[cos_col])
    ax.set_title(title)

plt.savefig('figures/day_dist.png')
plt.show()

In [None]:
# Write the calendar, including the sine/cosine columns added above, to a new
# CSV file in the working directory.
store_calendar.to_csv("Calendar_with_cycled_days.csv")

# Unsupervised Clustering

In [None]:
# `train_data` was already transposed when it was loaded: its rows are dates
# and its columns are product ids. Transposing it a second time would put the
# product ids on the index, so converting that index to datetimes fails and
# the per-year filtering on `ts_train_data.index.year` further down cannot
# work. Take a copy in the existing orientation instead.
ts_train_data = train_data.copy()
# No-op when the index is already a DatetimeIndex; kept as a cheap guard.
ts_train_data.index = pd.to_datetime(ts_train_data.index)
ts_train_data

In [None]:
# Split the per-product frame by calendar year and relabel every row with a
# year-independent "month-day" string (e.g. "1-29"), so the same calendar day
# of different years shares one label and the years can be added together.
all_sales_by_year = dict()
for year in range(2011, 2016):
    # Explicit copy so that relabelling the rows never touches ts_train_data.
    year_data = ts_train_data[ts_train_data.index.year == year].copy()
    year_data.index = [f"{stamp.month}-{stamp.day}" for stamp in year_data.index]
    all_sales_by_year[year] = year_data

In [None]:
# Add the yearly frames together, aligned on their "month-day" row labels.
# fill_value=0 makes a day that is missing from one of the two frames count as
# zero instead of turning the sum into NaN.
yearly_frames = iter(all_sales_by_year.values())
total = next(yearly_frames)
for yearly_frame in yearly_frames:
    total = total.add(yearly_frame, fill_value=0)

In [None]:
# Display the frame holding the sales summed over all years per "month-day".
total

In [None]:
# Turn the "month-day" row labels into real dates by attaching a dummy year.
# 2000 is a leap year, so a "2-29" label remains a valid date.
# The isinstance guard keeps the cell safe to re-run: once the index holds
# datetimes, prefixing "2000-" again would fail.
if not isinstance(total.index, pd.DatetimeIndex):
    total.index = pd.to_datetime(["2000-" + label for label in total.index])
# The rows were ordered by their string labels until now, which is not
# guaranteed to be calendar order; sort them chronologically.
total = total.sort_index()

In [None]:
# Line plot of one product's values in the summed-over-years frame.
total["FOODS_3_420_TX_3_validation"].plot()

In [None]:
# Pairwise correlation between all columns of `total`.
total.corr()

In [None]:
# Correlation matrix of all columns as a plain NumPy array.
corr = total.corr().to_numpy()

# Condensed pairwise distances between the rows of the correlation matrix
# (pdist's default metric), followed by complete-linkage clustering.
pdist = spc.distance.pdist(corr)
linkage = spc.linkage(pdist, method='complete')
# Cut the tree at half of the largest pairwise distance to get flat cluster ids.
cut_height = 0.5 * pdist.max()
idx = spc.fcluster(linkage, t=cut_height, criterion='distance')

In [None]:
# Display the linkage matrix returned by spc.linkage in the previous cell.
linkage

In [None]:
# Restrict the analysis to the block of columns between the two labels below
# (label slicing includes both end points) and correlate those columns.
first_column = "FOODS_3_001_TX_3_validation"
last_column = "FOODS_3_100_TX_3_validation"
df = total.loc[:, first_column:last_column]
correlations = df.corr()

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform

# Turn correlation into a dissimilarity: strongly (anti-)correlated columns get
# a value near 0, uncorrelated columns a value near 1.
dissimilarity = 1 - correlations.abs()
# squareform converts the square matrix into the condensed form that linkage
# expects; the tree is built with complete linkage.
Z = linkage(squareform(dissimilarity), method='complete')

# Draw the tree with the column names as leaf labels, rotated to stay readable.
fig, ax = plt.subplots(figsize=(12, 5))
dendrogram(Z, labels=df.columns, orientation='top',
           leaf_rotation=90, ax=ax);