In [None]:
import pandas as pd
import numpy as np
import calendar
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

In [None]:
# Calendar table; its 'd' column holds the day codes used as column names in the sales files.
store_calendar = pd.read_csv("calendar_afcs2021.csv", index_col=0)
# Lookup from day code -> calendar index label (later duplicates overwrite earlier ones).
date_converter = {day_code: label for label, day_code in store_calendar['d'].items()}

sell_prices = pd.read_csv("sell_prices_afcs2021.csv", index_col=0)

sample_submission = pd.read_csv("sample_submission_afcs2021.csv", index_col=0)

# Load both sales tables and relabel their columns through the calendar lookup in one step.
train_data = pd.read_csv("sales_train_validation_afcs2021.csv", index_col=0).rename(columns=date_converter)
test_data = pd.read_csv("sales_test_validation_afcs2021.csv", index_col=0).rename(columns=date_converter)

# Column-wise sum over all rows of the training table: one total per column label.
total_sales = train_data.sum()


# What does the data look like?

In [None]:
# Display the training table (columns were relabelled via date_converter when it was loaded).
train_data

In [None]:
# describe() summarises column by column, so flip the table first to get
# one column of summary statistics per row of train_data.
desc = train_data.T.describe()
desc

In [None]:
# Label of the train_data row whose single largest value is the highest of all rows.
desc.loc['max'].idxmax()

# Total sales

In [None]:
# Line plot of the per-column totals, to eyeball the overall level and any outliers.
total_sales.plot()

We see some days where the total sales are almost zero, and one very high peak. Let's explore that.

In [None]:
# Column labels whose total across all rows is below 100 (the near-zero dips in the plot).
very_low_days = total_sales[total_sales < 100].index
print(very_low_days)

Ah, so the store seems to be closed on Christmas Day (25th of December).

In [None]:
# Column labels whose total across all rows exceeds 3000 (threshold meant to isolate the peak).
very_high_day = total_sales[total_sales > 3000].index
print(very_high_day)

In [None]:
# Calendar row(s) for the first of the high-total days found above.
# NOTE(review): very_high_day[0] raises IndexError if no day exceeded the threshold.
store_calendar[store_calendar.index == very_high_day[0]]

Hmmm, it wasn't a really special day, it seems. Maybe there was a massive sale or something like that? We'll look at that some other time.

# What are some special days in the year?

Let's split the dataset by year.

In [None]:
# Re-wrap the totals as a one-column frame ('values') with a DatetimeIndex,
# so that .year / .month can be read off the index in the cells below.
ts_total_sales = pd.DataFrame({'values': total_sales.values}, index=pd.DatetimeIndex(total_sales.index))
ts_total_sales

In [None]:
# One slice of the daily totals per calendar year 2011..2015, keyed by the year number.
total_sales_by_year = {
    year: ts_total_sales[ts_total_sales.index.year == year]
    for year in range(2011, 2016)
}

In [None]:
# Spot-check one of the yearly slices.
total_sales_by_year[2015]

In [None]:
def get_sales_by_month(df):
    """Split a datetime-indexed table into twelve per-month slices.

    Parameters
    ----------
    df : pandas object whose index exposes ``.month`` (e.g. a DatetimeIndex).

    Returns
    -------
    dict
        Keys are the month numbers 1..12 in ascending order; each value is the
        subset of ``df`` whose index falls in that month (empty when no row matches).
    """
    month_of_row = df.index.month
    return {month: df[month_of_row == month] for month in range(1, 13)}

def get_highest_month(sales_by_month):
    """Return the abbreviated name of the month with the largest total sales.

    Parameters
    ----------
    sales_by_month : dict
        Maps a month number (1..12) to the sales rows of that month, as
        produced by ``get_sales_by_month``.

    Returns
    -------
    str
        ``calendar.month_abbr`` entry of the best month; on ties the first
        month in dict order wins.

    Raises
    ------
    ValueError
        If no month contains any rows.
    """
    # np.sum collapses each slice to one scalar regardless of its shape.
    # Months without rows are skipped: they carry no information, and mixing
    # their scalar 0 with array-valued sums used to produce a ragged array.
    totals = {
        month: float(np.sum(month_sales.values))
        for month, month_sales in sales_by_month.items()
        if len(month_sales) > 0
    }
    if not totals:
        raise ValueError("no sales data in any month")
    # Look the name up by the actual dict key rather than by list position + 1,
    # so dicts that do not start at January still resolve correctly.
    return calendar.month_abbr[max(totals, key=totals.get)]

def get_lowest_month(sales_by_month):
    """Return the abbreviated name of the month with the smallest total sales.

    Parameters
    ----------
    sales_by_month : dict
        Maps a month number (1..12) to the sales rows of that month, as
        produced by ``get_sales_by_month``.

    Returns
    -------
    str
        ``calendar.month_abbr`` entry of the weakest month that has data; on
        ties the first month in dict order wins.

    Raises
    ------
    ValueError
        If no month contains any rows.
    """
    # np.sum collapses each slice to one scalar regardless of its shape.
    # Months without rows are skipped so that a month with no observations is
    # never reported as the "lowest", and so that empty slices cannot produce
    # a ragged array of sums.
    totals = {
        month: float(np.sum(month_sales.values))
        for month, month_sales in sales_by_month.items()
        if len(month_sales) > 0
    }
    if not totals:
        raise ValueError("no sales data in any month")
    # Look the name up by the actual dict key rather than by list position + 1,
    # so dicts that do not start at January still resolve correctly.
    return calendar.month_abbr[min(totals, key=totals.get)]

In [None]:
# For every yearly slice, report which month sold the most and which the least.
for year, year_sales in total_sales_by_year.items():
    sales = total_sales_by_month = get_sales_by_month(year_sales)
    hm, lm = get_highest_month(total_sales_by_month), get_lowest_month(total_sales_by_month)
    print("In the year ", year, ", the highest sale month was: ", hm, ". and the lowest was: ", lm, sep="")

# Attempt at forecasting

In [None]:
# Flip the training table so every column is one series and every row one time step,
# then parse the row labels into datetimes.
ts_train_data = train_data.transpose()
ts_train_data.index = pd.to_datetime(ts_train_data.index)
ts_train_data

In [None]:
# Column labels of the transposed table: the identifiers of the series to forecast.
products = list(ts_train_data.columns.values)

In [None]:
def make_forecast(data, product_names, horizon=28, seasonal_periods=7):
    """Fit an additive Holt-Winters model per series and tabulate the forecasts.

    Parameters
    ----------
    data : DataFrame-like
        Indexable by each entry of ``product_names``; ``data[name]`` is handed
        to ``ExponentialSmoothing`` as the observed series.
    product_names : iterable
        Identifiers of the series to forecast, in output row order.
    horizon : int, optional
        Number of steps to forecast for every series (default 28).
    seasonal_periods : int, optional
        Length of the seasonal cycle given to the model (default 7).

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``product_names`` with the columns ``'id'``,
        ``'F1'`` ... ``'F<horizon>'``.
    """
    rows = []
    for name in product_names:
        model = ExponentialSmoothing(
            data[name],
            seasonal_periods=seasonal_periods,
            trend='add',
            seasonal='add',
        )
        fitted = model.fit()
        # Identifier first, then the forecast values in step order.
        rows.append([name] + fitted.forecast(horizon).tolist())

    # Derive the header from the horizon so the two can never drift apart.
    columns = ['id'] + ['F' + str(step) for step in range(1, horizon + 1)]
    return pd.DataFrame(rows, columns=columns)
        
# Forecast every product series; the function defined in this notebook is make_forecast
# (the previous call to make_submission raised NameError on a fresh kernel).
df = make_forecast(ts_train_data, products)
df

In [None]:
# Write the forecast table to disk; index=False leaves the positional row index out of the file.
df.to_csv("submission.csv", index=False)