In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw training data from disk and preview its first 20 rows.
sales = pd.read_csv(filepath_or_buffer='./data/caspecoTrainingData.csv')
sales.head(n=20)

In [None]:
# Calendar of every day (both ends inclusive) each company must have a row for.
dates = pd.date_range('2020-01-01', '2023-01-04', freq='D')

# Min-max scale the sales of each company separately so all series share
# the same 0..1 range.
# NOTE(review): a company whose sales are all identical gives 0/0 = NaN here.
sales['Sales'] = sales.groupby('Company')['Sales'].transform(
    lambda x: (x - x.min()) / (x.max() - x.min()))
sales['Date'] = pd.to_datetime(sales['Date'])

# If a company has no row for a date, add a row with 0 sales.
# The missing (company, date) pairs are found with a single index difference
# and appended with one concat, rather than filtering and copying the whole
# frame once per pair.
observed_pairs = pd.MultiIndex.from_frame(sales[['Company', 'Date']])
expected_pairs = pd.MultiIndex.from_product(
    [sales['Company'].unique(), dates], names=['Company', 'Date'])
missing_pairs = expected_pairs.difference(observed_pairs)
if len(missing_pairs) > 0:
    filler = missing_pairs.to_frame(
        index=False, name=['Company', 'Date']).assign(Sales=0)
    # ignore_index avoids duplicate row labels on the appended rows.
    sales = pd.concat([sales, filler], ignore_index=True)

# Order rows by day, then company.
sales = sales.sort_values(by=['Date', 'Company'])
sales.head(20)

In [None]:
# Draw the sales of companies 0, 1 and 2 on one 30x10 figure, zoomed in on
# 2020-12-01 .. 2021-01-10 and on the lower half of the value range.
daterange = pd.date_range('2020-12-01', '2021-01-10', freq='D')

plt.figure(figsize=(30, 10))
for company in [0, 1, 2]:
    company_rows = sales[sales['Company'] == company]
    plt.plot(company_rows['Date'], company_rows['Sales'])
# Legend entries are matched to the plotted lines by position.
plt.legend(sales['Company'].unique())
plt.xlim(left=daterange[0], right=daterange[-1])
plt.ylim(bottom=0, top=0.5)
# One x tick for every day inside the window.
plt.xticks(daterange, rotation=45)
plt.grid()
plt.show()

In [None]:
# Break the timestamp into separate calendar columns.
date_parts = sales['Date'].dt
calendar_columns = {
    'Day': date_parts.day,
    'Month': date_parts.month,
    'Year': date_parts.year,
    'Weekday': date_parts.weekday,
}
for column_name, column_values in calendar_columns.items():
    sales[column_name] = column_values

sales.tail(30)

In [None]:
holidays = pd.read_csv('./data/Swedish holidays.csv')
holidays['Date'] = pd.to_datetime(holidays['Date'])

# Treat every 24th of December and every 30th of April in 2020-2022 as a
# holiday, in addition to the dates listed in the CSV.
# The rows are added with a single pd.concat because DataFrame.append was
# removed in pandas 2.0.
extra_holidays = pd.DataFrame({
    'Date': pd.to_datetime([
        '2020-12-24', '2021-12-24', '2022-12-24',
        '2020-04-30', '2021-04-30', '2022-04-30',
    ])
})
holidays = pd.concat([holidays, extra_holidays], ignore_index=True)
holidays = holidays.sort_values(by='Date')

# 1 when the row's date appears in the holiday list, otherwise 0.
sales['Holiday'] = sales['Date'].isin(holidays['Date']).astype(int)
sales.head()

# Plot the sales of companies 0, 1 and 2 over the whole period on one very
# wide (400x10) figure, with every holiday marked by a red vertical line.
plt.figure(figsize=(400, 10))
for company in [0, 1, 2]:
    company_rows = sales[sales['Company'] == company]
    plt.plot(company_rows['Date'], company_rows['Sales'])
plt.legend(sales['Company'].unique())
plt.ylim(0, 1)

# Put a tick on every third row, labelled with (month, day, weekday name).
# NOTE(review): presumably one tick per date, which assumes rows are sorted
# by date with exactly three companies per date - confirm.
weekday_names = sales['Weekday'].map(
    {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'})
tick_labels = list(zip(sales['Month'], sales['Day'], weekday_names))[::3]
plt.xticks(sales['Date'][::3], labels=tick_labels, rotation=80)
plt.grid()

# Mark each holiday with a vertical red line.
for holiday in holidays['Date']:
    plt.axvline(holiday, color='r')
plt.show()

In [None]:
# Closed is 1 for rows whose sales value is exactly zero, otherwise 0.
sales['Closed'] = sales['Sales'].eq(0).astype(int)
sales.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Features are every column except the target and the raw timestamp.
# NOTE(review): 'Closed' is computed from the target (Sales == 0) and is still
# part of X, so the model is given information about the value it predicts.
# Confirm this is intended.
X = sales.drop(['Sales', 'Date'], axis=1)
y = sales['Sales']

# The last (days_to_drop + end_date) rows are held out as the test set.
# NOTE(review): 3 * 21 presumably means 3 company rows per day for 21 days,
# which relies on sales being sorted by date with one row per company and
# day - confirm.
days_to_drop = 3 * 21
end_date = 3 * 21
n_test_rows = days_to_drop + end_date
# Every third date of the hold-out rows, used as the x axis of the plots.
dates = sales['Date'][-n_test_rows::3]

train_x, train_y = X[:-n_test_rows], y[:-n_test_rows]
test_x, test_y = X[-n_test_rows:], y[-n_test_rows:]

models = [
    # Fixed seed so feature importances and predictions are reproducible.
    RandomForestRegressor(random_state=0)
]

# One fixed colour per company, shared by the actual and predicted curves.
colors = {0: 'red', 1: 'green', 2: 'blue'}

for model in models:
    model.fit(train_x, train_y)

    pred_y = model.predict(test_x)

    # Rank the features by their importance in the fitted model.
    feature_importance = pd.Series(model.feature_importances_,
                                   index=train_x.columns).sort_values(ascending=False)
    print(feature_importance)

    # One figure per company: actual (solid) versus predicted (dashed) sales.
    for company in [0, 1, 2]:
        # A plain numpy mask indexes both the pandas target and the numpy
        # prediction array by position.
        is_company = (test_x['Company'] == company).to_numpy()
        plt.figure(figsize=(30, 10))
        company_sales = test_y[is_company]
        plt.plot(dates, company_sales, label=f'{company} actual', color=colors[company])
        plt.plot(dates, pred_y[is_company],
                 label=f'{company} predicted', linestyle='--', color=colors[company])
        plt.xticks(dates, rotation=45)
        plt.grid()
        plt.legend()
