In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Training data; later cells read its ItemCode, DateID, CategoryCode and DailySales columns.
df = pd.read_csv("/kaggle/input/data-storm-semifinals/train_data.csv")


In [4]:
# Promotion records; later cells read its ItemCode, PromotionEndDate, DiscountType,
# DiscountValue and SellingPrice columns.
promo_df = pd.read_csv("/kaggle/input/data-storm-semifinals/promotion_data.csv")
promo_df

In [256]:
# Limit DataFrame displays to 10 rows so the whole-frame outputs in this notebook stay short.
pd.set_option('display.max_rows', 10)

In [5]:
# Every calendar day of the training window (both end points included).
training_range = pd.date_range(start='2021-10-01', end='2022-02-17')
# Week-ending Thursdays that fall inside the training window.
training_week_range = pd.date_range(start='2021-10-01', end='2022-02-17', freq='W-THU')
# Week-ending Thursdays of the forecast horizon.
testing_range = pd.date_range(start='2022-02-24', end='2022-03-17', freq='W-THU')
# Week label ('w1' .. 'w4') -> week-ending date of that forecast week.
test_weeks = dict(zip(('w1', 'w2', 'w3', 'w4'), testing_range))

In [6]:
def week_of_month(sunday):
    """Return the 1-based seven-day bucket of the month that a date falls in.

    Days 1-7 map to 1, days 8-14 to 2, and so on up to 5 for days 29-31.
    Only the ``.day`` attribute of ``sunday`` is read.
    """
    full_weeks_before, _ = divmod(sunday.day - 1, 7)
    return full_weeks_before + 1

In [7]:
def promo_discount(row):
    """Express a promotion row's discount as a percentage of the selling price.

    'Percentage' rows already hold the percentage in DiscountValue; 'Amount'
    rows are converted using SellingPrice. Any other DiscountType yields None.
    """
    kind = row.DiscountType
    if kind == 'Percentage':
        return row.DiscountValue
    if kind == 'Amount':
        return row.DiscountValue * 100 / row.SellingPrice
    return None
    
def promo_amount(row):
    """Express a promotion row's discount as an absolute amount off the selling price.

    'Amount' rows already hold the amount in DiscountValue; 'Percentage' rows
    are converted using SellingPrice. Any other DiscountType yields None.
    """
    kind = row.DiscountType
    if kind == 'Percentage':
        return row.SellingPrice * row.DiscountValue / 100
    if kind == 'Amount':
        return row.DiscountValue
    return None

In [8]:
def get_promos(item_code):
    """Return the promotions of one item, indexed by promotion end date.

    Parameters
    ----------
    item_code
        Value matched against the ``ItemCode`` column of the global ``promo_df``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``promo_df`` with ``PromotionEndDate`` parsed to
        datetimes and used as the (sorted) index, plus two derived columns:
        ``discount`` (from ``promo_discount``) and ``amount`` (from
        ``promo_amount``). When the item has no promotions, an empty frame
        with an empty ``DatetimeIndex`` is returned and the derived columns
        are absent.
    """
    item_promos = promo_df.loc[promo_df['ItemCode'] == item_code]
    if item_promos.empty:
        return item_promos.set_index(pd.to_datetime([]))

    # Work on an explicit copy: the row filter above yields a slice of the
    # global promo_df, and assigning columns on a slice is a chained
    # assignment (SettingWithCopyWarning, hidden here by the warnings filter).
    item_promos = item_promos.copy()
    item_promos['PromotionEndDate'] = pd.to_datetime(item_promos['PromotionEndDate'])
    # Passing the Series (not the column name) keeps PromotionEndDate as a column too.
    item_promos = item_promos.set_index(item_promos['PromotionEndDate'])
    item_promos['discount'] = item_promos.apply(promo_discount, axis = 1)
    item_promos['amount'] = item_promos.apply(promo_amount, axis = 1)
    return item_promos.sort_index()

In [9]:
# Item code -> that item's promotions whose end date lies inside the test window;
# filled as a side effect of data_for_item and read by train_predict.
test_discounts = {}

In [10]:
def data_for_item(item_code):
    """Build the weekly training table for one item.

    Steps:
      1. Take the item's rows from the global ``df``, index them by the
         parsed ``DateID`` and keep only the remaining sales column(s).
      2. Add a zero row for every day of ``training_range`` with no record.
      3. Sum into weeks ending on Thursday (``W-THU``) and rename
         ``DailySales`` to ``WeeklySales``.
      4. Add lag features via ``create_lag`` plus ``month`` and ``week``
         (``week_of_month``) calendar features.
      5. Add ``discount_perc`` / ``discount_amount`` for weeks whose ending
         date equals a promotion end date of the item (0 otherwise).

    Side effect: stores the item's promotions that end inside the test window
    in the global ``test_discounts[item_code]``.

    Parameters
    ----------
    item_code
        Value matched against the ``ItemCode`` column of ``df``.

    Returns
    -------
    tuple
        ``(weekly_sales, train_x, train_y)`` where ``train_x`` is
        ``weekly_sales`` without the ``WeeklySales`` column and ``train_y`` is
        that column.
    """
    # Explicit copy: the row filter returns a slice of the global df, and the
    # column assignment below would otherwise be a chained assignment.
    filtered_df = df.loc[df['ItemCode'] == item_code].copy()
    filtered_df["DateID"] = pd.to_datetime(filtered_df["DateID"], format = "%m/%d/%Y")
    filtered_df = (filtered_df
                   .set_index("DateID")
                   .drop(columns = ['ItemCode', 'CategoryCode']))

    # Add all missing training days in one concat instead of enlarging the
    # frame one row at a time.
    missing_days = training_range.difference(filtered_df.index)
    if len(missing_days):
        zero_rows = pd.DataFrame(0, index = missing_days, columns = filtered_df.columns)
        filtered_df = pd.concat([filtered_df, zero_rows]).rename_axis("DateID")
    filtered_df = filtered_df.sort_index()

    weekly_sales = (filtered_df
                    .groupby(pd.Grouper(freq='W-THU')).sum()
                    .rename(columns = {'DailySales': 'WeeklySales'}))
    weekly_sales = create_lag(weekly_sales)
    weekly_sales['month'] = weekly_sales.index.month
    weekly_sales['week'] = weekly_sales.index.map(week_of_month)

    item_promos = get_promos(item_code)
    discounts = []
    amounts = []
    for week in weekly_sales.index:
        if week in item_promos.index:
            # .loc[[week]] always yields a frame, so a duplicated end date
            # cannot leak a Series into the feature lists.
            # NOTE(review): when several promotions end on the same date the
            # last one in index order is used -- confirm the intended tie-break.
            row = item_promos.loc[[week]].iloc[-1]
            discounts.append(row.discount)
            amounts.append(row.amount)
        else:
            discounts.append(0)
            amounts.append(0)
    weekly_sales['discount_perc'] = discounts
    weekly_sales['discount_amount'] = amounts

    # Label-based slice on the sorted end-date index; both bounds inclusive.
    test_discounts[item_code] = item_promos.loc[testing_range[0]: testing_range[-1]]

    train_x = weekly_sales.loc[:, weekly_sales.columns != 'WeeklySales']
    train_y = weekly_sales['WeeklySales']
    return weekly_sales, train_x, train_y
    

In [11]:
def create_lag(df3, n_lags = 2):
    """Append lagged copies of ``WeeklySales`` as feature columns.

    Parameters
    ----------
    df3 : pandas.DataFrame
        Frame with a ``WeeklySales`` column; it is not modified.
    n_lags : int, default 2
        Number of lag columns to create. Column ``lag-k`` holds
        ``WeeklySales`` shifted down by ``k`` rows; columns are ordered from
        the largest lag to ``lag-1``.

    Returns
    -------
    pandas.DataFrame
        ``df3`` followed by the lag columns, with every row that contains a
        missing value dropped (this removes the first ``n_lags`` rows, whose
        lags are undefined).
    """
    lag_columns = {
        'lag-' + str(k): df3.WeeklySales.shift(k)
        for k in range(n_lags, 0, -1)
    }
    # Pin the index so the result keeps df3's index even when n_lags is 0.
    lagged = pd.DataFrame(lag_columns, index = df3.index)
    return pd.concat([df3, lagged], axis=1).dropna()

In [12]:
# Build the feature table for one sample item. The global train_x created here is
# also what later cells use to size the feature_importance accumulator.
weekly_sales, train_x, train_y = data_for_item(124954)
print("Feature data for item 124954")
weekly_sales

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Default promotion flags for the forecast horizon: one 'promo' value per test week,
# all 0. Sized from testing_range rather than a hard-coded list of four zeros, so the
# frame stays valid if the horizon changes.
no_promo = pd.DataFrame({'promo': 0}, index = testing_range)
no_promo

In [None]:
# Under-forecast amounts (max(0, actual - predicted)) on the training data, appended
# by every train_predict call and consumed by add_buffer.
all_under_errors = []

In [None]:
# Running sum of per-model feature importances, one slot per column of train_x;
# train_predict adds to it on every call.
feature_importance = np.zeros(train_x.columns.shape)

In [None]:
def train_predict(item_code, weekly_sales, train_x, train_y, promos = no_promo):
    """Fit a random forest on one item's weekly table and forecast the test weeks.

    The forecast is recursive: each test week is predicted from the two most
    recent weekly values (actual or previously predicted), the calendar
    features of the week, and the item's promotion discount when ``promos``
    flags the week.

    Side effects: adds the fitted model's feature importances to the global
    ``feature_importance`` and appends the training under-forecast amounts to
    the global ``all_under_errors``.

    Parameters
    ----------
    item_code
        Key into the global ``test_discounts`` for promotion lookups.
    weekly_sales : pandas.DataFrame
        Weekly table as returned by ``data_for_item``; not modified in place.
    train_x, train_y
        Features and target used to fit the model.
    promos : pandas.DataFrame
        One row per date of ``testing_range``; a truthy first column marks the
        week as promoted.

    Returns
    -------
    tuple
        ``(results, train_error, buffer)``: the rounded predictions for each
        test week, the training error from ``accuracy`` and the buffer from
        ``under_forecast_buffer``.

    Raises
    ------
    KeyError
        If a week is flagged as promoted but ``test_discounts[item_code]`` has
        no promotion ending on that date.
    """
    global feature_importance

    model = RandomForestRegressor(random_state = 10)
    fit = model.fit(train_x, train_y)
    feature_importance += model.feature_importances_

    train_pred = fit.predict(train_x)
    train_error = accuracy(train_y.values, train_pred)
    buffer = under_forecast_buffer(train_y.values, train_pred)
    all_under_errors.extend(np.maximum(0, train_y - train_pred))

    results = []
    for date in testing_range:
        # The last row is either the final training week or the previous forecast.
        prev_row = weekly_sales.iloc[-1]
        test_x = pd.DataFrame({'lag-2': [prev_row['lag-1']],
                               'lag-1':[prev_row['WeeklySales']],
                               'month': [date.month],
                               'week': [week_of_month(date)],
                               }, index = [date])
        # .iloc[0] reads the first column by position explicitly.
        if promos.loc[date].iloc[0]:
            # .loc[[date]] always yields a frame, so a duplicated end date
            # cannot produce Series-valued features; the last match is used.
            promo_row = test_discounts[item_code].loc[[date]].iloc[-1]
            discount = promo_row.discount
            amount = promo_row.amount
        else:
            discount = amount = 0
        test_x['discount_perc'] = [discount]
        test_x['discount_amount'] = [amount]

        prediction = round(fit.predict(test_x)[0])
        test_x['WeeklySales'] = [prediction]
        # DataFrame.append was removed in pandas 2.0; concat aligns on column names.
        weekly_sales = pd.concat([weekly_sales, test_x])
        results.append(prediction)

    return results, train_error, buffer
        

In [None]:
def predict_for_one_item(item_code, promos = no_promo):
    """Prepare one item's data, train on it and forecast the test weeks.

    Prints the training error and returns ``(result, buffer)`` as produced by
    ``train_predict``.
    """
    item_data = data_for_item(item_code)
    result, error, buffer = train_predict(item_code, *item_data, promos)
    print("Item {} Training error: {}".format(item_code, error))
    return result, buffer

In [None]:
def accuracy(actual, pred):
    """Return the total absolute error as a percentage of total actual sales.

    Despite the name this is an error measure: 0 means a perfect forecast.
    """
    absolute_error = np.abs(actual - pred)
    total_error = np.sum(absolute_error)
    total_actual = np.sum(actual)
    return total_error * 100 / total_actual

In [None]:
def under_forecast_mape(actual, pred):
    """Return the total under-forecast as a percentage of total actual sales.

    Only shortfalls (actual above prediction) count; over-forecasts contribute 0.
    """
    shortfall = np.maximum(0, actual - pred)
    total_shortfall = np.sum(shortfall)
    return total_shortfall * 100 / np.sum(actual)

In [None]:
def under_forecast_buffer(actual, pred):
    """Return a safety buffer sized from the under-forecast errors.

    The buffer is the mean plus two standard deviations of the strictly
    positive shortfalls ``actual - pred``.

    Returns
    -------
    float
        ``mean + 2 * std`` of the positive shortfalls, or ``0.0`` when nothing
        was under-forecast. Without that guard the mean of an empty array is
        NaN, and callers that round ``prediction + buffer`` would fail.
    """
    under_errors = np.maximum(0, actual - pred)
    under_errors = under_errors[under_errors != 0]
    if len(under_errors) == 0:
        return 0.0
    under_avg = np.mean(under_errors)
    under_std = np.std(under_errors)
    return under_avg + 2*under_std

In [None]:
# Toy example: compare both error measures before and after adding the buffer to the
# predictions, then plot actuals, predictions and buffered predictions together.
actual = np.array([10,2,3,4])
pred = np.array([4,3,2,1])
buf = under_forecast_buffer(actual, pred)
print(under_forecast_mape(actual, pred))
print(under_forecast_mape(actual, pred + buf))
print(accuracy(actual,pred))
print(accuracy(actual, pred + buf))
plt.plot(actual)
plt.plot(pred)
plt.plot(pred+buf)

In [None]:
# Smoke test on a single item with the default (no promotion) flags;
# displays the (predictions, buffer) pair.
predict_for_one_item(1066570)

In [None]:
def get_test_promos(testdf, item_code):
    """Return the per-week promotion flags of one item for the test horizon.

    Starts from a copy of ``no_promo`` and, for every row of ``testdf`` that
    belongs to ``item_code``, writes the row's ``OnPromo`` value at the date
    that ``test_weeks`` maps the row's ``Week`` label to.
    """
    item_rows = testdf.loc[testdf.ItemCode == item_code]
    promos = no_promo.copy()
    for week_label, on_promo in zip(item_rows.Week, item_rows.OnPromo):
        promos.loc[test_weeks[week_label]] = [on_promo]
    return promos
        

In [None]:
def predict_for_test(path):
    """Forecast every item listed in a validation/test CSV.

    Reads the file at ``path``, runs ``predict_for_one_item`` once per unique
    ``ItemCode`` and adds two columns to the frame: ``PredictedSales`` (the
    forecast for the row's week) and ``BufferPredict`` (the forecast plus the
    item's buffer, rounded). When the file has a ``WeeklySales`` column the
    overall error from ``accuracy`` is printed.

    Returns the frame with the two added columns.
    """
    val_df = pd.read_csv(path)

    # item code -> {week label -> [prediction, prediction + buffer]}
    results = {}
    for item_code in val_df.ItemCode.unique():
        item_promos = get_test_promos(val_df, item_code)
        sales, buf = predict_for_one_item(item_code, item_promos)
        results[item_code] = {
            'w' + str(k + 1): [sales[k], round(sales[k] + buf)]
            for k in range(4)
        }

    def lookup(row, position):
        # position 0 is the plain forecast, position 1 the buffered forecast
        return results[row.ItemCode][row.Week][position]

    val_df['PredictedSales'] = val_df.apply(lambda row: lookup(row, 0), axis = 1)
    val_df['BufferPredict'] = val_df.apply(lambda row: lookup(row, 1), axis = 1)

    if 'WeeklySales' in val_df:
        print("Accuracy:", accuracy(val_df.WeeklySales, val_df.PredictedSales))
    return val_df
    

In [None]:
def add_buffer(val_df):
    """Evaluate a single buffer derived from all items' training under-errors.

    The buffer is the mean plus two standard deviations of the non-zero values
    in the global ``all_under_errors`` (0.0 when there are none, so the result
    never turns into NaN). It is added to every prediction and both the plain
    and the buffered forecasts are scored.

    Parameters
    ----------
    val_df : pandas.DataFrame
        Frame with ``WeeklySales`` and ``PredictedSales`` columns.

    Returns
    -------
    pandas.DataFrame
        The two-row summary built by ``error_summary``
        ('Prediction', 'Prediction + Buffer').
    """
    print("Add buffer considering all the items at once")
    actual = val_df.WeeklySales
    pred = val_df.PredictedSales

    under_errors = np.array([i for i in all_under_errors if i != 0])
    if under_errors.size:
        buffer = np.mean(under_errors) + 2*np.std(under_errors)
    else:
        buffer = 0.0

    # Reuse the shared summary builder instead of duplicating its table code.
    return error_summary(actual, pred, pred + buffer)

In [None]:
def error_summary(actual, pred, buf_pred):
    """Tabulate both error measures for the plain and the buffered forecast.

    Returns a frame with rows 'Prediction' and 'Prediction + Buffer' and
    columns 'MAPE' (from ``accuracy``) and 'UnderErrorMAPE' (from
    ``under_forecast_mape``).
    """
    errors = pd.DataFrame(columns = ['MAPE', 'UnderErrorMAPE'])
    labelled_forecasts = (('Prediction', pred), ('Prediction + Buffer', buf_pred))
    for label, forecast in labelled_forecasts:
        errors.loc[label] = [accuracy(actual, forecast), under_forecast_mape(actual, forecast)]
    return errors

In [None]:
# Forecast the validation file; this also appends to all_under_errors and
# feature_importance through train_predict.
val_df = predict_for_test('/kaggle/input/data-storm-semifinals/validation_data.csv')

In [None]:
# Validation rows with the PredictedSales and BufferPredict columns added.
val_df

In [None]:
# Score the validation forecast with one buffer shared by all items.
add_buffer(val_df)

In [None]:
# Score the validation forecast with the per-item buffers stored in BufferPredict.
error_summary(val_df.WeeklySales, val_df.PredictedSales, val_df.BufferPredict)

In [None]:
# Promotions of item 836152 whose end date lies between the first and last test week.
get_promos(836152)['2022-02-24':'2022-03-17']

In [None]:
# Forecast the test file; predict_for_test prints an overall error only when the
# file has a WeeklySales column.
test_df = predict_for_test('/kaggle/input/data-storm-semifinals/test_data.csv')

In [None]:
# Test rows with the PredictedSales and BufferPredict columns added.
test_df

In [None]:
# The submission keeps the plain forecast only, so the buffered column is dropped
# before the frame is written without its row index.
submission_df = test_df.drop(columns = "BufferPredict")
submission_df.to_csv('submission4.csv', index = False)

In [None]:
# Frame that was written to submission4.csv.
submission_df

In [None]:
def data_for_item_sales(item_code):
    """Return the daily and the weekly sales tables of one item (for plotting).

    Same preparation as ``data_for_item`` -- daily rows indexed by the parsed
    ``DateID``, zero rows for training days without a record, Thursday-ending
    weekly sums with lag, calendar and discount features -- but the daily
    frame is returned as well and ``test_discounts`` is not touched.

    Parameters
    ----------
    item_code
        Value matched against the ``ItemCode`` column of the global ``df``.

    Returns
    -------
    tuple
        ``(filtered_df, weekly_sales)``: the zero-filled daily frame sorted by
        date, and the weekly feature table.
    """
    # Explicit copy: the row filter returns a slice of the global df, and the
    # column assignment below would otherwise be a chained assignment.
    filtered_df = df.loc[df['ItemCode'] == item_code].copy()
    filtered_df["DateID"] = pd.to_datetime(filtered_df["DateID"], format = "%m/%d/%Y")
    filtered_df = (filtered_df
                   .set_index("DateID")
                   .drop(columns = ['ItemCode', 'CategoryCode']))

    # Add all missing training days in one concat instead of enlarging the
    # frame one row at a time.
    missing_days = training_range.difference(filtered_df.index)
    if len(missing_days):
        zero_rows = pd.DataFrame(0, index = missing_days, columns = filtered_df.columns)
        filtered_df = pd.concat([filtered_df, zero_rows]).rename_axis("DateID")
    filtered_df = filtered_df.sort_index()

    weekly_sales = (filtered_df
                    .groupby(pd.Grouper(freq='W-THU')).sum()
                    .rename(columns = {'DailySales': 'WeeklySales'}))
    weekly_sales = create_lag(weekly_sales)
    weekly_sales['month'] = weekly_sales.index.month
    weekly_sales['week'] = weekly_sales.index.map(week_of_month)

    item_promos = get_promos(item_code)
    discounts = []
    amounts = []
    for week in weekly_sales.index:
        if week in item_promos.index:
            # .loc[[week]] always yields a frame, so a duplicated end date
            # cannot leak a Series into the feature lists; the last match is used.
            row = item_promos.loc[[week]].iloc[-1]
            discounts.append(row.discount)
            amounts.append(row.amount)
        else:
            discounts.append(0)
            amounts.append(0)
    weekly_sales['discount_perc'] = discounts
    weekly_sales['discount_amount'] = amounts

    return filtered_df, weekly_sales
    

In [None]:
# Side-by-side view of one item's history: daily sales on the left, weekly sales on the right.
# The globals `item` and `weekly` set here are reused by the next plotting cell.
item = 123307
daily, weekly = data_for_item_sales(item)

fig, axs = plt.subplots(1, 2,figsize=(12, 4))
daily.plot(ax = axs[0], legend = None)
axs[0].set(xlabel = "", ylabel = "Daily Sales", title = "Daily sales of item {}".format(item))

weekly.WeeklySales.plot(ax = axs[1])
axs[1].set(xlabel = "", ylabel = "Weekly Sales", title = "Weekly sales of item {}".format(item))

In [None]:
# Overlay the item's weekly history with its predicted, actual and buffered values
# for the test weeks. Relies on `item` and `weekly` from the previous cell.
promos = get_test_promos(val_df, item)
prediction, buffer = predict_for_one_item(item, promos)
plt.figure(figsize = (8, 4))
ax = weekly.WeeklySales.plot()
# Actuals are ordered by week label so they line up with the chronological predictions.
preds = pd.DataFrame({"Predicted":prediction,
                     "Actual": val_df[val_df.ItemCode == item].sort_values(by = ["Week"]).WeeklySales.values}, 
                     index = testing_range)
preds['Predicted + Buffer'] = preds.Predicted + buffer
# Repeat the last training value in all three columns so the new lines start where the history ends.
preds.loc[training_range[-1]] = [weekly.loc[training_range[-1]].WeeklySales] * 3
preds = preds.sort_index()
preds.plot(ax = ax)
# NOTE(review): the statistics below also include the training row added above,
# where Predicted equals Actual -- confirm that is intended.
acc = round(accuracy(preds.Actual.values, preds.Predicted.values), 3)
under = round(under_forecast_mape(preds.Actual.values, preds.Predicted.values), 3)
plt.xlabel("")
plt.ylabel("Weekly Sales")
# NOTE(review): the title has a single placeholder, so acc and under are passed to
# format() but never shown.
plt.title('''Weekly sales of item {} with predictions and buffer correction'''.format(item, acc, under))

In [None]:
# Error table for the single item plotted above (preds still contains the added training row).
error_summary(preds.Actual, preds.Predicted, preds['Predicted + Buffer'])

In [None]:
# Validation and test rows stacked; pred_for_category reads promotion flags from this frame.
all_test_data = pd.concat([val_df, test_df])

In [None]:
def pred_for_category(catcode):
    """Sum weekly sales and forecasts over every item of one category.

    For each item whose ``CategoryCode`` in the global ``df`` equals
    ``"category_<catcode>"`` the weekly table is built with ``data_for_item``
    and forecast with ``train_predict`` (which also updates the global
    ``feature_importance`` and ``all_under_errors``); the per-item values are
    added up.

    Parameters
    ----------
    catcode
        Category number; converted with ``str`` and appended to ``"category_"``.

    Returns
    -------
    tuple
        ``(cat_sales, cat_pred)``: total ``WeeklySales`` per training week, and
        total ``Predicted`` per test week preceded by the last training week's
        total so a plotted forecast line connects to the history.

    Raises
    ------
    ValueError
        If ``df`` holds no item for the category.
    """
    total_sales = None
    weekly_index = None
    total_predicted = np.zeros(len(testing_range), dtype = float)

    for item_code in df[df.CategoryCode == "category_" + str(catcode)].ItemCode.unique():
        promos = get_test_promos(all_test_data, item_code)
        weekly_sales, train_x, train_y = data_for_item(item_code)
        if total_sales is None:
            # Size the accumulator from the data itself rather than from an
            # unrelated notebook global.
            total_sales = np.zeros(len(weekly_sales), dtype = int)
            weekly_index = weekly_sales.index
        # Out-of-place add so non-integer sales cannot fail on an int buffer.
        total_sales = total_sales + weekly_sales.WeeklySales.values
        predictions, error, buf = train_predict(item_code, weekly_sales, train_x, train_y, promos)
        total_predicted += np.array(predictions, dtype = float)

    if total_sales is None:
        raise ValueError("no items found for category_" + str(catcode))

    cat_sales = pd.DataFrame({'WeeklySales': total_sales}, index = weekly_index)
    cat_pred = pd.DataFrame({'Predicted': total_predicted}, index = testing_range)
    cat_pred.loc[training_range[-1]] = cat_sales.loc[training_range[-1]].values
    cat_pred = cat_pred.sort_index()
    return cat_sales, cat_pred

In [None]:
# Category number -> (total weekly sales, total forecast) for categories 1 to 4.
cat_totals = {}
# Reset the accumulator so it only reflects the models trained in this loop.
feature_importance = np.zeros(train_x.columns.shape)
for i in range(1, 5):
    cat_sales, cat_pred = pred_for_category(i)
    # Progress marker: prints the category number once it is finished.
    print(i)
    cat_totals[i] = cat_sales, cat_pred

In [None]:
def plot_cat(catax, catcode):
    """Draw one category's weekly totals, forecast and average line on ``catax``.

    Reads the ``(sales, forecast)`` pair stored in the global ``cat_totals``
    under ``catcode`` and returns ``catax``.
    """
    history, forecast = cat_totals[catcode]
    mean_weekly_sales = np.mean(history.WeeklySales.values)
    history.plot(ax = catax, legend = None, label = 'Weekly Sales')
    forecast.plot(ax = catax, legend = None)
    # Dashed black reference line at the average of the historical weekly totals.
    catax.axhline(y = mean_weekly_sales, color = 'k', label = 'Average', ls = '--')
    return catax

In [None]:
# 2x2 grid with one panel per category; axs.flat walks the panels row by row,
# which matches categories 1 to 4.
fig, axs = plt.subplots(2, 2,figsize=(10, 6))
panel_titles = ("Category 1", "Category 2", "Category 3", "Category 4")
for catcode, (panel, panel_title) in enumerate(zip(axs.flat, panel_titles), start = 1):
    plot_cat(panel, catcode)
    panel.set_title(panel_title)
# The shared figure legend uses the entries of the first panel.
handles, labels = axs[0,0].get_legend_handles_labels()

plt.suptitle("Predicted total weekly sales for each category", fontweight='bold', fontsize = 14)
for ax in axs.flat:
    ax.set(xlabel='', ylabel='Total weekly sales')
fig.legend(handles, labels)
fig.tight_layout()

In [None]:
# Print the validation items whose most recent promotion matched to a training week
# carries a non-zero discount. The `last_discounts` lookup this cell used to read is
# not defined anywhere in the notebook (NameError on a fresh run), so the value is
# rebuilt here the way data_for_item matches promotions: only promotions whose end
# date equals a week of the item's weekly table count, the last of them decides, and
# (0, 0) stands for "no promotion seen".
for item in val_df.ItemCode.unique():
    item_weekly = data_for_item(item)[0]
    item_promos = get_promos(item)
    matched = item_promos.loc[item_promos.index.isin(item_weekly.index)]
    last_discount = (0, 0)
    if not matched.empty:
        last_promo = matched.iloc[-1]
        last_discount = (last_promo.discount, last_promo.amount)
    if last_discount != (0,0):
        print(item)

In [None]:
# Scatter of discount percentage against weekly sales, all items drawn on the same axes.
# This loop overwrites the globals item, weekly, train_x and train_y with the values
# of the last item processed.
for item in df.ItemCode.unique():
    weekly, train_x, train_y = data_for_item(item)
    # NOTE(review): avg_sales is computed but not used in this cell.
    avg_sales = np.mean(train_y)
    plt.plot(weekly.discount_perc, train_y, 'bo')

In [None]:
# Number of distinct item codes that appear in the promotion data.
len(promo_df.ItemCode.unique())

In [None]:
# Pair each feature column with its accumulated importance, largest first.
# feature_importance is a running sum over every model fitted since it was last
# reset, so the bar lengths are totals rather than averages.
df_importances = pd.DataFrame({
        'feature': train_x.columns,
        'importance': feature_importance
    }).sort_values(by = 'importance', ascending = False)
    
# Horizontal bar chart of the accumulated importances.
plt.figure()
plt.title('Importances of engineered features', fontsize=14)
sns.barplot(x=df_importances.importance, y=df_importances.feature, palette = sns.color_palette("husl", 8))
plt.xlabel("Importance")
plt.ylabel("Feature name")

In [13]:
# Week-ending Thursdays from the third training week through the last test week.
# NOTE(review): starting at index 2 presumably mirrors the two rows create_lag drops -- confirm.
total_range = pd.date_range(training_week_range[2], testing_range[-1], freq = 'W-THU')
def count_promo_items(catcode):
    """Count, per week, the promotions of a category that end in that week.

    For every item whose ``CategoryCode`` in the global ``df`` equals
    ``"category_<catcode>"``, each promotion end date returned by
    ``get_promos`` that is one of the dates in the global ``total_range``
    adds 1 to that week. An item with several promotions ending on the same
    date is counted once per promotion.

    Returns
    -------
    pandas.DataFrame
        One ``Items`` column indexed by ``total_range``.
    """
    count_df = pd.DataFrame({'Items': [0] * len(total_range)}, index = total_range)
    for item_code in df[df.CategoryCode == "category_" + str(catcode)].ItemCode.unique():
        train_promos = get_promos(item_code)
        for week in train_promos.index:
            if week in count_df.index:
                # One .loc call with row and column: the chained form
                # count_df.Items.loc[week] may update a temporary copy instead
                # of count_df (always, under pandas Copy-on-Write).
                count_df.loc[week, 'Items'] += 1

    return count_df

In [14]:
def plot_cat_promo(catax, catcode):
    """Plot the weekly promotion counts of one category on ``catax`` and return it."""
    weekly_counts = count_promo_items(catcode)
    weekly_counts.plot(ax = catax, legend = False, style = 'm')
    return catax

In [19]:

# 2x2 grid with a shared y axis, one panel of weekly promotion counts per category;
# axs.flat walks the panels row by row, which matches categories 1 to 4.
fig, axs = plt.subplots(2, 2,figsize=(10, 4), sharey = True)
panel_titles = ("Category 1", "Category 2", "Category 3", "Category 4")
for catcode, (panel, panel_title) in enumerate(zip(axs.flat, panel_titles), start = 1):
    plot_cat_promo(panel, catcode)
    panel.set_title(panel_title)
handles, labels = axs[0,0].get_legend_handles_labels()
# Counts are whole numbers, so restrict the y ticks to integers.
plt.locator_params(axis = 'y', integer = True)

plt.suptitle("Number of items on promotion per category", fontweight='bold', fontsize = 14)
for ax in axs.flat:
    ax.set(xlabel='', ylabel='Number of items')
fig.tight_layout()