In [403]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from tqdm import tqdm

In [404]:
# Load the sales-points table (the preview below shows ItemCode, #sales, stage
# and WeeklySales columns).
# NOTE(review): absolute Windows path - this cell only runs on a machine with
# the same directory layout; consider a configurable data directory.
sales_points = pd.read_csv("D:\\Competitions\\DataStorm-2022\\data\\SalesPointsV2.csv")

In [405]:
sales_points

Unnamed: 0,ItemCode,#sales,stage,WeeklySales
0,3418,21,validation,"[29, 42, 41, 41, 44, 46, 43, 49, 54, 69, 101, ..."
1,3427,21,validation,"[11, 40, 20, 48, 54, 28, 36, 52, 42, 57, 46, 4..."
2,7666,21,validation,"[84, 119, 196, 185, 94, 15, 10, 30, 79, 82, 14..."
3,9925,21,test,"[1, 9, 11, 13, 5, 11, 11, 13, 12, 10, 21, 15, ..."
4,16936,21,test,"[9, 25, 40, 31, 21, 43, 7, 34, 151, 32, 28, 25..."
...,...,...,...,...
189,1101769,21,test,"[8, 3, 0, 0, 18, 20, 22, 17, 14, 1, 0, 2, 0, 0..."
190,1103056,19,test,"[8, 12, 28, 27, 20, 17, 24, 19, 12, 7, 18, 11,..."
191,1105009,18,validation,"[5, 5, 10, 17, 17, 4, 6, 9, 4, 7, 2, 6, 6, 7, ..."
192,1105018,17,test,"[3, 6, 11, 4, 7, 8, 6, 7, 10, 12, 4, 10, 10, 1..."


In [406]:
from statsmodels.tsa.statespace.sarimax import SARIMAX


def build_arima_model(lis_data,result,order=(4,0,0),show_fig=False,show_summary=False):
    """Fit a SARIMAX model to one series and store the fit in ``result``.

    Args:
        lis_data: Observations passed to ``SARIMAX`` as the series to model.
        result: Dict that receives the fitted results object under the key
            ``"arima_model"``; it is modified in place.
        order: ``order`` argument forwarded to ``SARIMAX``.
        show_fig: When true, plot the residuals (line plot, then KDE) and
            print their ``describe()`` table.
        show_summary: When true, print the summary of the fitted model.

    Returns:
        The same ``result`` dict that was passed in.
    """
    fitted = SARIMAX(lis_data, order=order).fit()

    if show_summary:
        print(fitted.summary())

    if show_fig:
        resid_frame = pd.DataFrame(fitted.resid)
        # Residuals as a line plot first, then their estimated density.
        for plot_kwargs in ({}, {"kind": "kde"}):
            resid_frame.plot(**plot_kwargs)
            plt.show()
        print(resid_frame.describe())

    result["arima_model"] = fitted
    return result

In [407]:
# Items to model. This binds a second name to the same DataFrame (no copy is
# made), so every row of sales_points is selected.
selected_data_points = sales_points

In [408]:
# Fit one model per selected item and keep its forecast, keyed by ItemCode.
HISTORY_WEEKS = 20   # only the most recent observations are used for fitting
FORECAST_WEEKS = 4   # __day_filter_values reads exactly four values (w1..w4)

models_list = {}
# Iterate over the rows of the selected frame itself. Looking rows up with
# .iloc on a different frame using index labels would silently pick the wrong
# rows as soon as the selection is a filtered subset.
for _, product_series in tqdm(selected_data_points.iterrows(), total=len(selected_data_points)):
    result = {"input_data": product_series}
    # WeeklySales holds the text of a Python list; literal_eval parses it
    # without executing arbitrary code the way eval() would.
    weekly_sales = ast.literal_eval(product_series["WeeklySales"])
    build_arima_model(weekly_sales[-HISTORY_WEEKS:], result, show_fig=False, show_summary=False)
    forecast = result["arima_model"].forecast(FORECAST_WEEKS)
    result["week_prediction"] = pd.Series(forecast)
    models_list[product_series["ItemCode"]] = result

  warn('Non-stationary starting autoregressive parameters'
100%|██████████| 194/194 [00:09<00:00, 20.35it/s]


In [409]:
def __day_filter_values(data):
    """Flatten one item's result dict into a row holding its four forecasts.

    Args:
        data: Mapping with ``"input_data"`` (anything supporting
            ``["ItemCode"]``) and ``"week_prediction"`` (the forecast values
            in week order, as a pandas Series or any other sequence).

    Returns:
        Dict with keys ``ItemCode``, ``w1``, ``w2``, ``w3`` and ``w4``.

    Raises:
        IndexError: If fewer than four forecast values are present.
    """
    output = {
        "ItemCode": data["input_data"]["ItemCode"]
    }

    # Read the forecast by position, not by label: indexing a Series with
    # [0]..[3] is a label lookup and raises KeyError whenever the forecast
    # index does not start at 0 (e.g. it continues the training index).
    forecast = np.asarray(data["week_prediction"])
    for week in range(1, 5):
        output[f"w{week}"] = forecast[week - 1]

    return output

def day_series_filter_out(model_data):
    """Turn the per-item results into a long table of weekly forecasts.

    Args:
        model_data: Dict of per-item result dicts, each in the form accepted
            by ``__day_filter_values``.

    Returns:
        DataFrame with columns ``ItemCode``, ``Week`` (``w1``..``w4``) and
        ``WeeklySales``, one row per item and week.
    """
    rows_by_item = {
        item_key: __day_filter_values(model_data[item_key])
        for item_key in tqdm(model_data.keys())
    }

    # One row per item with w1..w4 as columns, then unpivot to one row per week.
    wide = pd.DataFrame.from_dict(rows_by_item, orient='index')
    week_columns = ["w1", "w2", "w3", "w4"]
    return pd.melt(
        wide,
        id_vars=["ItemCode"],
        value_vars=week_columns,
        var_name="Week",
        value_name="WeeklySales",
    )

In [410]:
# Reshape every item's forecast into a long ItemCode / Week / WeeklySales table.
out_df = day_series_filter_out(models_list)

100%|██████████| 194/194 [00:00<00:00, 65341.28it/s]


In [411]:
out_df

Unnamed: 0,ItemCode,Week,WeeklySales
0,3418,w1,30.976874
1,3427,w1,32.693050
2,7666,w1,39.241415
3,9925,w1,9.269342
4,16936,w1,46.355568
...,...,...,...
771,1101769,w4,21.796874
772,1103056,w4,8.802559
773,1105009,w4,6.749126
774,1105018,w4,5.490899


In [412]:
out_df.loc[out_df["ItemCode"] == 3427]

Unnamed: 0,ItemCode,Week,WeeklySales
1,3427,w1,32.69305
195,3427,w2,28.520792
389,3427,w3,25.562632
583,3427,w4,31.92511


In [413]:
# Rows used as ground truth when scoring the forecasts.
# NOTE(review): select_val_item_codes, when called with only a frame to filter,
# reads this same file from disk again instead of reusing this DataFrame.
validation_df = pd.read_csv("D:\\Competitions\\DataStorm-2022\\data\\validation_data.csv")

In [414]:
def select_val_item_codes(df, validation=None):
    """Keep only the rows of ``df`` whose ItemCode occurs in the validation set.

    Args:
        df: DataFrame with an ``ItemCode`` column.
        validation: Optional, already-loaded validation DataFrame with an
            ``ItemCode`` column. When omitted, the validation CSV is read from
            its default location, which is what this function always did.

    Returns:
        The matching rows of ``df``, with their original index and order.
    """
    if validation is None:
        validation = pd.read_csv("D:\\Competitions\\DataStorm-2022\\data\\validation_data.csv")
    validation_item_codes = validation["ItemCode"].unique()
    return df.loc[df["ItemCode"].isin(validation_item_codes)]

def evaluate_model_arima(true_df:pd.DataFrame,pred_df:pd.DataFrame):
    """Score predictions against true sales with mean absolute percentage error.

    Args:
        true_df: Frame with ``ItemCode``, ``Week`` and ``WeeklySales`` columns
            holding the actual sales.
        pred_df: Frame with the same three columns holding the forecasts.

    Returns:
        The value of ``mean_absolute_percentage_error`` over the rows whose
        (ItemCode, Week) pair occurs in both frames.
    """
    key_cols = ["ItemCode", "Week"]
    actual_sorted = true_df.sort_values(key_cols)
    forecast_sorted = pred_df.sort_values(key_cols)

    # Both frames carry a WeeklySales column, so the merge suffixes them:
    # _x is the true value (left frame), _y the prediction (right frame).
    paired = actual_sorted.merge(forecast_sorted, on=key_cols, how="inner")
    # astype(int) truncates the forecast toward zero before scoring.
    paired["WeeklySales_y"] = paired["WeeklySales_y"].astype(int)

    return mean_absolute_percentage_error(paired["WeeklySales_x"], paired["WeeklySales_y"])

In [415]:
# Score only the forecasts whose ItemCode appears in the validation file; the
# displayed value is the mean absolute percentage error from evaluate_model_arima.
evaluate_model_arima(validation_df,select_val_item_codes(out_df))

0.6226734617134216

In [416]:
# Cast the forecasts to integers in place, then save them.
# astype(int) truncates toward zero (e.g. 30.98 becomes 30) rather than rounding.
# NOTE(review): this overwrites the float forecasts in out_df, so every cell run
# after this one sees the truncated integers.
out_df["WeeklySales"] = out_df["WeeklySales"].astype(int)
out_df.to_csv("output_Lol.csv",index=False)

In [417]:
# Submission template; the preview below shows CategoryCode, ItemCode, Week and
# an empty PredictedSales column.
test_df = pd.read_csv("D:\\Competitions\\DataStorm-2022\\data\\test_data.csv")

In [418]:
test_df

Unnamed: 0,CategoryCode,ItemCode,Week,PredictedSales
0,category_1,43738,w4,
1,category_2,1006090,w1,
2,category_2,1076929,w4,
3,category_1,1081321,w3,
4,category_2,216151,w4,
...,...,...,...,...
372,category_2,1101571,w1,
373,category_2,1090258,w4,
374,category_2,906595,w1,
375,category_2,32245,w1,


In [419]:
def create_submission_arima(pred_df:pd.DataFrame,attach_df:pd.DataFrame):
    """Join forecasts onto the submission template and build the ID column.

    Args:
        pred_df: Forecasts with ``ItemCode``, ``Week`` and ``WeeklySales``
            columns.
        attach_df: Template rows with ``CategoryCode``, ``ItemCode`` and
            ``Week`` columns.

    Returns:
        Tuple ``(submission, full_df)``: ``submission`` has the columns ``ID``
        (``"<CategoryCode>_<ItemCode>_<Week>"``) and ``WeeklySales``;
        ``full_df`` has ``CategoryCode``, ``ItemCode``, ``Week`` and
        ``WeeklySales``. Both are empty when no template row has a forecast.
    """
    key_cols = ["ItemCode", "Week"]
    template = attach_df.sort_values(key_cols)
    forecasts = pred_df.sort_values(key_cols)

    # Inner join: template rows without a forecast are dropped from both outputs.
    joined = pd.merge(template, forecasts, on=key_cols, how="inner")
    full_df = joined[["CategoryCode", "ItemCode", "Week", "WeeklySales"]].copy()

    # Build the ID column-wise instead of with a row-wise apply: it is faster,
    # also works when the join is empty, and avoids assigning a new column
    # into a frame produced by column selection.
    ids = (
        full_df["CategoryCode"].astype(str)
        + "_" + full_df["ItemCode"].astype(str)
        + "_" + full_df["Week"].astype(str)
    )
    submission = pd.DataFrame({"ID": ids, "WeeklySales": full_df["WeeklySales"]})
    return submission, full_df

In [420]:
# Build the two-column submission (ID, WeeklySales) plus the wider frame that
# keeps CategoryCode, ItemCode and Week for inspection.
submission_df,full_df = create_submission_arima(out_df,test_df)

In [421]:
#out_df.to_csv("out_put1.csv",index=False)

In [422]:
# Write the submission file.
# NOTE(review): the file name appears to embed the validation score shown
# earlier (0.6226) - confirm it is updated by hand whenever the model changes.
submission_df.to_csv("submission_0_6226.csv",index=False)

In [423]:
# Re-read the sales-points file and keep only the rows whose stage is "test".
# NOTE(review): this is the same file that was already loaded into
# sales_points near the top of the notebook.
sales_poins_1 = pd.read_csv("D:\\Competitions\\DataStorm-2022\\data\\SalesPointsV2.csv")
test_sales_point = sales_poins_1.loc[sales_poins_1["stage"] == "test"]

In [424]:
test_sales_point

Unnamed: 0,ItemCode,#sales,stage,WeeklySales
3,9925,21,test,"[1, 9, 11, 13, 5, 11, 11, 13, 12, 10, 21, 15, ..."
4,16936,21,test,"[9, 25, 40, 31, 21, 43, 7, 34, 151, 32, 28, 25..."
8,23200,21,test,"[8, 7, 33, 7, 64, 80, 61, 30, 108, 163, 195, 1..."
12,32245,21,test,"[26, 41, 24, 45, 28, 27, 29, 40, 29, 46, 38, 1..."
13,35449,21,test,"[2, 1, 3, 6, 6, 5, 7, 5, 18, 11, 27, 17, 20, 1..."
...,...,...,...,...
187,1101571,21,test,"[2, 7, 18, 7, 6, 7, 3, 8, 4, 3, 2, 2, 81, 87, ..."
189,1101769,21,test,"[8, 3, 0, 0, 18, 20, 22, 17, 14, 1, 0, 2, 0, 0..."
190,1103056,19,test,"[8, 12, 28, 27, 20, 17, 24, 19, 12, 7, 18, 11,..."
192,1105018,17,test,"[3, 6, 11, 4, 7, 8, 6, 7, 10, 12, 4, 10, 10, 1..."


In [425]:
# ItemCode to spot-check in the cells below.
__id = 16936

In [426]:
# Show the stored weekly sales history of the item being inspected.
# The column holds the text of a Python list; literal_eval parses it without
# executing arbitrary code the way eval() would.
ast.literal_eval(test_sales_point.loc[test_sales_point["ItemCode"] == __id].iloc[0]["WeeklySales"])

[9,
 25,
 40,
 31,
 21,
 43,
 7,
 34,
 151,
 32,
 28,
 25,
 49,
 27,
 74,
 66,
 43,
 66,
 48,
 13,
 71]

In [427]:
# Forecast rows in the joined submission frame for the same item.
full_df.loc[full_df["ItemCode"] == __id]

Unnamed: 0,CategoryCode,ItemCode,Week,WeeklySales
4,category_3,16936,w1,46
5,category_3,16936,w2,31
6,category_3,16936,w3,39
7,category_3,16936,w4,40
