In [98]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
from tqdm import tqdm

In [72]:
# Load the per-item sales table; its WeeklySales column holds each item's
# history as a stringified list (parsed later before model fitting).
# NOTE(review): absolute, machine-specific Windows path — the notebook will
# not run on another machine without editing this line.
sales_points = pd.read_csv("D:\\Competitions\\DataStorm-2022\\data\\SalesPointsV2.csv")

In [73]:
# Display the loaded frame to inspect its columns.
sales_points

Unnamed: 0.1,Unnamed: 0,ItemCode,#sales,stage,WeeklySales
0,0,3418,21,validation,"[29, 42, 41, 41, 44, 46, 43, 49, 54, 69, 101, ..."
1,1,3427,21,validation,"[11, 40, 20, 48, 54, 28, 36, 52, 42, 57, 46, 4..."
2,2,7666,21,validation,"[84, 119, 196, 185, 94, 15, 10, 30, 79, 82, 14..."
3,3,9925,21,test,"[1, 9, 11, 13, 5, 11, 11, 13, 12, 10, 21, 15, ..."
4,4,16936,21,test,"[9, 25, 40, 31, 21, 43, 7, 34, 151, 32, 28, 25..."
...,...,...,...,...,...
189,189,1101769,21,test,"[8, 3, 0, 0, 18, 20, 22, 17, 14, 1, 0, 2, 0, 0..."
190,190,1103056,19,test,"[8, 12, 28, 27, 20, 17, 24, 19, 12, 7, 18, 11,..."
191,191,1105009,18,validation,"[5, 5, 10, 17, 17, 4, 6, 9, 4, 7, 2, 6, 6, 7, ..."
192,192,1105018,17,test,"[3, 6, 11, 4, 7, 8, 6, 7, 10, 12, 4, 10, 10, 1..."


In [74]:
from statsmodels.tsa.statespace.sarimax import SARIMAX


def build_arima_model(lis_data,result,order=(5,0,0),show_fig=False,show_summary=False):
    """Fit a SARIMAX model on one series and store the fit in ``result``.

    Args:
        lis_data: The observations handed to ``SARIMAX`` as the endogenous series.
        result: Mutable mapping; the fitted results object is written to
            ``result["arima_model"]`` (the mapping is modified in place).
        order: The ``order`` argument forwarded to ``SARIMAX``.
        show_fig: When true, plot the residuals as a line chart and as a KDE
            density chart, then print their ``describe()`` statistics.
        show_summary: When true, print the fitted model's summary.

    Returns:
        The same ``result`` object that was passed in.
    """
    fitted = SARIMAX(lis_data, order=order).fit()

    if show_summary:
        print(fitted.summary())

    if show_fig:
        resid_frame = pd.DataFrame(fitted.resid)
        # First the plain residual line plot, then the density (kde) plot.
        for plot_options in ({}, {"kind": 'kde'}):
            resid_frame.plot(**plot_options)
            plt.show()
        print(resid_frame.describe())

    result["arima_model"] = fitted
    return result

In [75]:
# Items to model. Currently every row of sales_points; this is an alias of
# the same DataFrame object, not a copy.
selected_data_points = sales_points

In [76]:
# Fit one model per selected item and keep its 4-step forecast.
# models_list maps ItemCode -> {"input_data", "arima_model", "week_prediction"}.
models_list = {}
# Iterate the selected rows themselves. Looking rows up with
# sales_points.iloc[label] only works while the selection is the whole frame
# with a default 0..n-1 index.
for _, product_series in tqdm(selected_data_points.iterrows(), total=len(selected_data_points)):
    result = {"input_data": product_series}
    # WeeklySales is a stringified list; literal_eval parses literals only,
    # so file contents are never executed as code (unlike eval).
    weekly_sales = ast.literal_eval(product_series["WeeklySales"])
    build_arima_model(weekly_sales, result, show_fig=False, show_summary=False)
    forecast = result["arima_model"].forecast(4, alpha=0.05)
    result["week_prediction"] = pd.Series(forecast)
    models_list[product_series["ItemCode"]] = result

  warn('Non-stationary starting autoregressive parameters'
100%|██████████| 194/194 [00:12<00:00, 15.95it/s]


In [77]:
# Labels of the forecast horizon, in order; position k of a prediction is
# stored under the k-th label.
_WEEK_COLUMNS = ("w1", "w2", "w3", "w4")


def __day_filter_values(data):
    """Flatten one item's model result into a single wide record.

    Args:
        data: Mapping with ``data["input_data"]["ItemCode"]`` and
            ``data["week_prediction"]``, a sequence (pandas Series, list,
            array) holding at least four forecast values.

    Returns:
        dict with keys ``ItemCode``, ``w1``, ``w2``, ``w3``, ``w4``.

    Raises:
        IndexError: if the prediction holds fewer than four values.
    """
    __pred = data["week_prediction"]
    # Read by position, not by label: a Series whose index does not start at 0
    # (e.g. a forecast indexed from the end of the training data) would make
    # a plain ``__pred[0]`` raise KeyError.
    by_position = __pred.iloc if hasattr(__pred, "iloc") else __pred

    output = {"ItemCode": data["input_data"]["ItemCode"]}
    for position, week_label in enumerate(_WEEK_COLUMNS):
        output[week_label] = by_position[position]
    return output


def day_series_filter_out(model_data):
    """Turn the per-item model results into one long-format forecast frame.

    Args:
        model_data: Mapping of key -> model result, each in the shape
            ``__day_filter_values`` expects.

    Returns:
        DataFrame with columns ``ItemCode``, ``Week`` (``w1``..``w4``) and
        ``WeeklySales``, one row per item and week. All ``w1`` rows come
        first, then ``w2``, and so on (``pd.melt`` ordering). An empty
        ``model_data`` yields an empty frame with those three columns.
    """
    long_columns = ["ItemCode", "Week", "WeeklySales"]

    records = {}
    for __key in tqdm(model_data.keys()):
        records[__key] = __day_filter_values(model_data[__key])

    if not records:
        # melt() would raise KeyError on a frame that has no ItemCode column.
        return pd.DataFrame(columns=long_columns)

    wide_df = pd.DataFrame.from_dict(records, orient='index')
    long_df = pd.melt(wide_df, id_vars=["ItemCode"], value_vars=list(_WEEK_COLUMNS))
    long_df.columns = long_columns
    return long_df

In [78]:
# Reshape the per-item forecasts into one long (ItemCode, Week, WeeklySales) frame.
out_df = day_series_filter_out(models_list)

100%|██████████| 194/194 [00:00<00:00, 27731.41it/s]


In [79]:
# Inspect the long-format forecasts.
out_df

Unnamed: 0,ItemCode,Week,WeeklySales
0,3418,w1,30.058333
1,3427,w1,26.496877
2,7666,w1,37.996309
3,9925,w1,10.353285
4,16936,w1,47.482897
...,...,...,...
771,1101769,w4,21.014883
772,1103056,w4,8.155689
773,1105009,w4,5.731597
774,1105018,w4,5.538532


In [80]:
# Spot-check the weekly forecasts of a single item.
out_df.loc[out_df["ItemCode"] == 3427]

Unnamed: 0,ItemCode,Week,WeeklySales
1,3427,w1,26.496877
195,3427,w2,23.920043
389,3427,w3,21.951529
583,3427,w4,27.440648


In [92]:
# Frame later passed as true_df to evaluate_model_arima.
# NOTE(review): absolute, machine-specific Windows path.
validation_df = pd.read_csv("D:\\Competitions\\DataStorm-2022\\data\\validation_data.csv")

In [101]:
def select_val_item_codes(df, item_codes=None):
    """Keep only the rows of ``df`` that belong to validation items.

    Args:
        df: Frame with an ``ItemCode`` column.
        item_codes: Optional iterable of item codes to keep. When omitted,
            the codes are read from the ``ItemCode`` column of the validation
            CSV (the original behaviour), which requires that file to exist.

    Returns:
        The rows of ``df`` whose ``ItemCode`` is one of the selected codes,
        with their original index labels.
    """
    if item_codes is None:
        # Default source of truth: the distinct item codes in the validation file.
        item_codes = pd.read_csv("D:\\Competitions\\DataStorm-2022\\data\\validation_data.csv")["ItemCode"].unique()
    return df.loc[df["ItemCode"].isin(list(item_codes))]

def evaluate_model_arima(true_df:pd.DataFrame,pred_df:pd.DataFrame):
    """Score forecasts against actuals with mean absolute percentage error.

    Both frames are joined on (ItemCode, Week) with an inner merge, so only
    pairs present in both contribute. After the merge ``WeeklySales_x`` is the
    value from ``true_df`` and ``WeeklySales_y`` the value from ``pred_df``;
    the latter is cast to ``int`` before scoring.

    Args:
        true_df: Actuals with ItemCode, Week and WeeklySales columns.
        pred_df: Forecasts with ItemCode, Week and WeeklySales columns.

    Returns:
        Whatever ``mean_absolute_percentage_error`` returns for the joined
        actual and forecast columns.
    """
    join_keys = ['ItemCode', "Week"]
    forecast_sorted = pred_df.sort_values(join_keys)
    actual_sorted = true_df.sort_values(join_keys)

    joined = actual_sorted.merge(forecast_sorted, on=["ItemCode", "Week"], how="inner")
    joined = joined.assign(WeeklySales_y=joined["WeeklySales_y"].astype(int))

    return mean_absolute_percentage_error(joined["WeeklySales_x"], joined["WeeklySales_y"])

In [102]:
# Score the forecasts, restricted to the item codes found in the validation file.
evaluate_model_arima(validation_df,select_val_item_codes(out_df))

0.6920449286400434

In [104]:
# Cast the forecasts to integers in place, then write them to CSV.
# NOTE(review): astype(int) truncates toward zero rather than rounding, and
# this overwrites out_df itself — the later create_submission_arima(out_df, ...)
# cell therefore sees the integer values.
out_df["WeeklySales"] = out_df["WeeklySales"].astype(int)
out_df.to_csv("output_Lol.csv",index=False)

In [108]:
# Frame the forecasts are attached to when building the submission.
# NOTE(review): absolute, machine-specific Windows path.
test_df = pd.read_csv("D:\\Competitions\\DataStorm-2022\\data\\test_data.csv")

In [111]:
# Inspect the test template frame.
test_df

Unnamed: 0,CategoryCode,ItemCode,Week,PredictedSales
0,category_1,43738,w4,
1,category_2,1006090,w1,
2,category_2,1076929,w4,
3,category_1,1081321,w3,
4,category_2,216151,w4,
...,...,...,...,...
372,category_2,1101571,w1,
373,category_2,1090258,w4,
374,category_2,906595,w1,
375,category_2,32245,w1,


In [112]:
def create_submission_arima(pred_df:pd.DataFrame,attach_df:pd.DataFrame):
    """Attach forecasts to the rows of a template frame.

    Both inputs are sorted by (ItemCode, Week) and inner-joined on those two
    columns, with ``attach_df`` on the left; template rows without a matching
    forecast are dropped.

    Args:
        pred_df: Forecasts with ItemCode, Week and WeeklySales columns.
        attach_df: Template with CategoryCode, ItemCode and Week columns.

    Returns:
        DataFrame with exactly the columns CategoryCode, ItemCode, Week and
        WeeklySales.
    """
    join_keys = ['ItemCode', "Week"]
    submission_columns = ["CategoryCode", "ItemCode", "Week", "WeeklySales"]

    forecasts_sorted = pred_df.sort_values(join_keys)
    template_sorted = attach_df.sort_values(join_keys)

    joined = template_sorted.merge(forecasts_sorted, on=["ItemCode", "Week"], how="inner")
    return joined[submission_columns]

In [113]:
# Build and display the submission table (the result is not assigned or saved here).
create_submission_arima(out_df,test_df)

Unnamed: 0,CategoryCode,ItemCode,Week,WeeklySales
0,category_2,9925,w1,10
1,category_2,9925,w2,9
2,category_2,9925,w3,8
3,category_2,9925,w4,13
4,category_3,16936,w1,47
...,...,...,...,...
372,category_2,1105018,w4,5
373,category_2,1105027,w1,10
374,category_2,1105027,w2,6
375,category_2,1105027,w3,13


In [81]:
#out_df.to_csv("out_put1.csv",index=False)