In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-storm-30/validation_data.csv
/kaggle/input/data-storm-30/train_data.csv
/kaggle/input/data-storm-30/test_data.csv
/kaggle/input/data-storm-30/DataStorm3.0_KaggleProblemStatement.pdf


In [2]:
df = pd.read_csv("/kaggle/input/data-storm-30/train_data.csv")


In [3]:
pd.set_option('display.max_rows', 10)

In [4]:
training_range = pd.date_range('2021-10-01', '2022-02-13')
testing_range = pd.date_range('2022-02-20', '2022-03-13', freq='W-SUN')

In [5]:
def week_of_month(sunday):
    return (sunday.day - 1) // 7 + 1

In [6]:
def data_for_item(item_code):
    filtered_df = df.loc[df['ItemCode'] == item_code]
    filtered_df["DateID"] = pd.to_datetime(filtered_df["DateID"], format = "%m/%d/%Y")
        
    filtered_df.set_index(filtered_df.DateID, inplace=True)
    filtered_df.drop('DateID', axis=1, inplace=True)
    filtered_df.drop('ItemCode', axis = 1, inplace = True)
    filtered_df.drop('CategoryCode', axis = 1, inplace = True)
    
    average_sales = 0.5*sum(filtered_df.DailySales) / len(filtered_df)
    for date in training_range:
        if date not in filtered_df.index:
            filtered_df.loc[date] = [0]
    
    filtered_df = filtered_df.sort_index()
        
    weekly_sales = filtered_df.groupby(pd.Grouper(freq='W')).sum()
    weekly_sales.rename(columns = {'DailySales': 'WeeklySales'}, inplace = True)
    weekly_sales = create_lag(weekly_sales)
    weekly_sales['month'] = weekly_sales.index.month
    weekly_sales['week'] = weekly_sales.index.map(week_of_month)
    train_x = weekly_sales.loc[:, weekly_sales.columns != 'WeeklySales']
    train_y = weekly_sales['WeeklySales']
    return weekly_sales, train_x, train_y
    

In [7]:
def create_lag(df3):
    dataframe = pd.DataFrame()
    for i in range(2, 0, -1):
        dataframe['t-' + str(i)] = df3.WeeklySales.shift(i)
    df4 = pd.concat([df3, dataframe], axis=1)
    df4.dropna(inplace=True)
    return df4

In [8]:
weekly_sales, train_x, train_y = data_for_item(1074823)
weekly_sales

Unnamed: 0_level_0,WeeklySales,t-2,t-1,month,week
DateID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-10-17,365,60.0,227.0,10,3
2021-10-24,398,227.0,365.0,10,4
2021-10-31,303,365.0,398.0,10,5
2021-11-07,121,398.0,303.0,11,1
2021-11-14,345,303.0,121.0,11,2
...,...,...,...,...,...
2022-01-16,59,0.0,36.0,1,3
2022-01-23,0,36.0,59.0,1,4
2022-01-30,9,59.0,0.0,1,5
2022-02-06,79,0.0,9.0,2,1


In [9]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [10]:
def train_predict(weekly_sales, train_x, train_y):
    model = RandomForestRegressor(n_estimators=6, max_depth=10, random_state = 10)

    fit = model.fit(train_x, train_y)
    results = []
    
    train_pred = fit.predict(train_x)
    train_error = accuracy(train_y.values, train_pred)
    
    for date in testing_range:
        prev_row = weekly_sales.iloc[-1]
        test_x = pd.DataFrame({'month': [date.month], 
                               'week': [date.weekofyear], 
                               't-1':[prev_row['WeeklySales']], 
                               't-2': [prev_row['t-1']]})
        
        prediction = round(fit.predict(test_x)[0])
        test_x['WeeklySales'] = [prediction]
        test_x.index = [date]
        #print(test_x)
        weekly_sales = weekly_sales.append(test_x)
        results.append(prediction)
    
    #print(weekly_sales)
    return results, train_error
        

In [11]:
def predict_for_one_item(item_code):
    weekly_sales, train_x, train_y = data_for_item(item_code)
    result, error = train_predict(weekly_sales, train_x, train_y)
    print("Item {} Training error: {}".format(item_code, error))
    return result

In [12]:
def accuracy(actual, pred):
    diff = np.sum(np.abs(actual - pred))
    return diff / np.sum(actual)

In [13]:
def predict_for_test(path):
    val_df = pd.read_csv(path)
    results = {}
    for item_code in val_df.ItemCode.unique():
        #print(item_code)
        sales = predict_for_one_item(item_code)
        weeks = {}
        for i in range(4):
            weeks['w'+str(i+1)] = sales[i]
        results[item_code] = weeks
    val_df['Predictions'] = val_df.apply(lambda row: results[row.ItemCode][row.Week], axis = 1)
    val_df['ID'] = val_df.apply(lambda row: row.CategoryCode + '_' + str(row.ItemCode) + '_' + row.Week, axis = 1)
    
    if 'WeeklySales' in val_df:
        print("Accuracy:", accuracy(val_df.WeeklySales.values, val_df.Predictions.values))
    return val_df
    

In [14]:
val_df = predict_for_test('/kaggle/input/data-storm-30/validation_data.csv')

Item 1044502 Training error: 0.10493827160493827
Item 1105009 Training error: 0.21374045801526717
Item 913561 Training error: 0.16770186335403728
Item 1048975 Training error: 0.18195718654434248
Item 17287 Training error: 0.22723253757736514
Item 371239 Training error: 0.25283446712018137
Item 1098502 Training error: 0.1550724637681159
Item 1074823 Training error: 0.1830716200689391
Item 23569 Training error: 0.1940700808625337
Item 397213 Training error: 0.19919919919919923
Item 211309 Training error: 0.1993464052287582
Item 1058713 Training error: 0.10626398210290826
Item 1032550 Training error: 0.2019089574155654
Item 1071106 Training error: 0.11477272727272728
Item 40759 Training error: 0.1402714932126697
Item 1067092 Training error: 0.11884291599498475
Item 1101661 Training error: 0.16352694924123498
Item 210868 Training error: 0.18150684931506847
Item 379249 Training error: 0.1916983523447402
Item 75886 Training error: 0.18574297188755018
Item 1044682 Training error: 0.1294661622

In [15]:
val_df

Unnamed: 0,CategoryCode,ItemCode,Week,WeeklySales,Predictions,ID
0,category_2,1044502,w1,11,14,category_2_1044502_w1
1,category_2,1105009,w1,11,5,category_2_1105009_w1
2,category_2,913561,w4,5,10,category_2_913561_w4
3,category_1,1048975,w4,30,18,category_1_1048975_w4
4,category_1,17287,w2,60,26,category_1_17287_w2
...,...,...,...,...,...,...
365,category_2,124954,w2,43,27,category_2_124954_w2
366,category_2,40759,w1,48,69,category_2_40759_w1
367,category_1,1090303,w1,19,9,category_1_1090303_w1
368,category_2,1090276,w3,6,13,category_2_1090276_w3


In [16]:
test_df = predict_for_test('/kaggle/input/data-storm-30/test_data.csv')

Item 43738 Training error: 0.14780835881753313
Item 1006090 Training error: 0.18371212121212122
Item 1076929 Training error: 0.3787128712871287
Item 1081321 Training error: 0.19811320754716982
Item 216151 Training error: 0.08441558441558442
Item 1063600 Training error: 0.12859560067681897
Item 1082743 Training error: 0.44195250659630597
Item 23200 Training error: 0.22114197530864196
Item 1068883 Training error: 0.197463768115942
Item 838456 Training error: 0.09246954595791806
Item 1090258 Training error: 0.2640086206896552
Item 1032559 Training error: 0.13835470085470086
Item 123307 Training error: 0.09728357746260559
Item 837943 Training error: 0.09123146357188915
Item 1067119 Training error: 0.1259899208063355
Item 1056463 Training error: 0.11684981684981682
Item 1101571 Training error: 0.4035555555555555
Item 1101769 Training error: 0.21879936808846764
Item 687616 Training error: 0.13401360544217691
Item 1071115 Training error: 0.22605363984674326
Item 1081339 Training error: 0.2932

In [17]:
test_df

Unnamed: 0,CategoryCode,ItemCode,Week,PredictedSales,Predictions,ID
0,category_1,43738,w4,,13,category_1_43738_w4
1,category_2,1006090,w1,,24,category_2_1006090_w1
2,category_2,1076929,w4,,42,category_2_1076929_w4
3,category_1,1081321,w3,,14,category_1_1081321_w3
4,category_2,216151,w4,,25,category_2_216151_w4
...,...,...,...,...,...,...
372,category_2,1101571,w1,,40,category_2_1101571_w1
373,category_2,1090258,w4,,71,category_2_1090258_w4
374,category_2,906595,w1,,10,category_2_906595_w1
375,category_2,32245,w1,,45,category_2_32245_w1


In [18]:
submission_df = pd.DataFrame({'ID': test_df['ID'], 'WeeklySales': test_df['Predictions']})
submission_df.to_csv('submission11.csv', index=False)

In [19]:
submission_df

Unnamed: 0,ID,WeeklySales
0,category_1_43738_w4,13
1,category_2_1006090_w1,24
2,category_2_1076929_w4,42
3,category_1_1081321_w3,14
4,category_2_216151_w4,25
...,...,...
372,category_2_1101571_w1,40
373,category_2_1090258_w4,71
374,category_2_906595_w1,10
375,category_2_32245_w1,45
