In [17]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input mount and echo its full path,
# one path per line, in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [18]:
# Training data for the whole notebook. data_for_item() reads this module-level
# frame and uses its ItemCode, CategoryCode, DateID and DailySales columns.
df = pd.read_csv("/kaggle/input/data-storm-30/train_data.csv")


In [19]:
# Cap rendered DataFrame output at 10 rows so the frames displayed below stay short.
pd.set_option('display.max_rows', 10)

In [20]:
# Daily calendar for the training window (2021-10-01 through 2022-02-13, inclusive).
# data_for_item() adds a zero-sales row for any of these dates an item is missing.
training_range = pd.date_range('2021-10-01', '2022-02-13')
# Week-ending Sundays to forecast (2022-02-20 through 2022-03-13).
# train_predict() produces one prediction per date in this range.
testing_range = pd.date_range('2022-02-20', '2022-03-13', freq='W-SUN')

In [21]:
def week_of_month(sunday):
    """Return the 1-based seven-day bucket of the month that ``sunday`` falls in.

    Days 1-7 map to 1, days 8-14 to 2, and so on up to 5 for days 29-31.
    Only the ``.day`` attribute of the argument is read.
    """
    completed_weeks, _ = divmod(sunday.day - 1, 7)
    return completed_weeks + 1

In [22]:
def data_for_item(item_code):
    """Build the weekly training table for a single item.

    Reads the module-level ``df`` (raw daily sales) and ``training_range``
    (daily calendar of the training window). Days in ``training_range`` with
    no row for the item are added with zero sales, daily rows are summed into
    weeks with ``pd.Grouper(freq='W')``, lag columns are attached by
    ``create_lag`` and two calendar features are appended.

    Args:
        item_code: Value matched against ``df['ItemCode']``.

    Returns:
        tuple: ``(weekly_sales, train_x, train_y)`` where

        * ``weekly_sales`` is the weekly frame holding ``WeeklySales``, the
          lag columns produced by ``create_lag``, ``month`` and ``week``;
        * ``train_x`` is ``weekly_sales`` without the ``WeeklySales`` column;
        * ``train_y`` is the ``WeeklySales`` column.

    Raises:
        KeyError: If ``df`` lacks ``ItemCode``, ``CategoryCode`` or ``DateID``.
    """
    # Work on an independent copy: assigning into a bare `.loc` slice of the
    # global frame is chained assignment and may write to a temporary.
    item_rows = df.loc[df['ItemCode'] == item_code].copy()
    item_rows["DateID"] = pd.to_datetime(item_rows["DateID"], format="%m/%d/%Y")

    # Index by date and keep only the sales column(s).
    daily_sales = (
        item_rows
        .set_index("DateID")
        .drop(columns=["ItemCode", "CategoryCode"])
    )

    # Add every missing training day as a zero-sales row in one concat instead
    # of enlarging the frame one `.loc[date] = [0]` at a time.
    missing_dates = training_range.difference(daily_sales.index)
    if len(missing_dates) > 0:
        zero_rows = pd.DataFrame(
            0,
            index=missing_dates.rename("DateID"),
            columns=daily_sales.columns,
        )
        daily_sales = pd.concat([daily_sales, zero_rows])
    daily_sales = daily_sales.sort_index()

    weekly_sales = (
        daily_sales
        .groupby(pd.Grouper(freq='W'))
        .sum()
        .rename(columns={'DailySales': 'WeeklySales'})
    )
    # create_lag drops the leading weeks that have no complete lag history.
    weekly_sales = create_lag(weekly_sales)
    weekly_sales['month'] = weekly_sales.index.month
    weekly_sales['week'] = weekly_sales.index.map(week_of_month)

    train_x = weekly_sales.loc[:, weekly_sales.columns != 'WeeklySales']
    train_y = weekly_sales['WeeklySales']
    return weekly_sales, train_x, train_y
    

In [23]:
def create_lag(df3):
    """Attach two lagged copies of ``WeeklySales`` and drop incomplete rows.

    Args:
        df3: Frame with a ``WeeklySales`` column.

    Returns:
        A new frame made of ``df3``'s columns followed by ``t-2`` and ``t-1``
        (``WeeklySales`` shifted by two rows and by one row). Every row that
        contains a NaN in any column is removed, which always includes the
        first two rows. ``df3`` itself is left unmodified.
    """
    lag_columns = {
        't-' + str(lag): df3['WeeklySales'].shift(lag)
        for lag in (2, 1)
    }
    with_lags = pd.concat([df3, pd.DataFrame(lag_columns)], axis=1)
    return with_lags.dropna()

In [24]:
# Run the feature pipeline on one sample item and display its weekly feature table.
weekly_sales, train_x, train_y = data_for_item(1074823)
weekly_sales

In [25]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [26]:
def train_predict(weekly_sales, train_x, train_y):
    """Fit a random forest on one item's weekly history and forecast recursively.

    One prediction is produced for every date in the module-level
    ``testing_range``. Each forecast row takes its lag features from the most
    recent row of the history; the rounded prediction is then appended to that
    history so the next step can use it as ``t-1``.

    Args:
        weekly_sales: Weekly frame with ``WeeklySales`` and ``t-1`` columns
            (as returned by ``data_for_item``). The caller's frame is not
            modified.
        train_x: Feature frame used to fit the model; its column order is the
            order the model is queried with.
        train_y: Target values aligned with ``train_x``.

    Returns:
        tuple: ``(results, train_error)`` where ``results`` is the list of
        rounded predictions in ``testing_range`` order and ``train_error`` is
        ``accuracy(train_y.values, in-sample predictions)``.
    """
    model = RandomForestRegressor(n_estimators=6, max_depth=10, random_state=10)
    fit = model.fit(train_x, train_y)

    train_pred = fit.predict(train_x)
    train_error = accuracy(train_y.values, train_pred)

    # The model must be queried with the same columns, in the same order, as
    # it was fitted on; otherwise features are mis-assigned or rejected.
    feature_columns = list(train_x.columns)

    history = weekly_sales
    results = []
    for date in testing_range:
        prev_row = history.iloc[-1]
        test_x = pd.DataFrame(
            {
                'month': [date.month],
                # Same week-of-month encoding the training rows use.
                'week': [week_of_month(date)],
                't-1': [prev_row['WeeklySales']],
                't-2': [prev_row['t-1']],
            },
            columns=feature_columns,
            index=[date],
        )

        prediction = round(fit.predict(test_x)[0])
        results.append(prediction)

        # Feed the prediction back as the newest observation for the next step.
        history = pd.concat([history, test_x.assign(WeeklySales=prediction)])

    return results, train_error
        

In [27]:
def predict_for_one_item(item_code):
    """Prepare data, train and forecast for one item, printing its training error.

    Args:
        item_code: Item identifier forwarded to ``data_for_item``.

    Returns:
        The list of predictions returned by ``train_predict``.
    """
    prepared = data_for_item(item_code)
    forecasts, training_error = train_predict(*prepared)
    print("Item {} Training error: {}".format(item_code, training_error))
    return forecasts

In [28]:
def accuracy(actual, pred):
    """Return the total absolute error divided by the total of ``actual``.

    Despite its name this is an error ratio: 0 means a perfect fit and larger
    values are worse.

    Args:
        actual: Array-like of observed values (list, ndarray, Series, ...).
        pred: Array-like of predicted values with the same shape as ``actual``.

    Returns:
        ``sum(|actual - pred|) / sum(actual)`` as a float. When ``actual`` sums
        to zero the ratio is undefined and the result is ``nan`` (no error at
        all) or ``inf`` (some error).
    """
    # Coerce first so plain Python sequences work as well as numpy arrays.
    actual_values = np.asarray(actual, dtype=float)
    predicted_values = np.asarray(pred, dtype=float)

    absolute_error = np.sum(np.abs(actual_values - predicted_values))
    actual_total = np.sum(actual_values)

    # A zero denominator is an expected case here; return nan/inf quietly
    # rather than relying on a process-wide warning filter.
    with np.errstate(divide='ignore', invalid='ignore'):
        return absolute_error / actual_total

In [29]:
def predict_for_test(path):
    """Forecast every item listed in a CSV and attach the results to its rows.

    Args:
        path: CSV file with ``ItemCode``, ``CategoryCode`` and ``Week``
            columns; ``Week`` values are looked up among ``w1`` .. ``w4``.

    Returns:
        The loaded frame with two added columns: ``Predictions`` (the forecast
        for the row's item and week) and ``ID`` (category, item code and week
        joined by underscores). When the file also has a ``WeeklySales``
        column, the overall error from ``accuracy`` is printed.
    """
    val_df = pd.read_csv(path)

    # item code -> {'w1': ..., 'w2': ..., 'w3': ..., 'w4': ...}
    forecasts_by_item = {}
    for item_code in val_df.ItemCode.unique():
        item_forecast = predict_for_one_item(item_code)
        forecasts_by_item[item_code] = {
            'w' + str(position + 1): item_forecast[position]
            for position in range(4)
        }

    def lookup_prediction(row):
        return forecasts_by_item[row.ItemCode][row.Week]

    def build_row_id(row):
        return row.CategoryCode + '_' + str(row.ItemCode) + '_' + row.Week

    val_df['Predictions'] = val_df.apply(lookup_prediction, axis=1)
    val_df['ID'] = val_df.apply(build_row_id, axis=1)

    if 'WeeklySales' in val_df.columns:
        overall_error = accuracy(val_df.WeeklySales.values, val_df.Predictions.values)
        print("Accuracy:", overall_error)
    return val_df
    

In [30]:
# Forecast every item in the validation file. predict_for_test() prints the
# overall error when the file carries a WeeklySales column.
val_df = predict_for_test('/kaggle/input/data-storm-30/validation_data.csv')

In [31]:
# Display the validation rows together with their Predictions and ID columns.
val_df

In [32]:
# Forecast every item in the test file; the result feeds the submission below.
test_df = predict_for_test('/kaggle/input/data-storm-30/test_data.csv')

In [33]:
# Display the test rows together with their Predictions and ID columns.
test_df

In [34]:
# Keep only the row ID and its forecast, expose the forecast under the
# WeeklySales header, and write the result without the index column.
submission_df = (
    test_df[['ID', 'Predictions']]
    .rename(columns={'Predictions': 'WeeklySales'})
)
submission_df.to_csv('submission11.csv', index=False)

In [35]:
# Display the frame that was written to submission11.csv.
submission_df