In [209]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


%matplotlib inline

In [210]:
walmart = pd.read_csv('train.csv')
stores = pd.read_csv('stores.csv')
features = pd.read_csv('features.csv')
testing = pd.read_csv('test.csv')

In [211]:
merged = walmart.merge(stores, how='left').merge(features, how='left')
testing_merged = testing.merge(stores, how='left').merge(features, how='left')

In [212]:
def split_date(df):
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df.Date.dt.year
    df['Month'] = df.Date.dt.month
    df['Day'] = df.Date.dt.day
    df['WeekOfYear'] = (df.Date.dt.isocalendar().week)*1.0   
    
split_date(merged) 
split_date(testing_merged)

In [213]:
storetype_values = {'A':3, 'B':2, 'C':1}
merged['Type_Numeric'] = merged.Type.map(storetype_values)
testing_merged['Type_Numeric'] = testing_merged.Type.map(storetype_values)

merged['IsHoliday'] = merged['IsHoliday'].apply(lambda x:1 if x == True else 0)
testing_merged['IsHoliday'] = merged['IsHoliday'].apply(lambda x:1 if x == True else 0)

In [214]:
merged.fillna(0,inplace = True)
testing_merged['CPI'].fillna(testing_merged['CPI'].mean, inplace = True)
testing_merged['Unemployment'].fillna(testing_merged['Unemployment'].mean, inplace = True)
testing_merged['MarkDown1'].fillna(0, inplace = True)
testing_merged['MarkDown2'].fillna(0, inplace = True)
testing_merged['MarkDown3'].fillna(0, inplace = True)
testing_merged['MarkDown4'].fillna(0, inplace = True)

In [215]:
merged_1 = merged.drop(['Date', 'Temperature','Fuel_Price', 'Type', 'MarkDown1','MarkDown2','MarkDown3',
             'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Month', 'Day'], axis=1)

testing_1 = testing_merged.drop(['Date', 'Temperature','Fuel_Price', 'Type', 'MarkDown1','MarkDown2','MarkDown3',
             'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Month', 'Day'], axis=1)

In [216]:
input_cols = merged_1.columns.to_list()
input_cols.remove('Weekly_Sales')
target_col = 'Weekly_Sales'

inputs_df = merged_1[input_cols].copy()
targets = merged_1[target_col].copy()

In [217]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(inputs_df[input_cols])

inputs_df[input_cols] = scaler.transform(inputs_df[input_cols])
testing_1[input_cols] = scaler.transform(testing_merged[input_cols])

In [218]:
inputs_df.head(20)

Unnamed: 0,Store,Dept,IsHoliday,Size,Year,WeekOfYear,Type_Numeric
0,0.0,0.0,0.0,0.630267,0.0,0.078431,1.0
1,0.0,0.0,1.0,0.630267,0.0,0.098039,1.0
2,0.0,0.0,0.0,0.630267,0.0,0.117647,1.0
3,0.0,0.0,0.0,0.630267,0.0,0.137255,1.0
4,0.0,0.0,0.0,0.630267,0.0,0.156863,1.0
5,0.0,0.0,0.0,0.630267,0.0,0.176471,1.0
6,0.0,0.0,0.0,0.630267,0.0,0.196078,1.0
7,0.0,0.0,0.0,0.630267,0.0,0.215686,1.0
8,0.0,0.0,0.0,0.630267,0.0,0.235294,1.0
9,0.0,0.0,0.0,0.630267,0.0,0.254902,1.0


In [219]:
from sklearn.model_selection import train_test_split

train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs_df, targets, test_size=0.3, random_state=42)

In [220]:
def WMAE(df, targets, predictions):
    weights = df.IsHoliday.apply(lambda x: 5 if x else 1)
    return np.round(np.sum(weights*abs(targets-predictions))/(np.sum(weights)), 2)

In [221]:
import time
def evaluate_runtime_model (model, pred_df, target_df):
    run_time = []
    for _ in range(10):
        start_time = time.time()
        model.fit(pred_df,target_df)
        model_result = model.predict(pred_df)
        end_time = time.time()
        run_time.append(end_time - start_time)
        
        WMAEs = WMAE(pred_df,target_df,model_result)
        
        run_time = np.mean(run_time)
        
        results = {'time': run_time, 'Weighted mean absolute error': WMAEs}
        
        return run_time,WMAEs

In [222]:
from sklearn.ensemble import RandomForestRegressor


rf1 = RandomForestRegressor(n_jobs=-1, random_state=42)

rf1.fit(train_inputs, train_targets)
print(evaluate_runtime_model(rf1,val_inputs,val_targets))

(6.543325424194336, 673.43)


In [223]:
rf2 = RandomForestRegressor(n_estimators = 92, max_features = 7,
                            max_depth = 110, bootstrap = True,n_jobs=-1, random_state=42)

rf2.fit(train_inputs, train_targets)
print(evaluate_runtime_model(rf2,val_inputs,val_targets))

(5.481006383895874, 673.58)


In [224]:
rf2

RandomForestRegressor(max_depth=110, max_features=7, n_estimators=92, n_jobs=-1,
                      random_state=42)

In [225]:
rf3 = RandomForestRegressor(n_estimators = 130, max_features = 7,
                            max_depth = 85, bootstrap = True,n_jobs=-1, random_state=42)

rf3.fit(train_inputs, train_targets)
print(evaluate_runtime_model(rf3,val_inputs,val_targets))

(14.704982042312622, 670.86)


In [226]:
rf4 = RandomForestRegressor(n_estimators = 80, max_features = 7, bootstrap = True,n_jobs=-1, random_state=42)

rf4.fit(train_inputs, train_targets)
print(evaluate_runtime_model(rf3,val_inputs,val_targets))

(10.392449617385864, 670.86)


In [227]:
test_predit = rf2.predict(testing_1)
testing_merged['Weekly_Sales'] = test_predit
submission = pd.read_csv('sampleSubmission.csv')


submission['Weekly_Sales'] = test_predit
submission.to_csv('submission.csv',index=False)