In [1]:
import numpy as np
import pandas as pd
import plotly as plt
import matplotlib
import seaborn as sns


pd.set_option('display.max_columns', None)


In [2]:

store_df = pd.read_csv('store.csv',low_memory=False)
test_df = pd.read_csv('test.csv',low_memory=False)
train_df = pd.read_csv('train.csv',low_memory=False)


In [3]:
merged_df = pd.merge(train_df,store_df,how='left',on='Store')
merged_df_test = test_df.merge(store_df,how='left',on='Store')

In [4]:
merged_df['Date'].min() , merged_df['Date'].max() , merged_df_test['Date'].min(),merged_df_test['Date'].max()

('2013-01-01', '2015-07-31', '2015-08-01', '2015-09-17')

In [5]:
def split_date(df):
    df['Date']=pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['WeekOfYear'] = df['Date'].dt.isocalendar().week
    
split_date(merged_df)
split_date(merged_df_test)

In [6]:
merged_df[merged_df['Open']==0]

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear
291,292,5,2015-07-31,0,0,0,1,0,1,a,a,1100.0,6.0,2009.0,0,,,,2015,7,31,31
875,876,5,2015-07-31,0,0,0,1,0,1,a,a,21790.0,4.0,2005.0,1,18.0,2015.0,"Feb,May,Aug,Nov",2015,7,31,31
1406,292,4,2015-07-30,0,0,0,1,0,1,a,a,1100.0,6.0,2009.0,0,,,,2015,7,30,31
1990,876,4,2015-07-30,0,0,0,1,0,1,a,a,21790.0,4.0,2005.0,1,18.0,2015.0,"Feb,May,Aug,Nov",2015,7,30,31
2521,292,3,2015-07-29,0,0,0,1,0,1,a,a,1100.0,6.0,2009.0,0,,,,2015,7,29,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,1111,2,2013-01-01,0,0,0,0,a,1,a,a,1900.0,6.0,2014.0,1,31.0,2013.0,"Jan,Apr,Jul,Oct",2013,1,1,1
1017205,1112,2,2013-01-01,0,0,0,0,a,1,c,c,1880.0,4.0,2006.0,0,,,,2013,1,1,1
1017206,1113,2,2013-01-01,0,0,0,0,a,1,a,c,9260.0,,,0,,,,2013,1,1,1
1017207,1114,2,2013-01-01,0,0,0,0,a,1,a,c,870.0,,,0,,,,2013,1,1,1


In [7]:
merged_df =merged_df[merged_df['Open']==1].copy()

In [8]:
def comp_months(df):
    df['CompetitionOpen'] = 12 * (df['Year'] - df['CompetitionOpenSinceYear']) + df['CompetitionOpenSinceMonth']
    df['CompetitionOpen'] = df['CompetitionOpen'].map(lambda x: 0 if x <0 else x).fillna(0)

In [9]:
comp_months(merged_df)
comp_months(merged_df_test)
merged_df.CompetitionOpen.value_counts()

CompetitionOpen
0.0       312024
33.0       14404
21.0       14201
45.0       10698
69.0        8693
           ...  
204.0        175
211.0        174
197.0        174
1381.0       174
1369.0       147
Name: count, Length: 207, dtype: int64

In [26]:
def check_promo_month(row):
    month2str = {1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sept',10:'Oct',11:'Nov',12:'Dec'}
    
    try: 
        months= (row['PromoInterval'] or '').split(',')
        if row['Promo2Open'] and month2str[row['Month']] in months:
            return 1
        else: 
            return 0
    except Exception:
        return 0 

In [30]:
def promo_cols(df):
    df['Promo2Open'] = 12 * (df.Year - df.Promo2SinceYear) + (df.WeekOfYear-df.Promo2SinceWeek)/4.25
    df['Promo2Open'] = df['Promo2Open'].map(lambda x: 0 if x<0 else x).fillna(0) * df['Promo2']
    df['IsPromo2Month'] = df.apply(check_promo_month,axis=1) * df['Promo2']

In [50]:
promo_cols(merged_df)
promo_cols(merged_df_test)

In [42]:
merged_df.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval', 'Year', 'Month', 'Day',
       'WeekOfYear', 'CompetitionOpen', 'Promo2Open', 'IsPromo2Month'],
      dtype='object')

In [None]:
inputs_cols = ['Store', 'DayOfWeek','StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance','CompetitionOpenSinceYear','Year', 'Month', 'Day',
       'WeekOfYear', 'CompetitionOpen', 'Promo2Open', 'IsPromo2Month']

In [45]:
target_col = 'Sales'

In [47]:
input = merged_df[inputs_cols].copy()
target = merged_df[target_col].copy()

In [48]:
test_inputs = merged_df_test[inputs_cols].copy()

In [61]:
numeric_cols = ['Store','CompetitionDistance','Year', 'Month', 'Day',
       'WeekOfYear', 'CompetitionOpen', 'Promo2Open', 'IsPromo2Month']
cat_cols = ['DayOfWeek','StateHoliday','StoreType','Assortment']

In [62]:
input[cat_cols].isna().sum() # 0
test_inputs[numeric_cols].isna().sum()
input[numeric_cols].isna().sum()

Store                     0
CompetitionDistance    2186
Year                      0
Month                     0
Day                       0
WeekOfYear                0
CompetitionOpen           0
Promo2Open                0
IsPromo2Month             0
dtype: int64

In [64]:
max_distance=input.CompetitionDistance.max()
max_distance

np.float64(75860.0)

In [69]:
pd.set_option('future.no_silent_downcasting', True)

In [70]:
input['CompetitionDistance']=input['CompetitionDistance'].fillna(max_distance)
test_inputs['CompetitionDistance']=test_inputs['CompetitionDistance'].fillna(max_distance)