## Imports

In [2]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import os

In [3]:
def grouped_apply_agg(df, group, cols, names, functions):

    grouped = df.sort_values('date').groupby(group, sort=False)    

    for col in cols:
        for name, function in zip(names, functions):
            df[name + "_" + col] = grouped[col].apply(function).reset_index(level=[0,1,2], drop=True)

    return df


## Importing data

In [16]:
data_dir = "data"

info_db = pd.read_csv(os.path.join(data_dir, "train.csv"), index_col=0)
store_db = pd.read_csv(os.path.join(data_dir, "stores.csv"), index_col=0)
oil_db = pd.read_csv(os.path.join(data_dir, "oil.csv"))

holidays_db = pd.read_csv(os.path.join(data_dir, "holidays_events.csv"))

local_holidays_db = holidays_db[holidays_db['locale'] == 'Local']
local_holidays_db = holidays_db.rename(columns={'locale_name': 'city', 'type': 'h_type_loc', 'description': 'h_description_loc', 'transferred': 'h_transferred_loc'})
local_holidays_db = local_holidays_db.drop(columns=['locale'])
local_holidays_db = local_holidays_db.groupby(['date']).first().reset_index()

national_holidays_db = holidays_db[holidays_db['locale'] == 'National']
national_holidays_db = national_holidays_db.rename(columns={'type': 'h_type_nat', 'description': 'h_description_nat', 'transferred': 'h_transferred_nat'})
national_holidays_db = national_holidays_db.drop(columns=['locale', 'locale_name'])
national_holidays_db = national_holidays_db.groupby(['date']).first().reset_index()


transactions_db = pd.read_csv(os.path.join(data_dir, "transactions.csv"))

db = info_db.merge(store_db, on="store_nbr", how="left")

db = db.merge(oil_db, on="date", how="left")
db['dcoilwtico'] = db['dcoilwtico'].fillna(method='bfill')

db = db.merge(national_holidays_db, on="date", how="left")
db['h_type_nat'] = db['h_type_nat'].fillna('no_holiday')
db['h_description_nat'] = db['h_description_nat'].fillna('no_holiday')
db['h_transferred_nat'] = db['h_transferred_nat'].fillna('no_holiday')

db = db.merge(local_holidays_db, on=["date", 'city'], how="left")
db['h_type_loc'] = db['h_type_loc'].fillna('no_holiday')
db['h_description_loc'] = db['h_description_loc'].fillna('no_holiday')
db['h_transferred_loc'] = db['h_transferred_loc'].fillna('no_holiday')

db = db.merge(transactions_db, on=["date", "store_nbr"], how="left")
db['transactions'] = db['transactions'].fillna(0)

db['date'] = pd.to_datetime(db['date'])

db['year'] = pd.to_datetime(db['date']).dt.year
db['month'] = pd.to_datetime(db['date']).dt.month
db['day'] = pd.to_datetime(db['date']).dt.day
db['day_of_week'] = db['date'].dt.dayofweek

db = db[db['date'] >= '2013-02-01']

group = ['family', 'day_of_week', 'store_nbr']
cols = ['sales', 'transactions']
names = ['dow_avg', 'dow_rolling_3', 'dow_rolling_7']
agg_funcs = [
    lambda x: x.expanding().mean().shift(1).fillna(0),
    lambda x: x.rolling(window=3).mean().shift(1).fillna(0),
    lambda x: x.rolling(window=7).mean().shift(1).fillna(0),
]
dow_aves_db = grouped_apply_agg(
    db[['date', 'store_nbr', 'transactions', 'family', 'sales', 'day_of_week']].copy(),
    group,
    cols, 
    names, 
    agg_funcs
).drop(columns=['sales', 'transactions', 'day_of_week'])

db = db.merge(dow_aves_db, on=['date','store_nbr', 'family'], how='left')

group = ['family', 'store_nbr']
cols = ['sales', 'transactions']
names = ['rolling_7', 'rolling_14']
agg_funcs = [
    lambda x: x.rolling(window=7).mean().shift(1).fillna(0),
    lambda x: x.rolling(window=14).mean().shift(1).fillna(0),
]
rolling_db = grouped_apply_agg(
    db[['date', 'store_nbr', 'transactions', 'family', 'sales']].copy(),
    group,
    cols, 
    names, 
    agg_funcs
).drop(columns=['sales', 'transactions'])

db = db.merge(rolling_db, on=['date','store_nbr', 'family'], how='left')

db = db.sort_index()
db = db.drop(columns='transactions')



In [17]:
display(db.head(20))
display(db.info())

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,dcoilwtico,...,dow_avg_sales,dow_rolling_3_sales,dow_rolling_7_sales,dow_avg_transactions,dow_rolling_3_transactions,dow_rolling_7_transactions,rolling_7_sales,rolling_14_sales,rolling_7_transactions,rolling_14_transactions
0,2013-02-01,1,AUTOMOTIVE,3.0,0,Quito,Pichincha,D,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2013-02-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2013-02-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2013-02-01,1,BEVERAGES,941.0,0,Quito,Pichincha,D,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2013-02-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2013-02-01,1,BREAD/BAKERY,318.274,0,Quito,Pichincha,D,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2013-02-01,1,CELEBRATION,0.0,0,Quito,Pichincha,D,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2013-02-01,1,CLEANING,739.0,0,Quito,Pichincha,D,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,1.428571,0.0,1540.428571,0.0
8,2013-02-01,1,DAIRY,430.0,0,Quito,Pichincha,D,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,1.285714,0.0,1550.714286,0.0
9,2013-02-01,1,DELI,122.34,0,Quito,Pichincha,D,13,97.46,...,0.0,0.0,0.0,0.0,0.0,0.0,1.857143,0.0,1511.714286,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2945646 entries, 0 to 2945645
Data columns (total 30 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   date                        datetime64[ns]
 1   store_nbr                   int64         
 2   family                      object        
 3   sales                       float64       
 4   onpromotion                 int64         
 5   city                        object        
 6   state                       object        
 7   type                        object        
 8   cluster                     int64         
 9   dcoilwtico                  float64       
 10  h_type_nat                  object        
 11  h_description_nat           object        
 12  h_transferred_nat           object        
 13  h_type_loc                  object        
 14  h_description_loc           object        
 15  h_transferred_loc           object        
 16  year              

None

## Renaming and Assigning classes

In [18]:
col_rename = {
    'dcoilwtico': 'oil',
    'type': 'store_type',
}

db = db.rename(columns=col_rename)

col_str_to_class_label = {}

for col in db.columns:
    # skip if the column is numeric
    if is_numeric_dtype(db[col]):
        continue
    elif col == 'date':
        continue
    str_to_class = {}
    for i, val in enumerate(db[col].unique()):
        str_to_class[val] = i
    col_str_to_class_label[col] = str_to_class

for col, dict in col_str_to_class_label.items():
    db[col] = db[col].map(dict)

display(db.tail())

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,store_type,cluster,oil,...,dow_avg_sales,dow_rolling_3_sales,dow_rolling_7_sales,dow_avg_transactions,dow_rolling_3_transactions,dow_rolling_7_transactions,rolling_7_sales,rolling_14_sales,rolling_7_transactions,rolling_14_transactions
2945641,2017-08-15,9,28,438.133,0,0,0,2,6,47.57,...,430.204263,416.112333,382.635571,2065.584746,2039.333333,1989.0,802.714286,1048.214286,525.428571,560.071429
2945642,2017-08-15,9,29,154.553,1,0,0,2,6,47.57,...,80.487241,71.765332,96.329142,2065.584746,2039.333333,1989.0,796.285714,1048.0,516.571429,560.428571
2945643,2017-08-15,9,30,2419.729,148,0,0,2,6,47.57,...,1321.552316,2290.074,2261.490429,2065.584746,2039.333333,1989.0,748.857143,981.785714,508.857143,549.0
2945644,2017-08-15,9,31,121.0,8,0,0,2,6,47.57,...,5.737288,126.0,54.857143,2065.584746,2039.333333,1989.0,700.428571,922.0,487.285714,541.571429
2945645,2017-08-15,9,32,16.0,0,0,0,2,6,47.57,...,14.258445,17.457333,12.538857,2065.584746,2039.333333,1989.0,671.571429,884.214286,486.714286,535.857143


## Calculating Correlation

In [19]:
rows = [
    'sales',
    'store_nbr',
    'family',
    'city',
    'state',
    'store_type',
    'cluster',
    'oil',
    'month',
    'day',
    'h_type_nat',
    'h_type_loc',
    'dow_avg_sales',
    'dow_rolling_3_sales',
    'rolling_7_sales',
    'rolling_14_sales',
    'dow_avg_transactions',
    'dow_rolling_3_transactions',
    'rolling_7_transactions',
    'rolling_14_transactions'
]

cov_db = db[rows].copy()
for col in cov_db.columns:
    cov_db[col] = cov_db[col].astype(np.float32)

cov_matrix = cov_db.corr()

display(cov_matrix)

del cov_db

Unnamed: 0,sales,store_nbr,family,city,state,store_type,cluster,oil,month,day,h_type_nat,h_type_loc,dow_avg_sales,dow_rolling_3_sales,rolling_7_sales,rolling_14_sales,dow_avg_transactions,dow_rolling_3_transactions,rolling_7_transactions,rolling_14_transactions
sales,1.0,0.04148249,-0.113951,-0.09680985,-0.09927258,0.1005338,0.03857158,-0.07298275,0.01572191,-0.0118025,0.01745075,0.004853727,0.917353,0.946211,-0.007179,-0.007175,0.2154982,0.2233816,-0.020327,-0.020152
store_nbr,0.041482,1.0,4.869794e-18,0.5886759,0.5874486,0.5719704,-0.05928399,3.477547e-15,-4.433827e-15,-1.437977e-16,-1.399131e-15,-0.002436825,0.035217,0.041871,-0.001238,-0.001211,0.07445121,0.1087874,-0.002495,-0.002483
family,-0.113951,4.869794e-18,1.0,-1.801533e-20,7.083597999999999e-19,2.097972e-18,4.120518e-19,-2.789456e-14,8.949784e-15,8.563251e-16,1.267927e-15,4.0147e-16,-0.139712,-0.11567,0.000394,0.000268,1.044264e-15,1.760395e-15,0.000289,0.000223
city,-0.09681,0.5886759,-1.801533e-20,1.0,0.9852099,-0.1113357,-0.4230199,2.586724e-15,5.014741e-16,3.4801620000000004e-17,-3.1173280000000004e-17,-0.01529301,-0.109328,-0.098662,-0.000237,-9.9e-05,-0.4871684,-0.4607877,-0.000281,-0.000229
state,-0.099273,0.5874486,7.083597999999999e-19,0.9852099,1.0,-0.107382,-0.4167306,-2.768584e-14,7.551043e-16,6.763881e-16,2.387367e-15,-0.01330964,-0.11125,-0.10118,-1.6e-05,9.6e-05,-0.4917399,-0.4682326,-0.000358,-0.000304
store_type,0.100534,0.5719704,2.097972e-18,-0.1113357,-0.107382,1.0,0.3880565,1.646692e-14,-2.966056e-15,-2.03012e-16,8.637325e-16,0.007276521,0.102002,0.101882,-0.000341,-0.000355,0.365086,0.3854589,-0.001892,-0.001837
cluster,0.038572,-0.05928399,4.120518e-19,-0.4230199,-0.4167306,0.3880565,1.0,-3.428183e-14,2.914834e-15,4.200267e-16,7.677688e-16,0.01085352,0.043412,0.039239,0.001252,0.001075,0.1967398,0.1872026,0.000132,0.000158
oil,-0.072983,3.477547e-15,-2.789456e-14,2.586724e-15,-2.768584e-14,1.646692e-14,-3.428183e-14,1.0,0.0412952,-0.003314884,-0.05361491,-0.00397998,-0.053754,-0.079486,0.044285,0.044438,-0.03489266,-0.07185831,0.158511,0.158266
month,0.015722,-4.433827e-15,8.949784e-15,5.014741e-16,7.551043e-16,-2.966056e-15,2.914834e-15,0.0412952,1.0,0.003961613,0.0526595,0.05228632,0.000373,0.006167,0.034452,0.034535,0.005040334,0.006003687,0.07626,0.076185
day,-0.011802,-1.437977e-16,8.563251e-16,3.4801620000000004e-17,6.763881e-16,-2.03012e-16,4.200267e-16,-0.003314884,0.003961613,1.0,0.005733858,-0.02023306,0.001939,0.005191,0.009476,0.00949,0.008857926,0.01667325,-0.020291,-0.020253


## Saving Data

In [20]:
db.to_csv(os.path.join(data_dir, "train_data.csv"), index=False)

## Making Test Data Set

In [21]:
test_df = pd.read_csv(os.path.join(data_dir, "test.csv"), index_col=0)
index_col = test_df.index

test_df = test_df.merge(store_db, on="store_nbr", how="left")
test_df = test_df.merge(oil_db, on="date", how="left")
test_df['dcoilwtico'] = test_df['dcoilwtico'].fillna(method='bfill')

test_df = test_df.merge(holidays_db, on="date", how="left")
test_df['h_type'] = test_df['h_type'].fillna('no_holiday')
test_df['h_locale'] = test_df['h_locale'].fillna('no_locale')
test_df['h_locale_name'] = test_df['h_locale_name'].fillna('no_locale_name')

test_df['date'] = pd.to_datetime(test_df['date'])

test_df['year'] = pd.to_datetime(test_df['date']).dt.year
test_df['month'] = pd.to_datetime(test_df['date']).dt.month
test_df['day'] = pd.to_datetime(test_df['date']).dt.day
test_df['day_of_week'] = test_df['date'].dt.dayofweek

test_df = test_df.rename(columns=col_rename)

display(test_df.head())

for col, dict in col_str_to_class_label.items():
    print(col, dict)
    test_df[col] = test_df[col].map(dict)

most_recent_dow_aves = db.groupby(['store_nbr', 'family', 'day_of_week']).tail(1)
most_recent_dow_aves = most_recent_dow_aves[['store_nbr', 'family', 'day_of_week', 'dow_avg_sales', 'rolling_7_sales', 'rolling_14_sales', 'rolling_30_sales', 'dow_avg_transactions', 'rolling_7_transactions', 'rolling_14_transactions', 'rolling_30_transactions']]

test_df = test_df.merge(most_recent_dow_aves, on=['store_nbr', 'family', 'day_of_week'], how='left')
test_df = test_df.drop(columns=["date"])
test_df.index = index_col

display(test_df.head())



KeyError: 'h_type'

In [None]:
test_df.to_csv(os.path.join(data_dir, "test_data.csv"), index=True)