In [12]:
import numpy as np
import pandas as pd
import datetime as dt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sbn
import scipy.stats as stats

We will add all the additional features to our dataframe

In [13]:
train = pd.read_csv('train.csv')
stores = pd.read_csv('stores.csv')
oil = pd.read_csv('oil.csv')
transactions = pd.read_csv('transactions.csv')
holidays = pd.read_csv('holidays_events.csv')

In [14]:
#this is Lars's code to add in a time variable
dates_dt = pd.to_datetime(train['date'])
dates_dt_min = dates_dt.min()
dates_days = (dates_dt - dates_dt_min).dt.days

display(dates_days)
train['time'] = dates_days
display(train)

#we also add a time variable to the holidays dataframe
holidays_days = (pd.to_datetime(holidays['date']) - dates_dt_min).dt.days

display(holidays_days)
holidays['time'] = holidays_days
display(holidays)

0             0
1             0
2             0
3             0
4             0
           ... 
3000883    1687
3000884    1687
3000885    1687
3000886    1687
3000887    1687
Name: date, Length: 3000888, dtype: int64

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,time
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,0
1,1,2013-01-01,1,BABY CARE,0.000,0,0
2,2,2013-01-01,1,BEAUTY,0.000,0,0
3,3,2013-01-01,1,BEVERAGES,0.000,0,0
4,4,2013-01-01,1,BOOKS,0.000,0,0
...,...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0,1687
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,1687
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148,1687
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,1687


0      -305
1      -275
2      -264
3      -262
4      -255
       ... 
345    1816
346    1817
347    1818
348    1819
349    1820
Name: date, Length: 350, dtype: int64

Unnamed: 0,date,type,locale,locale_name,description,transferred,time
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False,-305
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False,-275
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False,-264
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False,-262
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False,-255
...,...,...,...,...,...,...,...
345,2017-12-22,Additional,National,Ecuador,Navidad-3,False,1816
346,2017-12-23,Additional,National,Ecuador,Navidad-2,False,1817
347,2017-12-24,Additional,National,Ecuador,Navidad-1,False,1818
348,2017-12-25,Holiday,National,Ecuador,Navidad,False,1819


First we define a function to check if a day is a holiday or not for a specific store, dependent on the region and locale of the store.

In [15]:
#write a function to check if a day is a holiday or event for a specific store
def is_holiday(t, st):
    #check if the day shows up in the holidays_events dataset
    if np.isin(t,holidays['time']):
        #if it is record the index of the holiday
        holidayindex = np.where(holidays['time'] == t)[0][0]
        storeindex = np.where(stores['store_nbr'] == st)[0][0]
        #check if the holiday is transferred away from this day
        if holidays['transferred'][holidayindex]:
            #if the holiday is transferred treat it like a normal day
            return 'No'
        else:
            #check if the holiday is transferred from another holiday
            if holidays['type'][holidayindex] == 'Transfer':
                #if it is transferred treat it like a holiday
                d = 'Holiday'
            else:
                d = holidays['type'][holidayindex]
            #check if the holiday affects the region
            if holidays['locale'][holidayindex] == 'National':
                return d
            else:
                #check if the holiday is regional or local
                if holidays['locale'][holidayindex] == "Regional":
                    #check if the store is in the region
                    if stores['state'][storeindex] == holidays['locale_name'][holidayindex]:
                        return d
                    else:
                        return 'No'
                else:
                    #check if the store is in the locality of the local holiday
                        if stores['city'][storeindex] == holidays['locale_name'][holidayindex]:
                            return d
                        else:
                            return 'No'
    #if it is not a holiday return 'No'
    else:
        return 'No'

                    

We then test that it works

In [16]:
print(is_holiday(t=341, st = 38))
print(is_holiday(t=341, st = 3))
print(is_holiday(t=0, st = 3))

Holiday
No
Holiday


Now we add holidays to the test dataframe

In [17]:
vector_holiday = np.vectorize(is_holiday)

holiday_ar = vector_holiday(train['time'],train['store_nbr'])

display(holiday_ar)

array(['Holiday', 'Holiday', 'Holiday', ..., 'No', 'No', 'No'],
      dtype='<U10')

In [18]:
train['holiday'] = holiday_ar
display(train)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,time,holiday
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,0,Holiday
1,1,2013-01-01,1,BABY CARE,0.000,0,0,Holiday
2,2,2013-01-01,1,BEAUTY,0.000,0,0,Holiday
3,3,2013-01-01,1,BEVERAGES,0.000,0,0,Holiday
4,4,2013-01-01,1,BOOKS,0.000,0,0,Holiday
...,...,...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0,1687,No
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,1687,No
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148,1687,No
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,1687,No


Now we add the rest of our features

In [19]:
#This is Ciaran's code to add days since last paycheck for public employees

def day_date_int(str):
    return 10 * int(str[-2]) + int(str[-1])


lastdayofmonth = {'01-31', '02-28', '03-31', '04-30', '05-31', '06-30', '07-31', '08-31', '09-30', '10-31', '11-30',
                  '12-31'}


def datetodayssincelastpayday(str):
    if str == '2016-02-28':
        return 13
    if str == '2016-02-29':
        return 0
    if str[-5:] in lastdayofmonth:
        return 0
    else:
        if day_date_int(str) >= 15:
            return day_date_int(str) - 15
        else:
            return day_date_int(str)


arr = [0] * len(train['date'])
for i in range(len(train['date'])):
    arr[i] = datetodayssincelastpayday(train['date'][i])
train['dayssincepaid'] = arr

In [20]:
display(train)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,time,holiday,dayssincepaid
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,0,Holiday,1
1,1,2013-01-01,1,BABY CARE,0.000,0,0,Holiday,1
2,2,2013-01-01,1,BEAUTY,0.000,0,0,Holiday,1
3,3,2013-01-01,1,BEVERAGES,0.000,0,0,Holiday,1
4,4,2013-01-01,1,BOOKS,0.000,0,0,Holiday,1
...,...,...,...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0,1687,No,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,1687,No,0
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148,1687,No,0
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,1687,No,0


Now we add Lars's code to complete the oil data to fill in the gaps on weekends when oil was not reported

In [21]:
oil_txf = oil.copy()
oil_txf['date'] = pd.to_datetime(oil['date'])
oil_txf['day_num'] = (oil_txf['date'] - dates_dt_min).dt.days

In [22]:
# def get_oil_price(date):
#     if date.day_of_week == 5:
#         return oil_txf
# date = oil_filled['date'].iloc[1]
def get_oil_price(date : dt.datetime):
    # print(date)
    # print(date.day_of_week)
    tgt = date
    if date.day_of_week == 5:
        tgt = date - dt.timedelta(days = 1)
    if date.day_of_week == 6:
        tgt = date - dt.timedelta(days = 2)
    # print(tgt)
    price = oil_txf[oil_txf['date'] == tgt]['dcoilwtico'].iloc[0]
    return price

In [23]:
time_arange = np.arange(1688)
oil_filled = pd.DataFrame(index = time_arange, data = {
    'day_num' : pd.to_timedelta(time_arange, unit = 'days')
})
oil_filled['date'] = oil_filled['day_num'] + dates_dt_min
oil_filled['price'] = oil_filled['date'].apply(get_oil_price)
display(oil_filled)

Unnamed: 0,day_num,date,price
0,0 days,2013-01-01,
1,1 days,2013-01-02,93.14
2,2 days,2013-01-03,92.97
3,3 days,2013-01-04,93.12
4,4 days,2013-01-05,93.12
...,...,...,...
1683,1683 days,2017-08-11,48.81
1684,1684 days,2017-08-12,48.81
1685,1685 days,2017-08-13,48.81
1686,1686 days,2017-08-14,47.59


Now we add the filled in oil feature to our dataframe

In [24]:
train['oil'] = train['time'].apply(oil_filled['price'].get)

In [25]:
display(train)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,time,holiday,dayssincepaid,oil
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,0,Holiday,1,
1,1,2013-01-01,1,BABY CARE,0.000,0,0,Holiday,1,
2,2,2013-01-01,1,BEAUTY,0.000,0,0,Holiday,1,
3,3,2013-01-01,1,BEVERAGES,0.000,0,0,Holiday,1,
4,4,2013-01-01,1,BOOKS,0.000,0,0,Holiday,1,
...,...,...,...,...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0,1687,No,0,47.57
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,1687,No,0,47.57
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148,1687,No,0,47.57
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,1687,No,0,47.57


In [26]:
train['dayofweek'] = pd.to_datetime(train['date']).dt.day_of_week
train['month'] = pd.to_datetime(train['date']).dt.month

In [27]:
display(train)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,time,holiday,dayssincepaid,oil,dayofweek,month
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,0,Holiday,1,,1,1
1,1,2013-01-01,1,BABY CARE,0.000,0,0,Holiday,1,,1,1
2,2,2013-01-01,1,BEAUTY,0.000,0,0,Holiday,1,,1,1
3,3,2013-01-01,1,BEVERAGES,0.000,0,0,Holiday,1,,1,1
4,4,2013-01-01,1,BOOKS,0.000,0,0,Holiday,1,,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0,1687,No,0,47.57,1,8
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,1687,No,0,47.57,1,8
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148,1687,No,0,47.57,1,8
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,1687,No,0,47.57,1,8


Now we do some feature selection. We will proceed work forward and check the adjusted Rsquared for various features related to sales. This was done alongside Ciaran and the code was from both of us (modified from notebook 3b from the second INMAS workshop).

We first start with a very coarse look at all the stores and families of items together.

In [28]:
results_oil_only = smf.ols("sales ~ oil", data = train).fit()
results_dayofweek_only = smf.ols("sales ~ dayofweek", data = train).fit()
results_holiday_only = smf.ols("sales ~ holiday", data = train).fit()
results_dayssincepaid_only = smf.ols("sales ~ dayssincepaid", data = train).fit()
#results_lag_only = smf.ols("sales ~ lag", data = train).fit()
results_on_promotion_only = smf.ols("sales ~ onpromotion", data = train).fit()
results_month_only = smf.ols("sales ~ month", data = train).fit()


print("Adj Rsquared for oil only is "+ str(results_oil_only.rsquared_adj))
print("Adj Rsquared for dayofweek only is "+ str(results_dayofweek_only.rsquared_adj))
print("Adj Rsquared for holiday only is "+ str(results_holiday_only.rsquared_adj))
print("Adj Rsquared for dayssincepaid only is "+ str(results_dayssincepaid_only.rsquared_adj))
#print("Adj Rsquared for lag only is "+ str(results_lag_only.rsquared_adj))
print("Adj Rsquared for onpromotion only is "+ str(results_on_promotion_only.rsquared_adj))
print("Adj Rsquared for month only is "+ str(results_month_only.rsquared_adj))

Adj Rsquared for oil only is 0.005631195206287631
Adj Rsquared for dayofweek only is 0.0013589882968736422
Adj Rsquared for holiday only is 0.0005085268305171642
Adj Rsquared for dayssincepaid only is 0.00017219533383638908
Adj Rsquared for onpromotion only is 0.18311799700299636
Adj Rsquared for month only is 0.0003912977876842083


Now we separate by the different families of items and look at how each individual family relates to the various features

In [31]:
#This is Ciaran's code to stratify by features
data = train.copy()
data_AUTOMOTIVE=data[data['family']=='AUTOMOTIVE']
data_BABYCARE=data[data['family']=='BABY CARE']
data_BEAUTY=data[data['family']=='BEAUTY']
data_BEVERAGES=data[data['family']=='BEVERAGES']
data_BOOKS=data[data['family']=='BOOKS']
data_BREADBAKERY=data[data['family']=='BREAD/BAKERY']
data_CELEBRATION=data[data['family']=='CELEBRATION']
data_CLEANING=data[data['family']=='CLEANING']
data_DAIRY=data[data['family']=='DAIRY']
data_DELI=data[data['family']=='DELI']
data_EGGS=data[data['family']=='EGGS']
data_FROZENFOODS=data[data['family']=='FROZEN FOODS']
data_GROCERYI=data[data['family']=='GROCERY I']
data_GROCERYII=data[data['family']=='GROCERY II']
data_HARDWARE=data[data['family']=='HARDWARE']
data_HOMEANDKITCHENI=data[data['family']=='HOME AND KITCHEN I']
data_HOMEANDKITCHENII=data[data['family']=='HOME AND KITCHEN II']
data_HOMEAPPLIANCES=data[data['family']=='HOME APPLIANCES']
data_HOMECARE=data[data['family']=='HOME CARE']
data_LADIESWEAR=data[data['family']=='LADIESWEAR']
data_LAWNANDGARDEN=data[data['family']=='LAWN AND GARDEN']
data_LINGERIE=data[data['family']=='LINGERIE']
data_LIQUORWINEBEER=data[data['family']=='LIQUOR,WINE,BEER']
data_MAGAZINES=data[data['family']=='MAGAZINES']
data_MEATS=data[data['family']=='MEATS']
data_PERSONALCARE=data[data['family']=='PERSONAL CARE']
data_PETSUPPLIES=data[data['family']=='PET SUPPLIES']
data_PLAYERSANDELECTRONICS=data[data['family']=='PLAYERS AND ELECTRONICS']
data_POULTRY=data[data['family']=='POULTRY']
data_PREPAREDFOODS=data[data['family']=='PREPARED FOODS']
data_PRODUCE=data[data['family']=='PRODUCE']
data_SCHOOLANDOFFICESUPPPLIES=data[data['family']=='SCHOOL AND OFFICE SUPPLIES']
data_SEAFOOD=data[data['family']=='SEAFOOD']

data_stratified_by_type=[data_AUTOMOTIVE,
data_BABYCARE,
data_BEAUTY,
data_BEVERAGES,
data_BOOKS,
data_BREADBAKERY,
data_CELEBRATION,
data_CLEANING,
data_DAIRY,
data_DELI,
data_EGGS,
data_FROZENFOODS,
data_GROCERYI,
data_GROCERYII,
data_HARDWARE,
data_HOMEANDKITCHENI,
data_HOMEANDKITCHENII,
data_HOMEAPPLIANCES,
data_HOMECARE,
data_LADIESWEAR,
data_LAWNANDGARDEN,
data_LINGERIE,
data_LIQUORWINEBEER,
data_MAGAZINES,
data_MEATS,
data_PERSONALCARE,
data_PETSUPPLIES,
data_PLAYERSANDELECTRONICS,
data_POULTRY,
data_PREPAREDFOODS,
data_PRODUCE,
data_SCHOOLANDOFFICESUPPPLIES,
data_SEAFOOD]


Families = ['AUTOMOTIVE', 'BABY CARE', 'BEAUTY', 'BEVERAGES', 'BOOKS',
       'BREADBAKERY', 'CELEBRATION', 'CLEANING', 'DAIRY', 'DELI', 'EGGS',
       'FROZEN FOODS', 'GROCERY I', 'GROCERY II', 'HARDWARE',
       'HOME AND KITCHEN I', 'HOME AND KITCHEN II', 'HOME APPLIANCES',
       'HOME CARE', 'LADIESWEAR', 'LAWN AND GARDEN', 'LINGERIE',
       'LIQUOR,WINE,BEER', 'MAGAZINES', 'MEATS', 'PERSONAL CARE',
       'PET SUPPLIES', 'PLAYERS AND ELECTRONICS', 'POULTRY', 'PREPARED FOODS',
       'PRODUCE', 'SCHOOL AND OFFICE SUPPLIES', 'SEAFOOD']

In [32]:
for x in range(len(data_stratified_by_type)):
    results_oil_only = smf.ols("sales ~ oil + time", data = data_stratified_by_type[x]).fit()
    results_dayofweek_only = smf.ols("sales ~ dayofweek + time", data = data_stratified_by_type[x]).fit()
    results_holiday_only = smf.ols("sales ~ holiday + time", data = data_stratified_by_type[x]).fit()
    results_dayssincepaid_only = smf.ols("sales ~ dayssincepaid + time", data = data_stratified_by_type[x]).fit()
    #results_lag_only = smf.ols("sales ~ lag", data = x).fit()
    results_on_promotion_only = smf.ols("sales ~ onpromotion + time", data = data_stratified_by_type[x]).fit()
    results_month_only = smf.ols("sales ~ month + time", data = data_stratified_by_type[x]).fit()


    print("Adj Rsquared for " + Families[x] + " with respect to oil only is "+ str(results_oil_only.rsquared_adj))
    print("Adj Rsquared for " + Families[x] + " with respect to dayofweek only is "+ str(results_dayofweek_only.rsquared_adj))
    print("Adj Rsquared for " + Families[x] + " with respect to holiday only is "+ str(results_holiday_only.rsquared_adj))
    print("Adj Rsquared for " + Families[x] + " with respect to dayssincepaid only is "+ str(results_dayssincepaid_only.rsquared_adj))
    #print("Adj Rsquared for " + Families[x] + " with respect to lag only is "+ str(results_lag_only.rsquared_adj))
    print("Adj Rsquared for " + Families[x] + " with respect to onpromotion only is "+ str(results_on_promotion_only.rsquared_adj))
    print("Adj Rsquared for " + Families[x] + " with respect to month only is "+ str(results_month_only.rsquared_adj))

Adj Rsquared for AUTOMOTIVE with respect to oil only is 0.02370139039616248
Adj Rsquared for AUTOMOTIVE with respect to dayofweek only is 0.051233369169116916
Adj Rsquared for AUTOMOTIVE with respect to holiday only is 0.025272655674806188
Adj Rsquared for AUTOMOTIVE with respect to dayssincepaid only is 0.024024402111693544
Adj Rsquared for AUTOMOTIVE with respect to onpromotion only is 0.0586760467219789
Adj Rsquared for AUTOMOTIVE with respect to month only is 0.023178800110876363
Adj Rsquared for BABY CARE with respect to oil only is 0.0199341787684455
Adj Rsquared for BABY CARE with respect to dayofweek only is 0.020247953829461984
Adj Rsquared for BABY CARE with respect to holiday only is 0.020218150851972982
Adj Rsquared for BABY CARE with respect to dayssincepaid only is 0.020360845791049353
Adj Rsquared for BABY CARE with respect to onpromotion only is 0.022833058828146013
Adj Rsquared for BABY CARE with respect to month only is 0.02018591192286645
Adj Rsquared for BEAUTY with

Definitely seeing a bit more than we did when we just looked at everything together. Most families seem to depend on 'onpromotion' and 'oil' strongest. But a few like 'LIQUORWINEBEER' seem to be related to 'holidays'

Let's dig deeper and look at trends by individual store as well as family. Since this will be so many cases we will only record the ones that are above a threshold of relevance.

In [33]:
#We make a dictionary to store any meaningful relations that we notice
meaningful = {}
for x in range(len(data_stratified_by_type)):
    for y in range(1, 55):
        results_oil_only = smf.ols("sales ~ oil", data = data_stratified_by_type[x][(data_stratified_by_type[x]['store_nbr'] == y)]).fit()
        results_dayofweek_only = smf.ols("sales ~ dayofweek", data = data_stratified_by_type[x][(data_stratified_by_type[x]['store_nbr'] == y)]).fit()
        results_holiday_only = smf.ols("sales ~ holiday", data = data_stratified_by_type[x][(data_stratified_by_type[x]['store_nbr'] == y)]).fit()
        results_dayssincepaid_only = smf.ols("sales ~ dayssincepaid", data = data_stratified_by_type[x][(data_stratified_by_type[x]['store_nbr'] == y)]).fit()
        #results_lag_only = smf.ols("sales ~ lag", data = x).fit()
        results_on_promotion_only = smf.ols("sales ~ onpromotion", data = data_stratified_by_type[x][(data_stratified_by_type[x]['store_nbr'] == y)]).fit()
        results_month_only = smf.ols("sales ~ month", data = data_stratified_by_type[x][(data_stratified_by_type[x]['store_nbr'] == y)]).fit()

        listmeaningful = []
        #here we set 0.1 as our threshhold for meaningfulness, but this can be changed to either loosen or strengthen the relationship
        if results_oil_only.rsquared_adj > 0.1:
            listmeaningful.append('oil')
            #print("Adj Rsquared for " + Families[x] + " at store number " + str(y) + " with respect to oil only is "+ str(results_oil_only.rsquared_adj))
        if results_dayofweek_only.rsquared_adj > 0.1:
            listmeaningful.append('dayofweek')
            #print("Adj Rsquared for " + Families[x] + " at store number " + str(y) + " with respect to dayofweek only is "+ str(results_dayofweek_only.rsquared_adj))
        if results_holiday_only.rsquared_adj > 0.1:
            listmeaningful.append('holidayl')
            #print("Adj Rsquared for " + Families[x] + " at store number " + str(y) + " with respect to holiday only is "+ str(results_holiday_only.rsquared_adj))
        if results_dayssincepaid_only.rsquared_adj > 0.1:
            listmeaningful.append('dayssincepaid')
            #print("Adj Rsquared for " + Families[x] + " at store number " + str(y) + " with respect to dayssincepaid only is "+ str(results_dayssincepaid_only.rsquared_adj))
        #print("Adj Rsquared for " + Families[x] + " with respect to lag only is "+ str(results_lag_only.rsquared_adj))
        if results_on_promotion_only.rsquared_adj > 0.1:
            listmeaningful.append('on_promotion')
            #print("Adj Rsquared for " + Families[x] + " at store number " + str(y) + " with respect to onpromotion only is "+ str(results_on_promotion_only.rsquared_adj))
        if results_month_only.rsquared_adj > 0.1:
            listmeaningful.append('month')
            #print("Adj Rsquared for " + Families[x] + " at store number " + str(y) + " with respect to month only is "+ str(results_month_only.rsquared_adj))
        if listmeaningful:
            meaningful[(Families[x],y)] = listmeaningful

  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


Now we have a dictionary which returns a list of meaningful features for any pair of family and store. This would be a starting point to tweak our model on these features.

In [34]:
display(meaningful)

{('AUTOMOTIVE', 3): ['dayofweek'],
 ('AUTOMOTIVE', 19): ['oil'],
 ('AUTOMOTIVE', 20): ['oil'],
 ('AUTOMOTIVE', 21): ['oil'],
 ('AUTOMOTIVE', 22): ['oil'],
 ('AUTOMOTIVE', 27): ['oil', 'dayofweek'],
 ('AUTOMOTIVE', 29): ['oil'],
 ('AUTOMOTIVE', 31): ['oil'],
 ('AUTOMOTIVE', 33): ['oil'],
 ('AUTOMOTIVE', 35): ['oil'],
 ('AUTOMOTIVE', 36): ['oil'],
 ('AUTOMOTIVE', 38): ['dayofweek'],
 ('AUTOMOTIVE', 42): ['oil'],
 ('AUTOMOTIVE', 44): ['dayofweek'],
 ('AUTOMOTIVE', 45): ['dayofweek'],
 ('AUTOMOTIVE', 46): ['dayofweek'],
 ('AUTOMOTIVE', 47): ['dayofweek'],
 ('AUTOMOTIVE', 48): ['dayofweek'],
 ('AUTOMOTIVE', 50): ['dayofweek'],
 ('AUTOMOTIVE', 52): ['on_promotion'],
 ('AUTOMOTIVE', 53): ['oil'],
 ('AUTOMOTIVE', 54): ['oil'],
 ('BABY CARE', 31): ['oil'],
 ('BEAUTY', 1): ['on_promotion'],
 ('BEAUTY', 3): ['oil', 'dayofweek', 'on_promotion'],
 ('BEAUTY', 6): ['oil', 'on_promotion'],
 ('BEAUTY', 8): ['oil', 'on_promotion'],
 ('BEAUTY', 9): ['oil', 'on_promotion'],
 ('BEAUTY', 11): ['oil', 'on_pr