In [1]:
import pandas as pd
import wrangle
import new_wrangle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import model as m

import math
from sklearn.metrics import mean_squared_error, explained_variance_score

import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor 
from sklearn.model_selection import cross_val_score


# Wrangle

In [2]:
df =new_wrangle.wrangle_walmart()

# split and scale before modeling

In [69]:
train, test,  X_train_scaled, X_test_scaled, y_train, y_test = new_wrangle.split_scale(df, 'next_week_sales_target', MinMaxScaler())

train -> (2866, 32)
test -> (1229, 32)


In [72]:
X_train_scaled.shape, X_test_scaled.shape

((2866, 26), (1229, 26))

In [73]:
#features to drop before modeling due to autocorrelation:

In [74]:
col = ['this_week_unemployment', 'fuel_price', 'CPI', 'unemp_quarterly_rolling', 'fuel_quarterly_rolling', 'cpi_quarterly_rolling']

In [75]:
X_train_scaled = X_train_scaled.drop(columns = col)
X_test_scaled = X_test_scaled.drop(columns = col)

In [76]:
X_train_scaled.shape, X_test_scaled.shape

((2866, 20), (1229, 20))

# MODEL

**Note :** Before modeling, we split and scale (MinMax) our df.

For the modeling section, we established the **baseline** using  `next_week_1_year_ago`


- We use 4 different algorithms
    - Linear Regression(OLS)
    - Lasso Lars
    - TweedieRegressor (GLM)
    - Polynomial Regression

For each model we used :

 - **GridSearchCV**: grid search cross-validation (GridSearchCV) class that helps us quickly try out many different combinations of hyperparameters.
 
- **Feature Engineering:**

   - **RFE :** (recursive feature elimination)  recursively removes attributes to meet the required number of features and then builds a model on those attributes that remain.
   - **SelectKBest:** removes all but the highest scoring features.




In [77]:
# #conver y_train y_test to df  (so we can keep track of the predictions)
y_train_df = pd.DataFrame( {'actual': y_train})
y_test_df = pd.DataFrame( {'actual': y_test})

##  - BASELINE

In [78]:
#baseline version using last years sales
y_train_df['last_year_baseline'] = train['next_week_1_year_ago']

In [79]:
#calculate RMSE for baseline model
rmse_baseline2_train= math.sqrt(mean_squared_error(y_train_df.actual, y_train_df.last_year_baseline))

rmse_baseline2_train

91145.28223498359

In [80]:
#acreate a metric df to store the metrics of each model
metric_df = pd.DataFrame(data = [{
    'model': 'baseline(using last year sales)',
    'rmse_train':rmse_baseline2_train,    
    'r^2' : 0}])

## MODEL: LinearRegression (OLS)

### - Gridsearch

In [81]:
#specify the parameters we wish to use as a dictionary, then use that dictionary when we create the class.
params = {'normalize': [ True, False],
          'fit_intercept': [True, False]}

In [82]:
#use a gridsearch function using 
m.gridsearch (X_train_scaled, y_train, LinearRegression() , params, 'neg_root_mean_squared_error')

Unnamed: 0,fit_intercept,normalize,score
2,False,True,-78124.182925
3,False,False,-78124.182925
0,True,True,-70665.229164
1,True,False,-70665.229164


### OLS uising select K best (6)

In [83]:
#using my function for SelectkBest
top_sb =m.select_kbest(X_train_scaled, y_train, 10)

The top 10 selected feautures based on the SelectKBest class are: ['this_week_sales', 'temperature', 'store_size', 'next_week_1_year_ago', 'next_week_holiday_flag', 'pre_christmas', 'thanksgiving', 'cpi_4wk_rolling', 'unemp_4wk_rolling', 'avgQoQ_perc_unemp']


In [85]:
#create the model
ols_sb = m.create_model(X_train_scaled[top_sb], y_train_df, 'actual',\
                       LinearRegression(normalize=False,\
                        fit_intercept=True ), 'modelOLS' )
ols_sb['rmse']

72474.37897198633

#### cross validation

In [86]:
#create the model
clf = LinearRegression(normalize=True, fit_intercept=True )
#cv = number of folds
cross_val_score(clf, X_train_scaled[top_sb], y_train, cv=3, scoring = 'neg_root_mean_squared_error')

array([-72769.62947968, -73703.84102292, -73404.54496025])

### OLS uising  RFE (8)

In [87]:
#uise the fucntion to get RFE
top_rfe = m.select_rfe(X_train_scaled, y_train, 10,LinearRegression(normalize=True, fit_intercept=True ))

The top 10 selected feautures based on the the RFE class class are: ['this_week_sales', 'this_week_holiday_flag', 'store_size', 'next_week_1_year_ago', 'christmas', 'thanksgiving', 'unemp_4wk_rolling', 'avgMoM_perc_unemp', 'avgQoQ_perc_fuel', 'avgQoQ_perc_unemp']
this_week_sales            1
avgQoQ_perc_fuel           1
avgMoM_perc_unemp          1
unemp_4wk_rolling          1
thanksgiving               1
christmas                  1
avgQoQ_perc_unemp          1
next_week_1_year_ago       1
store_size                 1
this_week_holiday_flag     1
avgMoM_perc_cpi            2
labor_day                  3
super_bowl                 4
fuel_4wk_rolling           5
pre_christmas              6
avgMoM_perc_fuel           7
avgQoQ_perc_cpi            8
next_week_holiday_flag     9
temperature               10
cpi_4wk_rolling           11
dtype: int64


In [88]:
#create the model
ols_rfe = m.create_model(X_train_scaled[top_rfe], y_train_df, 'actual', LinearRegression(normalize=True, fit_intercept=True ), 'modelOLS' )
ols_rfe['rmse']

70339.07052044076

#### cross validation

In [89]:
#create the model
clf = LinearRegression(normalize=True, fit_intercept=True )
#cv = number of folds
cross_val_score(clf, X_train_scaled[top_rfe], y_train, cv=3, scoring = 'neg_root_mean_squared_error')

array([-70080.50217941, -71528.662538  , -71789.27133027])

In [90]:
#add the best model  metrics to our metric_df
metric_df = metric_df.append(
    {
    'model': 'ols_rfe',
    'rmse_train': ols_rfe['rmse'],    
    'r^2' : ols_rfe['r2']}, ignore_index=True)

## Model :LassoLars

### - GridsearchCV

In [91]:
#specify the parameters we wish to use as a dictionary, then use that dictionary when we create the class.
params = {
          'normalize': [True, False],
          'fit_intercept':[True, False],
           'alpha': [1.0, 0]
         }

In [92]:
#use the function to get the combinations of parameters
m.gridsearch (X_train_scaled, y_train, LassoLars() , params, 'neg_root_mean_squared_error')

Unnamed: 0,alpha,fit_intercept,normalize,score
6,0.0,False,True,-78124.182925
7,0.0,False,False,-78124.182925
2,1.0,False,True,-78123.766036
3,1.0,False,False,-78123.766036
5,0.0,True,False,-70665.229164
4,0.0,True,True,-70665.229164
1,1.0,True,False,-70664.424471
0,1.0,True,True,-70657.415397


### LassoLars using RFE 

In [93]:
#get the 6 features
top_rfe = m.select_rfe(X_train_scaled, y_train, 6, LassoLars(alpha = 1, normalize= True, fit_intercept= True) )

The top 6 selected feautures based on the the RFE class class are: ['this_week_sales', 'store_size', 'next_week_1_year_ago', 'christmas', 'thanksgiving', 'avgQoQ_perc_unemp']
this_week_sales            1
thanksgiving               1
christmas                  1
next_week_1_year_ago       1
avgQoQ_perc_unemp          1
store_size                 1
unemp_4wk_rolling          2
this_week_holiday_flag     3
avgQoQ_perc_fuel           4
avgMoM_perc_unemp          5
avgMoM_perc_cpi            6
next_week_holiday_flag     7
fuel_4wk_rolling           8
avgMoM_perc_fuel           9
pre_christmas             10
labor_day                 11
avgQoQ_perc_cpi           12
cpi_4wk_rolling           13
temperature               14
super_bowl                15
dtype: int64


In [95]:
#create the model
lasso_rfe = m.create_model(X_train_scaled[top_rfe], y_train_df, 'actual', LassoLars(alpha = 1, normalize= False, fit_intercept= True), 'modelLasso' )
lasso_rfe['rmse']

71522.66640807404

In [22]:
### cross validation

In [23]:
#create the model
clf = LassoLars(alpha = 1, normalize= False, fit_intercept= True)
#cv = number of folds
cross = cross_val_score(clf, X_train_scaled[top_rfe], y_train, cv=3, scoring = 'neg_root_mean_squared_error')
cross

array([-72369.74422049, -73054.01907945, -72590.01601532])

In [24]:
### LassoLars using selectKbest

In [106]:
#using my function for SelectkBest
top_sb =m.select_kbest(X_train_scaled, y_train, 9)

The top 9 selected feautures based on the SelectKBest class are: ['this_week_sales', 'temperature', 'store_size', 'next_week_1_year_ago', 'next_week_holiday_flag', 'pre_christmas', 'thanksgiving', 'cpi_4wk_rolling', 'unemp_4wk_rolling']


In [107]:
lasso_skb = m.create_model(X_train_scaled[top_sb], y_train_df, 'actual', LassoLars(alpha = 1, normalize= True, fit_intercept= True), 'modelLasso' )
lasso_skb['rmse']

73370.09336590652

### cross validation

In [108]:
#create the model
clf = LassoLars(alpha = 1, normalize= False, fit_intercept= True)
#cv = number of folds
cross= cross_val_score(clf, X_train_scaled[top_sb], y_train, cv=3, scoring = 'neg_root_mean_squared_error')
cross

array([-74045.96153329, -74430.52250561, -74022.31451342])

**Note: the best LassoLars model is lasso_rfe**

In [109]:
#add the best model and its metrics
metric_df = metric_df.append(
    {
    'model': 'lasso_rfe',
    'rmse_train': lasso_rfe['rmse'],    
    'r^2' : lasso_rfe['r2']}, ignore_index=True)

## TweedieRegressor (GLM)

### - Gridsearch CV

In [110]:
#specify the parameters we wish to use as a dictionary, then use that dictionary when we create the class.
params = {
          'power': [0.0, 1],
           'fit_intercept' : [True , False],
          'warm_start': [True, False], 
           'alpha': [1.0, 0.0]
         }
#use the function
m.gridsearch (X_train_scaled, y_train,TweedieRegressor() , params, 'neg_root_mean_squared_error')

Unnamed: 0,alpha,fit_intercept,power,warm_start,score
14,0.0,False,1.0,True,-1383688.0
15,0.0,False,1.0,False,-1383688.0
6,1.0,False,1.0,True,-1383662.0
7,1.0,False,1.0,False,-1383662.0
4,1.0,False,0.0,True,-529112.3
5,1.0,False,0.0,False,-529112.3
0,1.0,True,0.0,True,-496480.9
1,1.0,True,0.0,False,-496480.9
10,0.0,True,1.0,True,-144156.7
11,0.0,True,1.0,False,-144156.7


### create GLM using RFE

In [119]:
#use function to get the top 6 RFE
top_rfe = m.select_rfe(X_train_scaled, y_train, 7, TweedieRegressor(alpha =0 , fit_intercept= True, power=0 ,\
                                                                  warm_start= False) )

The top 7 selected feautures based on the the RFE class class are: ['this_week_sales', 'store_size', 'next_week_1_year_ago', 'christmas', 'thanksgiving', 'unemp_4wk_rolling', 'avgQoQ_perc_unemp']
this_week_sales            1
unemp_4wk_rolling          1
thanksgiving               1
christmas                  1
avgQoQ_perc_unemp          1
store_size                 1
next_week_1_year_ago       1
this_week_holiday_flag     2
avgQoQ_perc_fuel           3
avgMoM_perc_unemp          4
avgMoM_perc_cpi            5
labor_day                  6
super_bowl                 7
fuel_4wk_rolling           8
pre_christmas              9
avgMoM_perc_fuel          10
avgQoQ_perc_cpi           11
next_week_holiday_flag    12
temperature               13
cpi_4wk_rolling           14
dtype: int64


In [120]:
#create the model
gml_rfe = m.create_model(X_train_scaled[top_rfe], y_train_df, 'actual',TweedieRegressor(alpha =0 , fit_intercept= True, power=0 ,\
                                                                  warm_start= False), 'modelgml' )
gml_rfe['rmse']

71197.37052405278

In [121]:
#### cross validation

In [122]:
#create the model
clf = TweedieRegressor(alpha =0 , fit_intercept= True, power=0 ,warm_start= False)
#cv = number of folds
cross = cross_val_score(clf, X_train_scaled[top_rfe], y_train, scoring = 'neg_root_mean_squared_error', cv=3)
cross

array([-71374.06029116, -72012.42497728, -72054.63146727])

### create GML using select kbest

In [129]:
#using my function for SelectkBest
top_sb =m.select_kbest(X_train_scaled, y_train, 7)

The top 7 selected feautures based on the SelectKBest class are: ['this_week_sales', 'temperature', 'store_size', 'next_week_1_year_ago', 'pre_christmas', 'cpi_4wk_rolling', 'unemp_4wk_rolling']


In [130]:
#create a model
gml_skb = m.create_model(X_train_scaled[top_sb], y_train_df, 'actual',TweedieRegressor(alpha =0 , fit_intercept= True, power=0 ,\
                                                                  warm_start= False), 'modelgml' )
gml_skb['rmse']

74330.91020801304

#### cross validation

In [131]:
#create the model
clf = TweedieRegressor(alpha =0 , fit_intercept= True, power=0 ,warm_start= False)
#cv = number of folds
cross = cross_val_score(clf, X_train_scaled[top_sb], y_train, scoring = 'neg_root_mean_squared_error',cv=3)
cross

array([-75377.89622107, -73349.13363488, -75559.29396566])

**Note the best model for GLM is gml_rfe**

In [132]:
#add the best model
metric_df = metric_df.append(
    {
    'model': 'gml_rfe',
    'rmse_train': gml_rfe['rmse'],    
    'r^2' : gml_rfe['r2']}, ignore_index=True)

# Polynomial Regression

###  - Polynomila Regression using select k best 

In [309]:
#using my function for SelectkBest
top_sb =m.select_kbest(X_train_scaled, y_train, 10)

The top 10 selected feautures based on the SelectKBest class are: ['this_week_sales', 'temperature', 'store_size', 'next_week_1_year_ago', 'next_week_holiday_flag', 'pre_christmas', 'thanksgiving', 'cpi_4wk_rolling', 'unemp_4wk_rolling', 'avgQoQ_perc_unemp']


In [310]:
# make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree = 2) 

# fit and transform X_train_scaled
X_train_degree3 = pf.fit_transform(X_train_scaled[top_sb])

# transform X_validate_scaled & X_test_scaled
X_test_degree3 = pf.transform(X_test_scaled[top_sb])

### GridSearchCV

In [311]:
#specify the parameters we wish to use as a dictionary, then use that dictionary when we create the class.
params = {
          'normalize': [True, False],
          'fit_intercept':[True, False],
         }

m.gridsearch (X_train_degree3, y_train, LinearRegression() , params, 'neg_root_mean_squared_error')

Unnamed: 0,fit_intercept,normalize,score
1,True,False,-68537.823324
0,True,True,-67663.495945
2,False,True,-67280.147803
3,False,False,-67280.147803


In [312]:
#create a model
pol_skb = m.create_model(X_train_degree3, y_train_df, 'actual',LinearRegression( normalize=True, fit_intercept = True ), 'pol2_skb' )
pol_skb['rmse']

64609.20374104813

##### cross validation

In [313]:
#create the model
clf = LinearRegression( normalize=False, fit_intercept = True )
#cv = number of folds
cross = cross_val_score(clf, X_train_degree3, y_train, cv=5, scoring = 'neg_root_mean_squared_error')
cross

array([-66016.67167466, -67841.00338362, -70257.44391932, -67202.36159337,
       -71371.636051  ])

In [168]:
metric_df = metric_df.append(
    {
    'model': 'pol2_skb',
    'rmse_train': pol_skb['rmse'],    
    'r^2' : pol_skb['r2']}, ignore_index=True)

### -Polynoliam regression using RFE

In [314]:
#get the top 7 features by RFE
top_rfe_pol = m.select_rfe(X_train_scaled, y_train, 11, LinearRegression(normalize=False, fit_intercept = False) )

The top 11 selected feautures based on the the RFE class class are: ['this_week_sales', 'temperature', 'store_size', 'next_week_1_year_ago', 'christmas', 'pre_christmas', 'super_bowl', 'thanksgiving', 'avgMoM_perc_cpi', 'avgMoM_perc_unemp', 'avgQoQ_perc_unemp']
this_week_sales            1
avgMoM_perc_unemp          1
avgMoM_perc_cpi            1
thanksgiving               1
pre_christmas              1
christmas                  1
super_bowl                 1
next_week_1_year_ago       1
store_size                 1
temperature                1
avgQoQ_perc_unemp          1
avgQoQ_perc_fuel           2
this_week_holiday_flag     3
avgQoQ_perc_cpi            4
unemp_4wk_rolling          5
avgMoM_perc_fuel           6
cpi_4wk_rolling            7
next_week_holiday_flag     8
labor_day                  9
fuel_4wk_rolling          10
dtype: int64


In [315]:
# make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree = 2) 

# fit and transform X_train_scaled
X_train_degree3 = pf.fit_transform(X_train_scaled[top_rfe_pol])

# transform X_validate_scaled & X_test_scaled
X_test_degree3 = pf.transform(X_test_scaled[top_rfe_pol])

#### GridsearchCV

In [316]:
m.gridsearch (X_train_degree3, y_train, LinearRegression() , params, 'neg_root_mean_squared_error')

Unnamed: 0,fit_intercept,normalize,score
0,True,True,-66226.57328
1,True,False,-66187.084301
2,False,True,-66187.084301
3,False,False,-66187.084301


In [317]:
#create the model
pol_rfe = m.create_model(X_train_degree3, y_train_df, 'actual',LinearRegression( normalize=False, fit_intercept = False ), 'pol3_RFE' )
pol_rfe['rmse']

63537.88275172522

#### cross validation

In [318]:
#create the model
clf = LinearRegression( normalize=False, fit_intercept = False )
#cv = number of folds
cross = cross_val_score(clf, X_train_degree3, y_train, cv=5, scoring = 'neg_root_mean_squared_error')
cross

array([-62337.00446424, -65569.35237058, -68225.96990924, -66758.2633866 ,
       -68044.83137326])

In [319]:
metric_df = metric_df.append(
    {
    'model': 'pol2_rfe',
    'rmse_train': pol_rfe['rmse'],    
    'r^2' : pol_rfe['r2']}, ignore_index=True)

### -Polynomial Regression Degree 1  with all features

In [331]:
# make the polynomial features to get a new set of features using the entire df
pf = PolynomialFeatures(degree = 1) 

# fit and transform X_train_scaled
X_train_degree1 = pf.fit_transform(X_train_scaled)

# transform X_validate_scaled & X_test_scaled
X_test_degree1 = pf.transform(X_test_scaled)

### GreidSearch CV

In [332]:
m.gridsearch (X_train_degree1, y_train, LinearRegression() , params, 'neg_root_mean_squared_error')

Unnamed: 0,fit_intercept,normalize,score
2,False,True,-70665.229164
3,False,False,-70665.229164
0,True,True,-70665.229164
1,True,False,-70665.229164


In [335]:
pol_dg1 = m.create_model(X_train_degree1, y_train_df, 'actual',LinearRegression( normalize=False, fit_intercept = True ), 'modelpol' )
pol_dg1['rmse']

70077.52211783333

### Cross Validation

In [336]:
#create the model
clf = LinearRegression( normalize=False, fit_intercept = True )
#cv = number of folds
cross = cross_val_score(clf, X_train_degree1, y_train, cv=3, scoring = 'neg_root_mean_squared_error')
cross

array([-69760.12765581, -71425.17819823, -71893.93591105])

In [337]:
#add this model metrics to df
metric_df = metric_df.append(
    {
    'model': 'pol_dg1',
    'rmse_train': pol_dg1['rmse'],    
    'r^2' : pol_dg1['r2']}, ignore_index=True)

## RESULTS

In [338]:
metric_df.sort_values('rmse_train')

Unnamed: 0,model,rmse_train,r^2
5,pol2_rfe,63537.882752,0.987056
4,pol2_skb,64609.203741,0.987856
6,pol_dg1,70077.522118,0.984254
1,ols_rfe,70339.07052,0.984137
3,gml_rfe,71197.370524,0.983747
2,lasso_rfe,71522.666408,0.983573
0,baseline(using last year sales),91145.282235,0.0


In [339]:
print('The best model is' )
metric_df.nsmallest(1, 'rmse_train')

The best model is


Unnamed: 0,model,rmse_train,r^2
5,pol2_rfe,63537.882752,0.987056


**Takeaways**

the best model is Polynomial Regression Degree 2 using RFE top 11 Features


In [341]:
top_rfe_pol

['this_week_sales',
 'temperature',
 'store_size',
 'next_week_1_year_ago',
 'christmas',
 'pre_christmas',
 'super_bowl',
 'thanksgiving',
 'avgMoM_perc_cpi',
 'avgMoM_perc_unemp',
 'avgQoQ_perc_unemp']

# TEST

In [344]:
#use my function to create and calculate the metrics
pol_reg_test =  m.create_model(X_test_degree3, 
                              y_test_df, 'actual',LinearRegression( normalize=False, fit_intercept = True ), 'test_polreg' )

pol_reg_test['rmse']

64607.17186826456

In [356]:
print('The Baseline RMSE is $', round(rmse_baseline2_train,2))
print('The Best Model RMSE on unseen data is $', round(pol_reg_test['rmse'],2))

The Baseline RMSE is $ 91145.28
The Best Model RMSE on unseen data is $ 64607.17


In [359]:
# it is a 29% drop
(pol_reg_test['rmse'] / rmse_baseline2_train -1)*100

-29.116274277697187

# END

___________________________________

In [None]:
# we need a prediction df

In [345]:
y_test_df.head(15)

Unnamed: 0_level_0,actual,test_polreg
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-03-18_store_7_2011-03-25,559061.58,552713.7
2012-10-19_store_41_2012-10-26,1316542.59,1335390.0
2012-02-17_store_36_2012-02-24,313270.45,350592.0
2012-03-16_store_26_2012-03-23,874790.68,885834.8
2012-03-02_store_43_2012-03-09,636677.67,667834.7
2012-02-03_store_42_2012-02-10,674919.45,655594.5
2011-11-25_store_41_2011-12-02,1292436.23,1276042.0
2012-02-17_store_23_2012-02-24,1272948.27,1239638.0
2011-04-08_store_23_2011-04-15,1263680.51,1301305.0
2012-10-05_store_12_2012-10-12,934917.47,974715.4


In [363]:
y_test_df.shape

(1229, 2)

In [361]:
df[['this_week_date', 'next_week_date']].head()

Unnamed: 0_level_0,this_week_date,next_week_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-02-05_store_1_2010-02-12,2010-02-05,2010-02-12
2010-02-12_store_1_2010-02-19,2010-02-12,2010-02-19
2010-02-19_store_1_2010-02-26,2010-02-19,2010-02-26
2010-02-26_store_1_2010-03-05,2010-02-26,2010-03-05
2010-03-05_store_1_2010-03-12,2010-03-05,2010-03-12


In [362]:
y_test_df.merge(df[['this_week_date', 'next_week_date']], how = 'left', on = 'id')

Unnamed: 0_level_0,actual,test_polreg,this_week_date,next_week_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-03-18_store_7_2011-03-25,559061.58,5.527137e+05,2011-03-18,2011-03-25
2012-10-19_store_41_2012-10-26,1316542.59,1.335390e+06,2012-10-19,2012-10-26
2012-02-17_store_36_2012-02-24,313270.45,3.505920e+05,2012-02-17,2012-02-24
2012-03-16_store_26_2012-03-23,874790.68,8.858348e+05,2012-03-16,2012-03-23
2012-03-02_store_43_2012-03-09,636677.67,6.678347e+05,2012-03-02,2012-03-09
...,...,...,...,...
2012-08-03_store_28_2012-08-10,1269113.41,1.308460e+06,2012-08-03,2012-08-10
2011-01-28_store_37_2011-02-04,583835.18,5.536380e+05,2011-01-28,2011-02-04
2011-04-01_store_10_2011-04-08,1870720.73,1.882950e+06,2011-04-01,2011-04-08
2011-08-26_store_21_2011-09-02,705557.80,7.719169e+05,2011-08-26,2011-09-02


In [63]:
y_train_df[['actual','pol3_RFE']].head()

Unnamed: 0_level_0,actual,pol3_RFE
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-09-16_store_12_2011-09-23,871692.74,908209.6
2012-08-10_store_4_2012-08-17,2283540.3,2326712.0
2011-12-09_store_27_2011-12-16,2205919.86,2211375.0
2011-12-30_store_1_2012-01-06,1550369.92,1500979.0
2012-08-31_store_43_2012-09-07,663814.18,594255.5
