In [15]:
import pandas as pd
import numpy as np
import copy as cp

import statsmodels.api as sm
import matplotlib.pyplot as plt

import sklearn.metrics as metrics

from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from xgboost.sklearn import XGBRegressor

from heatmap import heatmap, corrplot

In [2]:
onedrive_path = "C:/Users/cfowle/The Estée Lauder Companies Inc/TeamAnis - General/"

In [3]:
BRANDS = ['Aveda','Clinique', 'Estée Lauder', 'Jo Malone London', 'La Mer', 'M.A.C', 'Origins', 'Tom Ford Beauty']

In [4]:
YEAR_MIN = 2017

In [5]:
def elasticnet(brand, alphas = [0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1]):
    print("Feature Selection, " + brand)
    just_rr = pd.read_csv(onedrive_path + "Data/Combined/relative_rr_data_w_target_encodings_" + brand +".csv")
    just_rr = just_rr.loc[just_rr["year"] >= YEAR_MIN].drop("Unnamed: 0", axis = 1).sort_values(["month", "year"])
    
    just_rr = just_rr.loc[just_rr["demand"] > 0]
    just_rr = just_rr.loc[just_rr["percent_neutral"] + just_rr["percent_positive"] + just_rr["percent_negative"] > 0]
    
    X= just_rr
    y = just_rr.pop("demand_F1")
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = False)
    
    elastic_cv=ElasticNetCV(alphas=alphas, cv=5,max_iter = 1e6)
    model = elastic_cv.fit(X_train, y_train)
    pred = model.predict(X_test)
    print("OSR^2: " + str(metrics.r2_score(pred, y_test)))

    print(model.coef_)

In [6]:
def build_rr_only_model(brand):
    print("RR Only Model, " + brand)
    just_rr = pd.read_csv(onedrive_path + "Data/Combined/relative_rr_data_w_target_encodings_" + brand +".csv")
    just_rr = just_rr.loc[just_rr["year"] >= YEAR_MIN].drop("Unnamed: 0", axis = 1).sort_values(["month", "year"])
    
    just_rr = just_rr.loc[just_rr["demand"] > 0]
    just_rr = just_rr.loc[just_rr["percent_neutral"] + just_rr["percent_positive"] + just_rr["percent_negative"] > 0]
    
    X= just_rr[['month', 'year', 'percent_1', 'percent_2', 'percent_3', 'percent_4',
       'percent_5', 'percent_negative', 'percent_neutral', 'percent_positive']]
    y = just_rr["demand_F1"]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = False)
    
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    pred = lm.predict(X_test)
    print("OSR^2: " + str(metrics.r2_score(pred, y_test)))
    
    X2 = sm.add_constant(X_train)
    est = sm.OLS(y_train, X2)
    est2 = est.fit()

    print(est2.summary())

In [7]:
def build_demand_selected_rr_model(brand, rr_cols = []):
    print("Demand" + " ".join(rr_cols) + " Model, " + brand)
    just_rr = pd.read_csv(onedrive_path + "Data/Combined/relative_rr_data_w_target_encodings_" + brand +".csv")
    just_rr = just_rr.loc[just_rr["year"] >= YEAR_MIN].drop("Unnamed: 0", axis = 1).sort_values(["month", "year"])
    
    just_rr = just_rr.loc[just_rr["demand"] > 0]
    just_rr = just_rr.loc[just_rr["percent_neutral"] + just_rr["percent_positive"] + just_rr["percent_negative"] > 0]
    
    X= just_rr[['demand', 'demand_P2', 'demand_P1', 'avg_subcat_demand',
       'monthly_avg_demand', 'avg_12_month_demand',
       'month', 'year'] + rr_cols]
    y = just_rr["demand_F1"]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = False)
    
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    pred = lm.predict(X_test)
    print("OSR^2: " + str(metrics.r2_score(pred, y_test)))
    
    X2 = sm.add_constant(X_train)
    est = sm.OLS(y_train, X2)
    est2 = est.fit()

    print(est2.summary())

In [8]:
def build_rr_demand_model(brand):
    print("RR & Demand Model, " + brand)
    just_rr = pd.read_csv(onedrive_path + "Data/Combined/relative_rr_data_w_target_encodings_" + brand +".csv")
    just_rr = just_rr.loc[just_rr["year"] >= YEAR_MIN].drop("Unnamed: 0", axis = 1).sort_values(["month", "year"])
    
    just_rr = just_rr.loc[just_rr["demand"] > 0]
    just_rr = just_rr.loc[just_rr["percent_neutral"] + just_rr["percent_positive"] + just_rr["percent_negative"] > 0]
    
    X= just_rr
    y = just_rr.pop("demand_F1")
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = False)
    
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    pred = lm.predict(X_test)
    print("OSR^2: " + str(metrics.r2_score(pred, y_test)))
    
    X2 = sm.add_constant(X_train)
    est = sm.OLS(y_train, X2)
    est2 = est.fit()

    print(est2.summary())

In [9]:
def build_rr_demand_pca_model(brand, num_features = 10):
    print("RR & Demand PCA Model, " + brand)
    just_rr = pd.read_csv(onedrive_path + "Data/Combined/relative_rr_data_w_target_encodings_" + brand +".csv")
    just_rr = just_rr.loc[just_rr["year"] >= YEAR_MIN].drop("Unnamed: 0", axis = 1).sort_values(["month", "year"])
    
    just_rr = just_rr.loc[just_rr["demand"] > 0]
    just_rr = just_rr.loc[just_rr["percent_neutral"] + just_rr["percent_positive"] + just_rr["percent_negative"] > 0]
    
    X= just_rr
    y = just_rr.pop("demand_F1")
    
    X = StandardScaler().fit_transform(X)
    pca = PCA(n_components= num_features)
    principalComponents = pca.fit_transform(X)
    principalDf = pd.DataFrame(data = principalComponents)
    
    X_train, X_test, y_train, y_test = train_test_split(principalDf, y, shuffle = False)
    
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    pred = lm.predict(X_test)
    print("OSR^2: " + str(metrics.r2_score(pred, y_test)))
    
    X2 = sm.add_constant(X_train)
    est = sm.OLS(y_train.values, X2)
    est2 = est.fit()

    print(est2.summary())
    
    for i,x in enumerate(pca.components_):
        print("Feature " + str(i))
        top = x.argsort()[-5:][::-1]
        table = [list(just_rr.columns[top]), x[top]]
        df = pd.DataFrame(np.transpose(table))
        print(df)

In [10]:
def nonlinearmodels(brand, n_iter = 10):
    print("---------")
    print(brand)
    cart = DecisionTreeRegressor()
    randomforest = RandomForestRegressor()
    xgboost = XGBRegressor()
    
    just_rr = pd.read_csv(onedrive_path + "Data/Combined/relative_rr_data_w_target_encodings_" + brand +".csv")
    just_rr = just_rr.loc[just_rr["year"] >= YEAR_MIN].drop("Unnamed: 0", axis = 1).sort_values(["month", "year"])
    
    just_rr = just_rr.loc[just_rr["demand"] > 0]
    just_rr = just_rr.loc[just_rr["percent_neutral"] + just_rr["percent_positive"] + just_rr["percent_negative"] > 0]
    
    X= just_rr
    y = just_rr.pop("demand_F1")
    X_demand = just_rr[['demand', 'demand_P2', 'demand_P1', 'avg_subcat_demand',
       'monthly_avg_demand', 'avg_12_month_demand',
       'month', 'year']]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = False)
    X_demand_train, X_demand_est, y_demand_train, y_demand_test = train_test_split(X_demand, y, shuffle = False)
    
    params = {"CART": {'max_depth': [3, 4, 5, 6],
                       'min_samples_leaf': [0.04, 0.06, 0.08],
                       'max_features': [0.2, 0.4, 0.6, 0.8]},
             "RF": {'bootstrap': [True, False],
                    'max_depth': [5, 10, 30, 40, None],
                    'max_features': ['auto', 'sqrt'],
                    'min_samples_leaf': [1, 2, 4],
                    'min_samples_split': [2, 5, 10],
                    'n_estimators': [50, 100, 300]},
             "XGB":{'min_child_weight': np.arange(1, 20),
                    'gamma': np.arange(1, 6),
                    'subsample': [0.8, 1.0],
                    'alpha': [0.5, 1, 2, 5],
                    'colsample_bytree': [0.6, 0.8, 1.0],
                    'max_depth': [3, 4, 5]}}
    
    cart_cv = RandomizedSearchCV(cart, param_distributions=params["CART"], n_iter= n_iter, n_jobs=1, cv=3, error_score= "raise")
    print("CART")
    cart_cv.fit(X_train, y_train)
    print("Demand + RR: ", cart_cv.best_score_)
    cart_cv.fit(X_demand_train, y_demand_train)
    print("Demand Only: ", cart_cv.best_score_)
    
    
    rf_cv = RandomizedSearchCV(randomforest, param_distributions=params["RF"], n_iter=n_iter, n_jobs=1, cv=3, error_score= "raise")
    print("RF")
    rf_cv.fit(X_train, y_train)
    print("Demand + RR: ", rf_cv.best_score_)
    rf_cv.fit(X_demand_train, y_demand_train)
    print("Demand Only: ", rf_cv.best_score_)
    
    xgb_cv = RandomizedSearchCV(xgboost, param_distributions=params["XGB"], n_iter=n_iter, n_jobs=1, cv=3, error_score= "raise")
    print("XGB")
    xgb_cv.fit(X_train, y_train)
    print("Demand + RR: ", xgb_cv.best_score_)
    xgb_cv.fit(X_demand_train, y_demand_train)
    print("Demand Only: ", xgb_cv.best_score_)

### Try models with lots of features

In [11]:
for brand in BRANDS:
    build_rr_demand_model(brand)
    build_rr_only_model(brand)
    build_rr_demand_pca_model(brand, num_features = 5)

RR & Demand Model, Aveda
OSR^2: -0.21933450085030648
                            OLS Regression Results                            
Dep. Variable:              demand_F1   R-squared:                       0.499
Model:                            OLS   Adj. R-squared:                  0.439
Method:                 Least Squares   F-statistic:                     8.308
Date:                Tue, 07 Jul 2020   Prob (F-statistic):           3.91e-12
Time:                        09:50:26   Log-Likelihood:                -1521.7
No. Observations:                 132   AIC:                             3073.
Df Residuals:                     117   BIC:                             3117.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------

OSR^2: 0.7376882282692364
                            OLS Regression Results                            
Dep. Variable:              demand_F1   R-squared:                       0.826
Model:                            OLS   Adj. R-squared:                  0.823
Method:                 Least Squares   F-statistic:                     260.8
Date:                Tue, 07 Jul 2020   Prob (F-statistic):          3.67e-280
Time:                        09:50:27   Log-Likelihood:                -9073.6
No. Observations:                 782   AIC:                         1.818e+04
Df Residuals:                     767   BIC:                         1.825e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const   

OSR^2: -17.653089076172495
                            OLS Regression Results                            
Dep. Variable:              demand_F1   R-squared:                       0.012
Model:                            OLS   Adj. R-squared:                 -0.023
Method:                 Least Squares   F-statistic:                    0.3478
Date:                Tue, 07 Jul 2020   Prob (F-statistic):              0.946
Time:                        09:50:27   Log-Likelihood:                -2778.4
No. Observations:                 231   AIC:                             5575.
Df Residuals:                     222   BIC:                             5606.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const        

strong multicollinearity problems or that the design matrix is singular.
RR & Demand PCA Model, Origins
OSR^2: 0.5924469428777159
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.490
Model:                            OLS   Adj. R-squared:                  0.481
Method:                 Least Squares   F-statistic:                     55.40
Date:                Tue, 07 Jul 2020   Prob (F-statistic):           3.31e-40
Time:                        09:50:27   Log-Likelihood:                -3490.3
No. Observations:                 294   AIC:                             6993.
Df Residuals:                     288   BIC:                             7015.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-

### Try just adding percent negative and percent neutral
#### These seemed to have a shot at working

In [12]:
for brand in BRANDS:
    build_demand_selected_rr_model(brand, rr_cols = [])
    build_demand_selected_rr_model(brand, rr_cols = ["percent_negative", "percent_neutral"])

Demand Model, Aveda
OSR^2: -0.30461383773332296
                            OLS Regression Results                            
Dep. Variable:              demand_F1   R-squared:                       0.476
Model:                            OLS   Adj. R-squared:                  0.442
Method:                 Least Squares   F-statistic:                     13.99
Date:                Tue, 07 Jul 2020   Prob (F-statistic):           2.54e-14
Time:                        09:50:27   Log-Likelihood:                -1524.6
No. Observations:                 132   AIC:                             3067.
Df Residuals:                     123   BIC:                             3093.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

                            OLS Regression Results                            
Dep. Variable:              demand_F1   R-squared:                       0.825
Model:                            OLS   Adj. R-squared:                  0.823
Method:                 Least Squares   F-statistic:                     363.4
Date:                Tue, 07 Jul 2020   Prob (F-statistic):          7.27e-284
Time:                        09:50:27   Log-Likelihood:                -9076.7
No. Observations:                 782   AIC:                         1.818e+04
Df Residuals:                     771   BIC:                         1.823e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                3.357e+06   2

OSR^2: 0.8457063041658596
                            OLS Regression Results                            
Dep. Variable:              demand_F1   R-squared:                       0.822
Model:                            OLS   Adj. R-squared:                  0.820
Method:                 Least Squares   F-statistic:                     396.3
Date:                Tue, 07 Jul 2020   Prob (F-statistic):          4.06e-313
Time:                        09:50:28   Log-Likelihood:                -11068.
No. Observations:                 867   AIC:                         2.216e+04
Df Residuals:                     856   BIC:                         2.221e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const   

## Try Automatic Feature Selection

In [13]:
for brand in BRANDS:
    elasticnet(brand, alphas = [0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1])

Feature Selection, Aveda
OSR^2: -0.30661412050664394
[ 4.34760216e-01 -1.52465210e-01 -8.25921579e-02  6.34713550e-01
 -4.52095748e-01 -2.35459700e-01  8.93706666e+02 -3.90187597e+02
  1.10712519e+03  2.87100589e+02 -2.15972474e+02 -3.72064068e+02
 -8.05189240e+02  1.16398229e+03 -7.35969069e+02 -4.27013222e+02]
Feature Selection, Clinique
OSR^2: 0.8922942150485393
[ 1.72207030e-01  6.10764891e-01  8.79098497e-02  1.34293864e-01
 -8.58233856e-02 -1.04155851e+00  1.16155403e+04  3.22866811e+03
 -6.20251158e+03 -1.40443229e+01 -8.40681081e+02 -5.29233935e+02
  7.58947092e+03 -7.32340688e+02 -2.64843693e+03  3.38177761e+03]
Feature Selection, Estée Lauder
OSR^2: 0.737977253379676
[ 2.21631556e-01 -7.48105550e-02  1.21024190e-02  8.15124410e-01
 -9.49982635e-02 -1.06454994e+00  1.39822330e+03 -1.73969769e+03
 -7.24576319e+03  1.78794502e+04 -1.26235716e+03 -4.85353318e+03
 -4.51479671e+03 -7.60743665e+03 -5.66947894e+02  8.17538455e+03]
Feature Selection, Jo Malone London
OSR^2: -6.2826256

### Try Non-linear models

In [17]:
for brand in BRANDS:
    nonlinearmodels(brand, n_iter = 30)

---------
Aveda
CART
Demand + RR:  0.43995524295043426
Demand Only:  0.4802006467634931
RF
Demand + RR:  0.45441908657453106
Demand Only:  0.4755993491031465
XGB
Demand + RR:  0.3898329239243152
Demand Only:  0.34859338656192357
---------
Clinique
CART
Demand + RR:  0.782879037654471
Demand Only:  0.7826055758573812
RF
Demand + RR:  0.8354958824487597
Demand Only:  0.8262509480250988
XGB
Demand + RR:  0.7847532095816933
Demand Only:  0.798056012859436
---------
Estée Lauder
CART
Demand + RR:  0.7305648512606188
Demand Only:  0.7330224558541811
RF
Demand + RR:  0.7817171556901586
Demand Only:  0.7668470980998249
XGB
Demand + RR:  0.7797086792244149
Demand Only:  0.7594372051677603
---------
Jo Malone London
CART
Demand + RR:  0.5381334526534544
Demand Only:  -0.3752366021642781
RF
Demand + RR:  0.3744436010240802
Demand Only:  -0.0953089946808467
XGB
Demand + RR:  0.4032038756765696
Demand Only:  -0.13536205533040627
---------
La Mer
CART
Demand + RR:  0.18768942600704422
Demand Only:  

#### To Dos
Remove colinearity

More PCA

Diff and diff

Check without parameters