In [1]:
import pandas as pd
import numpy as np
import copy as cp
import sklearn.metrics as metrics
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

In [3]:
from heatmap import heatmap, corrplot

## Load and Merge Data

In [4]:
onedrive_path = "C:/Users/cfowle/The Estée Lauder Companies Inc/TeamAnis - General/"

In [5]:
reviews = pd.read_csv(onedrive_path + "Data/Ratings and Reviews/reviews_demand_subcat.csv")
cc      = pd.read_csv(onedrive_path + "Data/Consumer Care/cc_product_month.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
products = pd.read_csv(onedrive_path + "Data/Product/product_codes.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
print(len(reviews))
reviews = reviews.drop(["item_description"], axis = 1).drop_duplicates()
print(len(reviews))

136630
122187


In [8]:
cc = cc.rename(columns = {"Date Month": "date", "P4": "itemid_4", "Brand Clean": "elc_brand"})
products = products.rename(columns = {"P4": "itemid_4", "brand": "elc_brand", "SubCategory":"sub_category"})
products = products[["elc_brand", "itemid_4", 'Major Category', 'Application', 'Category',
       'sub_category']].drop_duplicates(["elc_brand", "itemid_4"])

In [9]:
cc_product = cc.merge(products, how = "left", on = ["itemid_4", "elc_brand"])

In [10]:
cc_reviews_product = reviews.merge(cc_product, how = "left").fillna(0)
cc_reviews_product = cc_reviews_product.groupby(["elc_brand", "date", "sub_category"]).sum().reset_index()

In [11]:
cc_reviews_product["month"] = [x[-2:] for x in cc_reviews_product["date"].values]
cc_reviews_product["year"]  = [x[:4] for x in cc_reviews_product["date"].values]

In [46]:
cc_reviews_product = cc_reviews_product.sort_values("date", ascending = False).reset_index(drop=True)

In [47]:
cc_reviews_product

Unnamed: 0,elc_brand,date,sub_category,major_category_id,avg_nb_statements,nb_reviews,rating,rating_1,rating_2,rating_3,...,Questions_Product_Performance,Questions_Product_Usage Related,Questions_Service_Location,Questions_Service_Miscellaneous,Questions_Website_Offer Codes,Questions_Website_Website functionality,Suggestions_Suggestion_Miscellaneous,month,year,cc_tot
0,Tom Ford Beauty,2020-04,Volumizing,510.0,6.000000,2,5.000000,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,04,2020,0.0
1,Clinique,2020-04,All Other Face Makeu,510.0,0.857143,2,1.428571,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,04,2020,0.0
2,Clinique,2020-04,All Lipsticks,12750.0,19.951847,139,23.028625,13,4,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,04,2020,0.0
3,Clinique,2020-04,All Other Mascara,3060.0,7.076885,21,4.551587,7,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,04,2020,0.0
4,Clinique,2020-04,All Other Serums & E,3120.0,9.452381,16,9.728571,3,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,04,2020,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29570,Origins,2014-12,All Oil / Shine Cont,520.0,0.000000,0,0.000000,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12,2014,0.0
29571,Origins,2014-12,All Other Acne Targe,520.0,0.000000,0,0.000000,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12,2014,0.0
29572,By Kilian,2014-12,All Lipsticks,510.0,0.000000,0,0.000000,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12,2014,0.0
29573,Origins,2014-12,All Other Bath,1180.0,0.000000,0,0.000000,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12,2014,0.0


### Create dataset without customer data

In [69]:
non_customer_data = cc_reviews_product[["demand_F1","sub_category", "elc_brand", "month", "year"]]

### Create Dataset with only Relative Measures

In [70]:
relative_data = cp.deepcopy(non_customer_data)
subcat = cc_reviews_product ##just here so I can copy some old code easily

In [71]:
relative_data["percent_1"] = subcat["rating_1"]/subcat["nb_reviews"]
relative_data["percent_2"] = subcat["rating_2"]/subcat["nb_reviews"]
relative_data["percent_3"] = subcat["rating_3"]/subcat["nb_reviews"]
relative_data["percent_4"] = subcat["rating_4"]/subcat["nb_reviews"]
relative_data["percent_5"] = subcat["rating_5"]/subcat["nb_reviews"]

In [72]:
relative_data["percent_negative"] = subcat['sentiment_negative']/subcat["nb_reviews"]
relative_data["percent_neutral"] = subcat['sentiment_neutral']/subcat["nb_reviews"]
relative_data["percent_positive"] = subcat['sentiment_positive']/subcat["nb_reviews"]

In [73]:
relative_data = relative_data.fillna(0)
relative_data

Unnamed: 0,demand_F1,sub_category,elc_brand,month,year,percent_1,percent_2,percent_3,percent_4,percent_5,percent_negative,percent_neutral,percent_positive
0,405.0,Volumizing,Tom Ford Beauty,04,2020,0.000000,0.000000,0.000000,0.000000,1.000000,0.071429,0.100000,0.828571
1,3466.0,All Other Face Makeu,Clinique,04,2020,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.333333,0.666667
2,79964.0,All Lipsticks,Clinique,04,2020,0.093525,0.028777,0.079137,0.043165,0.755396,0.091796,0.147142,0.761062
3,28326.0,All Other Mascara,Clinique,04,2020,0.333333,0.047619,0.047619,0.095238,0.476190,0.167914,0.162283,0.669803
4,4683.0,All Other Serums & E,Clinique,04,2020,0.187500,0.062500,0.000000,0.000000,0.750000,0.109375,0.095833,0.794792
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29570,420.0,All Oil / Shine Cont,Origins,12,2014,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
29571,7148.0,All Other Acne Targe,Origins,12,2014,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
29572,0.0,All Lipsticks,By Kilian,12,2014,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
29573,0.0,All Other Bath,Origins,12,2014,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [74]:
full_relative_data = cp.deepcopy(relative_data)

In [75]:
subcat["cc_tot"] = subcat['Complaints'] + subcat['Compliments'] + subcat['Questions'] + subcat['Suggestions']

In [76]:
cc_counts = cc.drop(['Product Code', 'itemid_4', 'elc_brand', "date"], axis = 1).columns

In [77]:
for col in cc_counts:
    full_relative_data["percent_" + col] = subcat[col]/subcat["cc_tot"]

In [78]:
full_relative_data = full_relative_data.fillna(0)

In [79]:
full_relative_data

Unnamed: 0,demand_F1,sub_category,elc_brand,month,year,percent_1,percent_2,percent_3,percent_4,percent_5,...,percent_Questions_Order_Returns,percent_Questions_Product_General,percent_Questions_Product_Packaging,percent_Questions_Product_Performance,percent_Questions_Product_Usage Related,percent_Questions_Service_Location,percent_Questions_Service_Miscellaneous,percent_Questions_Website_Offer Codes,percent_Questions_Website_Website functionality,percent_Suggestions_Suggestion_Miscellaneous
0,405.0,Volumizing,Tom Ford Beauty,04,2020,0.000000,0.000000,0.000000,0.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3466.0,All Other Face Makeu,Clinique,04,2020,0.000000,0.000000,0.000000,0.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,79964.0,All Lipsticks,Clinique,04,2020,0.093525,0.028777,0.079137,0.043165,0.755396,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28326.0,All Other Mascara,Clinique,04,2020,0.333333,0.047619,0.047619,0.095238,0.476190,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4683.0,All Other Serums & E,Clinique,04,2020,0.187500,0.062500,0.000000,0.000000,0.750000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29570,420.0,All Oil / Shine Cont,Origins,12,2014,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29571,7148.0,All Other Acne Targe,Origins,12,2014,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29572,0.0,All Lipsticks,By Kilian,12,2014,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29573,0.0,All Other Bath,Origins,12,2014,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Take Exponentially Smoothed Values of Variables

In [127]:
expon = pd.concat([
    relative_data[["month", "year", "elc_brand", "sub_category", "demand_F1"]],
    relative_data.groupby(["sub_category", "elc_brand"]).transform(lambda x: x.ewm(halflife=10).mean()).add_suffix('_transformed')],
    axis=1)

In [130]:
expon_clinique = expon.loc[expon["elc_brand"] == "Clinique"].drop(["month_transformed", "year_transformed", "demand_F1_transformed"], axis = 1)
expon_clinique = expon_clinique.replace([np.inf, -np.inf], np.nan).dropna()

In [131]:
expon_clinique

Unnamed: 0,month,year,elc_brand,sub_category,demand_F1,percent_1_transformed,percent_2_transformed,percent_3_transformed,percent_4_transformed,percent_5_transformed,percent_negative_transformed,percent_neutral_transformed,percent_positive_transformed
1,04,2020,Clinique,All Other Face Makeu,3466.0,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.333333,0.666667
2,04,2020,Clinique,All Lipsticks,79964.0,0.093525,0.028777,0.079137,0.043165,0.755396,0.091796,0.147142,0.761062
3,04,2020,Clinique,All Other Mascara,28326.0,0.333333,0.047619,0.047619,0.095238,0.476190,0.167914,0.162283,0.669803
4,04,2020,Clinique,All Other Serums & E,4683.0,0.187500,0.062500,0.000000,0.000000,0.750000,0.109375,0.095833,0.794792
5,04,2020,Clinique,All Other Sun- FACE,5625.0,0.000000,0.000000,0.000000,0.000000,1.000000,0.250000,0.000000,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29545,12,2014,Clinique,All Cross Category S,1476.0,0.040341,0.019234,0.038393,0.051430,0.318957,0.021765,0.061458,0.385132
29549,12,2014,Clinique,After Sun,360.0,0.001228,0.007369,0.001228,0.000000,0.066835,0.009618,0.014583,0.052459
29559,12,2014,Clinique,All All Other Lips,0.0,0.020930,0.038784,0.023379,0.068379,0.212903,0.024938,0.041456,0.297980
29560,12,2014,Clinique,All All Other Eye Ma,2467.0,0.078860,0.053892,0.060899,0.119273,0.684300,0.055468,0.178959,0.762797


In [140]:
expon_lip = expon_clinique.loc[expon_clinique["sub_category"] == "All Lipsticks"]
X = expon_lip.drop(["elc_brand", "sub_category","demand_F1"], axis = 1)
y = expon_lip[["demand_F1"]]

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = False)

In [142]:
lm = LinearRegression()
lm.fit(X_train, y_train)

pred = lm.predict(X_test)
metrics.r2_score(pred, y_test)

-1.121481257915545

In [144]:
X2 = sm.add_constant(X_train).astype('float64')
est = sm.OLS(y_train.values.reshape(-1,1), X2.values)
est2 = est.fit()

print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.380
Model:                            OLS   Adj. R-squared:                  0.253
Method:                 Least Squares   F-statistic:                     2.988
Date:                Wed, 24 Jun 2020   Prob (F-statistic):             0.0104
Time:                        10:30:08   Log-Likelihood:                -700.24
No. Observations:                  48   AIC:                             1418.
Df Residuals:                      39   BIC:                             1435.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -6.015e+08   4.34e+08     -1.386      0.1

In [145]:
expon_lip

Unnamed: 0,month,year,elc_brand,sub_category,demand_F1,percent_1_transformed,percent_2_transformed,percent_3_transformed,percent_4_transformed,percent_5_transformed,percent_negative_transformed,percent_neutral_transformed,percent_positive_transformed
2,04,2020,Clinique,All Lipsticks,79964.0,0.093525,0.028777,0.079137,0.043165,0.755396,0.091796,0.147142,0.761062
902,03,2020,Clinique,All Lipsticks,46707.0,0.068480,0.052786,0.069315,0.083069,0.726349,0.079497,0.162085,0.758419
1093,02,2020,Clinique,All Lipsticks,476936.0,0.090907,0.055873,0.058949,0.112383,0.681887,0.097528,0.156949,0.745523
1592,01,2020,Clinique,All Lipsticks,2132035.0,0.123141,0.043644,0.071011,0.107734,0.654470,0.087471,0.169564,0.742965
2223,12,2019,Clinique,All Lipsticks,1137995.0,0.104785,0.040198,0.077639,0.109232,0.668146,0.081584,0.153425,0.764992
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27610,04,2015,Clinique,All Lipsticks,271227.0,0.055038,0.043440,0.074973,0.179018,0.647531,0.061492,0.136649,0.801858
27902,03,2015,Clinique,All Lipsticks,328076.0,0.053642,0.042832,0.076906,0.171546,0.655073,0.060506,0.137093,0.802401
28509,02,2015,Clinique,All Lipsticks,417970.0,0.054380,0.046491,0.076066,0.170851,0.652213,0.062570,0.133094,0.804336
28776,01,2015,Clinique,All Lipsticks,332090.0,0.050695,0.047576,0.077264,0.176214,0.648251,0.061966,0.133940,0.804094
