In [1]:
%load_ext autoreload
%autoreload 2

# `Logit` on Orders - A warm-up challenge (~1h)

## Select features

🎯 Let's figure out the impact of `wait_time` and `delay_vs_expected` on very `good/bad reviews`

👉 Using our `orders` training_set, we will run two `multivariate logistic regressions`:
- `logit_one` to predict `dim_is_one_star` 
- `logit_five` to predict `dim_is_five_star`.

 

In [2]:
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

👉 Import your dataset:

In [3]:
from olist.order import Order
orders = Order().get_training_data(with_distance_seller_customer=True)

👉 Select in a list which features you want to use:

⚠️ Make sure you are not creating data leakage (i.e. selecting features that are derived from the target)

💡 To figure out the impact of `wait_time` and `delay_vs_expected` we need to control for the impact of other features, include in your list all features that may be relevant

In [4]:
features = ['dim_is_one_star','dim_is_five_star','wait_time', 'delay_vs_expected', 'price']

🕵🏻 Check the `multi-colinearity` of your features, using the `VIF index`.

* It shouldn't be too high (< 10 preferably) to ensure that we can trust the partial regression coefficents and their associated `p-values` 
* Do not forget to standardize your data ! 
    * A `VIF Analysis` is made by regressing a feature vs. the other features...
    * So you want to `remove the effect of scale` so that your features have an equal importance before running any linear regression!
    
    
📚 <a href="https://www.statisticshowto.com/variance-inflation-factor/">Statistics How To - Variance Inflation Factor</a>

📚  <a href="https://online.stat.psu.edu/stat462/node/180/">PennState - Detecting Multicollinearity Using Variance Inflation Factors</a>

⚖️ Standardizing:

In [5]:
def standardize(df, features):
    df_standardized = df.copy()
    for f in features:
        mu = df[f].mean()
        sigma = df[f].std()
        df_standardized[f] = df[f].map(lambda x: (x - mu) / sigma)
    return df_standardized

features_to_standardize = orders.select_dtypes(include = ["int64", "float64"]).columns.drop(["dim_is_one_star", "dim_is_five_star"])

orders_z = standardize(orders,  features_to_standardize)
orders_z

Unnamed: 0,order_id,wait_time,expected_wait_time,delay_vs_expected,order_status,dim_is_five_star,dim_is_one_star,review_score,number_of_products,number_of_sellers,price,freight_value,distance_seller_customer
0,e481f51cbdc54678b7cc49136f2d6af7,-0.431192,-0.934806,-0.161781,delivered,0,0,-0.121008,-0.264595,-0.112544,-0.513802,-0.652038,-0.979475
1,53cdb2fc8bc7dce0b6741e2150273451,0.134174,-0.524871,-0.161781,delivered,0,0,-0.121008,-0.264595,-0.112544,-0.086640,0.000467,0.429743
2,47770eb9100c2d0c44946d9cf07ec65d,-0.329907,0.330878,-0.161781,delivered,1,0,0.657133,-0.264595,-0.112544,0.111748,-0.164053,-0.145495
3,949d5b44dbf5de918fe9c16f97b45f8a,0.073540,0.279445,-0.161781,delivered,1,0,0.657133,-0.264595,-0.112544,-0.441525,0.206815,2.054621
4,ad21c59c0840e6cb83a9ceb5573f8159,-1.019535,-1.326297,-0.161781,delivered,1,0,0.657133,-0.264595,-0.112544,-0.562388,-0.652038,-0.959115
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95875,9c5dedf39a927c1b2549525ed64a053c,-0.454309,-0.587647,-0.161781,delivered,1,0,0.657133,-0.264595,-0.112544,-0.311513,-0.449408,-0.893033
95876,63943bddc261676b46f01ca7ac2f7bd8,1.023841,-0.031941,-0.161781,delivered,0,0,-0.121008,-0.264595,-0.112544,0.183977,-0.123156,-0.212797
95877,83c1379a015df1e13d02aae0204711ab,1.305780,0.758017,-0.161781,delivered,1,0,0.657133,-0.264595,-0.112544,0.333684,1.964490,0.617630
95878,11c177c8e97725db2631073c19f07b62,0.483664,1.524686,-0.161781,delivered,0,0,-1.677291,1.601605,-0.112544,1.075186,2.715522,-0.387558


👉 Run your VIF Analysis to analyze the potential multicolinearities:

In [7]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# the independent variables set
X = orders_z[features]
  
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
  
# calculating VIF for each feature
vif_data["VIF"] = [vif(X.values, i) for i in range(len(X.columns))]
  
print(vif_data)

             feature       VIF
0    dim_is_one_star  1.105851
1   dim_is_five_star  1.025029
2          wait_time  2.052730
3  delay_vs_expected  1.996291
4              price  1.005060


## Logistic Regressions

👉 Fit two `Logistic Regression` models:
- `logit_one` to predict `dim_is_one_star` 
- `logit_five` to predict `dim_is_five_star`.

`Logit 1️⃣`

In [8]:
logit1 = smf.logit(formula='dim_is_one_star ~ wait_time + delay_vs_expected + price', data=orders_z).fit()
logit1.summary()

Optimization terminated successfully.
         Current function value: 0.282751
         Iterations 7


0,1,2,3
Dep. Variable:,dim_is_one_star,No. Observations:,95872.0
Model:,Logit,Df Residuals:,95868.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 27 Oct 2022",Pseudo R-squ.:,0.1161
Time:,12:16:28,Log-Likelihood:,-27108.0
converged:,True,LL-Null:,-30669.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.4023,0.012,-193.691,0.000,-2.427,-2.378
wait_time,0.5515,0.015,37.903,0.000,0.523,0.580
delay_vs_expected,0.3424,0.018,19.373,0.000,0.308,0.377
price,0.0894,0.009,9.591,0.000,0.071,0.108


`Logit 5️⃣`

In [9]:
logit5 = smf.logit(formula='dim_is_five_star ~ wait_time + delay_vs_expected + price', data=orders_z).fit()
logit5.summary()

Optimization terminated successfully.
         Current function value: 0.642678
         Iterations 7


0,1,2,3
Dep. Variable:,dim_is_five_star,No. Observations:,95872.0
Model:,Logit,Df Residuals:,95868.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 27 Oct 2022",Pseudo R-squ.:,0.04941
Time:,12:16:29,Log-Likelihood:,-61615.0
converged:,True,LL-Null:,-64817.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3367,0.007,47.258,0.000,0.323,0.351
wait_time,-0.4406,0.010,-43.982,0.000,-0.460,-0.421
delay_vs_expected,-0.4927,0.023,-21.324,0.000,-0.538,-0.447
price,-0.0008,0.007,-0.123,0.902,-0.014,0.012


💡 It's time to analyse the results of these two logistic regressions:

- Interpret the partial coefficients in your own words.
- Check their statistical significances with `p-values`
- Do you notice any differences between `logit_one` and `logit_five` in terms of coefficient importances?

In [12]:
import numpy as np

print(np.exp(0.3424)) #one star
print(np.exp(0.4927)) #five star
# Take the absolute value, the minus sign tells it will have a decreasing impact on the target variable

1.4083235142689496
1.63672942890031


In [50]:
# Among the following sentences, store the ones that are true in the list below

a = "delay_vs_expected influences five_star ratings even more than one_star ratings"
b = "wait_time influences five_star ratings even more more than one_star"

your_answer = [a]

🧪 __Test your code__

In [51]:
from nbresult import ChallengeResult

result = ChallengeResult('logit',
    answers = your_answer
)
result.write()
print(result.check())


platform darwin -- Python 3.10.6, pytest-7.1.3, pluggy-1.0.0 -- /Users/florencetersier/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /Users/florencetersier/code/FDLData/data-logit/tests
plugins: anyio-3.6.1, asyncio-0.19.0
asyncio: mode=strict
[1mcollecting ... [0mcollected 1 item

test_logit.py::TestLogit::test_question [32mPASSED[0m[32m                           [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/logit.pickle

[32mgit[39m commit -m [33m'Completed logit step'[39m

[32mgit[39m push origin master



<details>
    <summary>- <i>Explanations and advanced concepts </i> -</summary>


> _All other thing being equal, the `delay factor` tends to increase the chances of getting stripped of the 5-star even more so than it affect the chances of 1-star reviews. Probably because 1-stars are really targeting bad products themselves, not bad deliveries_
    
❗️ However, to be totally rigorous, we have to be **more careful when comparing coefficients from two different models**, because **they might not be based on similar populations**!
    We have 2 sub-populations here: (people who gave 1-stars; and people who gave 5-stars) and they may exhibit intrinsically different behavior patterns. It may well be that "happy-people" (who tends to give 5-stars easily) are less sensitive as "grumpy-people" (who shoot 1-stars like Lucky-Luke), when it comes to "delay", or "price"...

</details>


## Logistic vs. Linear ?

👉 Compare:
- the regression coefficients obtained from the `Logistic Regression `
- with the regression coefficients obtained through a `Linear Regression` 
- on `review_score`, using the same features. 

⚠️ Check that both sets of coefficients  tell  "the same story".

> YOUR ANSWER HERE

In [54]:
features_to_standardize = orders.select_dtypes(include = ["int64", "float64"]).columns.drop("review_score")

orders_z = standardize(orders,  features_to_standardize)
orders_z

Unnamed: 0,order_id,wait_time,expected_wait_time,delay_vs_expected,order_status,dim_is_five_star,dim_is_one_star,review_score,number_of_products,number_of_sellers,price,freight_value,distance_seller_customer
0,e481f51cbdc54678b7cc49136f2d6af7,-0.431192,-0.934806,-0.161781,delivered,-1.204841,-0.328964,4,-0.264595,-0.112544,-0.513802,-0.652038,-0.979475
1,53cdb2fc8bc7dce0b6741e2150273451,0.134174,-0.524871,-0.161781,delivered,-1.204841,-0.328964,4,-0.264595,-0.112544,-0.086640,0.000467,0.429743
2,47770eb9100c2d0c44946d9cf07ec65d,-0.329907,0.330878,-0.161781,delivered,0.829977,-0.328964,5,-0.264595,-0.112544,0.111748,-0.164053,-0.145495
3,949d5b44dbf5de918fe9c16f97b45f8a,0.073540,0.279445,-0.161781,delivered,0.829977,-0.328964,5,-0.264595,-0.112544,-0.441525,0.206815,2.054621
4,ad21c59c0840e6cb83a9ceb5573f8159,-1.019535,-1.326297,-0.161781,delivered,0.829977,-0.328964,5,-0.264595,-0.112544,-0.562388,-0.652038,-0.959115
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95875,9c5dedf39a927c1b2549525ed64a053c,-0.454309,-0.587647,-0.161781,delivered,0.829977,-0.328964,5,-0.264595,-0.112544,-0.311513,-0.449408,-0.893033
95876,63943bddc261676b46f01ca7ac2f7bd8,1.023841,-0.031941,-0.161781,delivered,-1.204841,-0.328964,4,-0.264595,-0.112544,0.183977,-0.123156,-0.212797
95877,83c1379a015df1e13d02aae0204711ab,1.305780,0.758017,-0.161781,delivered,0.829977,-0.328964,5,-0.264595,-0.112544,0.333684,1.964490,0.617630
95878,11c177c8e97725db2631073c19f07b62,0.483664,1.524686,-0.161781,delivered,-1.204841,-0.328964,2,1.601605,-0.112544,1.075186,2.715522,-0.387558


🏁 Congratulations! 

💾 Don't forget to commit and push your `logit.ipynb` notebook !

In [57]:
# Logit model
logit5.summary()

0,1,2,3
Dep. Variable:,dim_is_five_star,No. Observations:,95872.0
Model:,Logit,Df Residuals:,95868.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 27 Oct 2022",Pseudo R-squ.:,0.04941
Time:,11:57:10,Log-Likelihood:,-61615.0
converged:,True,LL-Null:,-64817.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3367,0.007,47.258,0.000,0.323,0.351
wait_time,-0.4406,0.010,-43.982,0.000,-0.460,-0.421
delay_vs_expected,-0.4927,0.023,-21.324,0.000,-0.538,-0.447
price,-0.0008,0.007,-0.123,0.902,-0.014,0.012


In [58]:
# Linear regression model
linear_reg = smf.ols(formula='review_score ~ wait_time + delay_vs_expected + price', data=orders_z).fit()

linear_reg.summary()

0,1,2,3
Dep. Variable:,review_score,R-squared:,0.115
Model:,OLS,Adj. R-squared:,0.115
Method:,Least Squares,F-statistic:,4140.0
Date:,"Thu, 27 Oct 2022",Prob (F-statistic):,0.0
Time:,11:57:14,Log-Likelihood:,-154250.0
No. Observations:,95872,AIC:,308500.0
Df Residuals:,95868,BIC:,308500.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.1555,0.004,1064.082,0.000,4.148,4.163
wait_time,-0.3601,0.005,-65.495,0.000,-0.371,-0.349
delay_vs_expected,-0.0966,0.005,-17.595,0.000,-0.107,-0.086
price,-0.0227,0.004,-5.811,0.000,-0.030,-0.015

0,1,2,3
Omnibus:,19939.75,Durbin-Watson:,2.008
Prob(Omnibus):,0.0,Jarque-Bera (JB):,40348.789
Skew:,-1.246,Prob(JB):,0.0
Kurtosis:,4.972,Cond. No.,2.4
