## Logistic Regression

In [1]:
import pandas as pd
import statsmodels.api as sm
import math
from sklearn.linear_model import LogisticRegression

## 1. Import Data

In [2]:
df=pd.read_csv('./data/groupon.csv')
print(len(df))
df.head()

710


Unnamed: 0,deal_id,start_date,min_req,treatment,prom_length,price,discount_pct,coupon_duration,featured,limited_supply,fb_likes,quantity_sold,revenue
0,heli-flights,9/23/2011,10,1,4,99,51,185,1,1,290,540,53460
1,gl-champion-series-tennis-electric-factory,9/23/2011,20,1,2,95,41,9,0,1,43,190,18050
2,realm-of-terror-horror-experience,9/23/2011,50,1,3,25,50,38,0,1,208,380,9500
3,newport-gourmet,9/23/2011,15,1,3,50,50,369,0,1,16,90,4500
4,the-clayroom,9/23/2011,20,1,4,25,52,185,0,1,85,580,14500


## 2. Choose (Dependent and Independent) Variables

In [3]:
# choose features for propensity score calculation
X = df[['prom_length', 'price', 'discount_pct', 'coupon_duration', 'featured', 'limited_supply']]
## hence @3. Matching Procedure, exclude=['min_req', 'start_date', 'fb_likes', 'quantity_sold', 'revenue']
y = df['treatment']

X.head()

Unnamed: 0,prom_length,price,discount_pct,coupon_duration,featured,limited_supply
0,4,99,51,185,1,1
1,2,95,41,9,0,1
2,3,25,50,38,0,1
3,3,50,50,369,0,1
4,4,25,52,185,0,1


## 3. Model for the Logistic Regression

In [4]:
lr = LogisticRegression()
lr.fit(X, y)

LogisticRegression()

In [5]:
# get the coefficients
coeffs=pd.DataFrame({
    'column':X.columns.to_numpy(),
    'coeff':lr.coef_.ravel().round(4),
    'p_value': sm.OLS(y, sm.add_constant(X)).fit().pvalues.drop(['const'])
})
coeffs

Unnamed: 0,column,coeff,p_value
prom_length,prom_length,-0.3284,1e-05
price,price,-0.0086,0.008066
discount_pct,discount_pct,-0.0079,0.527169
coupon_duration,coupon_duration,0.0035,1.8e-05
featured,featured,0.308,0.209979
limited_supply,limited_supply,-0.3665,0.040335


In [6]:
# prediction
pred_binary=lr.predict(X)
pred_prob=lr.predict_proba(X)

id=17
print('the binary prediction is:', pred_binary[17])
print('the corresponding probabilities are:', pred_prob[17].round(4))

the binary prediction is: 1
the corresponding probabilities are: [0.3963 0.6037]
