In [27]:
import pandas as pd
import statsmodels.api as sm
# Load the Two-Year Demand Modeling data from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer="Copy of Two-Year Demand Modeling.csv")
# Bare trailing expression: the notebook renders the frame as a table.
df

Unnamed: 0,id,s22,z122,z222,z322,z422,z522,z622,z722,z922,...,div21,wid21,don21,esp21,al21,dr21,oc21,ic21,iw21,ow21
0,22,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1
1,24,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,0,0,0,0,0
2,29,2,0,0,0,0,0,0,1,0,...,1,0,0,1,1,0,0,0,0,0
3,30,11,1,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
4,31,12,0,0,0,1,0,0,0,0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,3743,4,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
279,3746,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
280,3748,6,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
281,3752,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# For Continuous Members

In [28]:
# CONTINUOUS members: rows where mstat21 == 1 and mstat22 == 1,
# i.e. the member was active in both 2021 and 2022.
in_both_years = df['mstat21'].eq(1) & df['mstat22'].eq(1)
CONTINUOUS_member_df = df[in_both_years]

# Predictors for the OLS fit of s22 (all taken from the "21" columns).
continuous_predictors = [
    's21', 'a21', 'fem21', 'hh21', 'full21', 'as21',
    'bl21', 'wh21', 'marr21', 'div21', 'wid21', 'don21',
    'al21', 'dr21', 'oc21', 'ic21', 'iw21', 'ow21',
]

# Design matrix with an explicit intercept column; response is s22.
X = sm.add_constant(CONTINUOUS_member_df[continuous_predictors])
y = CONTINUOUS_member_df['s22']

# Fit ordinary least squares on the continuous-member subset and report it.
model = sm.OLS(y, X).fit()
print(model.summary())



                            OLS Regression Results                            
Dep. Variable:                    s22   R-squared:                       0.572
Model:                            OLS   Adj. R-squared:                  0.496
Method:                 Least Squares   F-statistic:                     7.501
Date:                Sat, 22 Apr 2023   Prob (F-statistic):           6.46e-12
Time:                        02:55:13   Log-Likelihood:                -510.71
No. Observations:                 120   AIC:                             1059.
Df Residuals:                     101   BIC:                             1112.
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         20.6433     17.342      1.190      0.2

# For New Members

In [29]:
# Rows where mstat22 == 1: everyone who is a member in 2022, whether or not
# they were also a member in 2021.
# NOTE(review): because mstat21 is not checked, this subset also contains the
# continuous members (mstat21 == 1 and mstat22 == 1). If "new" is meant to be
# first-time 2022 members only, the filter needs a condition on mstat21 --
# confirm the intended definition.
is_member_2022 = df['mstat22'].eq(1)
NEW_member_df = df[is_member_2022]

# Predictors for the OLS fit of s22 (all taken from the "21" columns).
new_member_predictors = [
    's21', 'a21', 'fem21', 'hh21', 'full21', 'as21',
    'bl21', 'wh21', 'marr21', 'div21', 'wid21', 'don21',
    'al21', 'dr21', 'oc21', 'ic21', 'iw21', 'ow21',
]

# Design matrix with an explicit intercept column; response is s22.
X = sm.add_constant(NEW_member_df[new_member_predictors])
y = NEW_member_df['s22']

# Fit ordinary least squares on this subset and report it.
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    s22   R-squared:                       0.564
Model:                            OLS   Adj. R-squared:                  0.518
Method:                 Least Squares   F-statistic:                     12.20
Date:                Sat, 22 Apr 2023   Prob (F-statistic):           2.69e-22
Time:                        02:55:13   Log-Likelihood:                -783.35
No. Observations:                 189   AIC:                             1605.
Df Residuals:                     170   BIC:                             1666.
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          8.7062      1.922      4.530      0.0

# Model of attrition of existing members

In [30]:
# Lt is a 0/1 indicator stored as a new column on df:
#   1 when both s21 and s22 are strictly positive, 0 otherwise.
# The boolean mask is cast with astype(int) to obtain 0/1 integers.
both_years_positive = df['s21'].gt(0) & df['s22'].gt(0)
df['Lt'] = both_years_positive.astype(int)

# Explicitly force Lt to 0 wherever s21 == 0 (i.e. S_{t-1} = 0).
# Given the mask above these rows are already 0, so this statement only
# restates that rule; it does not change any value.
df.loc[df['s21'].eq(0), 'Lt'] = 0

In [31]:
# Restrict to rows with S_{t-1} > 0, i.e. s21 strictly positive.
had_s21 = df['s21'] > 0
subset = df[had_s21]

# Attrition model: logistic regression of the Lt indicator on "21" covariates.
# NOTE(review): this predictor set is not the same as the one used in the OLS
# cells -- it omits hh21, full21 and ow21 and adds esp21. Confirm that is
# intended.
import statsmodels.formula.api as smf

attrition_formula = (
    'Lt ~ s21 + a21 + fem21 + as21 + bl21 + wh21'
    ' + marr21 + div21 + wid21 + don21 + esp21'
    ' + al21 + dr21 + oc21 + ic21 + iw21'
)
model = smf.logit(formula=attrition_formula, data=subset)

# Fit by maximum likelihood and print the coefficient table.
result = model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.497677
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                     Lt   No. Observations:                  211
Model:                          Logit   Df Residuals:                      194
Method:                           MLE   Df Model:                           16
Date:                Sat, 22 Apr 2023   Pseudo R-squ.:                  0.2168
Time:                        02:55:13   Log-Likelihood:                -105.01
converged:                       True   LL-Null:                       -134.07
Covariance Type:            nonrobust   LLR p-value:                 1.080e-06
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -2.4917      1.633     -1.526      0.127      -5.692       0.708
s21            0.0704      0.