# Felipe Veloso Mod13

# Desafío 1: Preparar el ambiente de trabajo

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
plt.style.use('seaborn')
plt.rcParams['figure.figsize'] = (10, 6)

In [9]:
df = pd.read_csv('southafricanheart.csv').drop('Unnamed: 0',axis=1)
df.head()

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,160,12.0,5.73,23.11,Present,49,25.3,97.2,52,1
1,144,0.01,4.41,28.61,Absent,55,28.87,2.06,63,1
2,118,0.08,3.48,32.28,Present,52,29.14,3.81,46,0
3,170,7.5,6.41,38.03,Present,51,31.99,24.26,58,1
4,134,13.6,3.5,27.78,Present,60,25.99,57.34,49,1


# Desafío 2

In [17]:
df = pd.get_dummies(df)
df.drop('famhist_Present',axis=1)

Unnamed: 0,sbp,tobacco,ldl,adiposity,typea,obesity,alcohol,age,chd,famhist_Absent
0,160,12.00,5.73,23.11,49,25.30,97.20,52,1,0
1,144,0.01,4.41,28.61,55,28.87,2.06,63,1,1
2,118,0.08,3.48,32.28,52,29.14,3.81,46,0,0
3,170,7.50,6.41,38.03,51,31.99,24.26,58,1,0
4,134,13.60,3.50,27.78,60,25.99,57.34,49,1,0
5,132,6.20,6.47,36.21,62,30.77,14.14,45,0,0
6,142,4.05,3.38,16.20,59,20.81,2.62,38,0,1
7,114,4.08,4.59,14.60,62,23.11,6.72,58,1,0
8,114,0.00,3.83,19.40,49,24.86,2.49,29,0,0
9,132,0.00,5.80,30.96,69,30.11,0.00,53,1,0


In [22]:
m1_logit = smf.logit('chd ~ famhist_Absent',df).fit()

Optimization terminated successfully.
         Current function value: 0.608111
         Iterations 5


In [23]:
def concise_summary(mod,print_fit=True):
    fit = pd.DataFrame({'Statistics': mod.summary2().tables[0][0][2:],
                       'Value': mod.summary2().tables[0][3][2:]})
    estimates = pd.DataFrame(mod.summary2().tables[1].loc[:,'Coef.':'Std.Err.'])
    if print_fit is True:
        print("\nGoodnes of Fit statistics\n", fit)
    print("\nPoint Estimates\n\n", estimates)

In [24]:
concise_summary(m1_logit)


Goodnes of Fit statistics
           Statistics       Value
2              Date:    574.1655
3  No. Observations:     -280.95
4          Df Model:     -298.05
5      Df Residuals:  4.9371e-09
6         Converged:      1.0000
7    No. Iterations:            

Point Estimates

                        Coef.  Std.Err.
Intercept      -3.765751e-16  0.144338
famhist_Absent -1.168993e+00  0.203255


In [31]:
#si persona no tiene antecedentes familiares
estimate_y_sin_ant = m1_logit.params['Intercept']
print(estimate_y_sin_ant)

-3.765751256899389e-16


In [32]:
#si persona tiene antecedentes familiares
estimate_y_ant = m1_logit.params['Intercept'] + (m1_logit.params['famhist_Absent'])
print(estimate_y_ant)

-1.1689930854299087


In [33]:
def invlogit(x):
    return 1/(1+ np.exp(-x))
print (invlogit(estimate_y_ant) , invlogit(estimate_y_sin_ant))

0.23703703703703713 0.4999999999999999


a) la probabilidad de que un individuo con antecedentes tenga una enfermedad coronaria es del 50%
b) la prob de un individuo sin antecedentes es del 24%
c) la dif entre un individuo con o sin antecedentes del 26%

# Replicar el modelo smf.ols

In [37]:
m1_ols = smf.ols('chd ~ famhist_Absent', df).fit()
concise_summary(m1_ols)


Goodnes of Fit statistics
           Statistics     Value
2              Date:  601.4437
3  No. Observations:   -294.59
4          Df Model:     36.86
5      Df Residuals:  2.66e-09
6         R-squared:   0.21050

Point Estimates

                    Coef.  Std.Err.
Intercept       0.500000  0.033111
famhist_Absent -0.262963  0.043313


In [38]:
0.500000 / 0.033111

15.100721814502732

In [39]:
-0.262963 / 0.043313

-6.0712257289959135

# Estime el mismo modelo con LPM

In [42]:
concise_summary(m1_ols,print_fit=False)


Point Estimates

                    Coef.  Std.Err.
Intercept       0.500000  0.033111
famhist_Absent -0.262963  0.043313


# Desafío 3: Estimación completa

In [43]:
df.columns

Index(['sbp', 'tobacco', 'ldl', 'adiposity', 'typea', 'obesity', 'alcohol',
       'age', 'chd', 'famhist_Absent', 'famhist_Present'],
      dtype='object')

In [44]:
all_reg = smf.ols('chd ~ sbp + tobacco+ldl+adiposity+typea+obesity+alcohol+age+famhist_Absent', df).fit()
all_reg.summary()

0,1,2,3
Dep. Variable:,chd,R-squared:,0.236
Model:,OLS,Adj. R-squared:,0.221
Method:,Least Squares,F-statistic:,15.51
Date:,"Tue, 25 Jun 2019",Prob (F-statistic):,3.92e-22
Time:,20:06:47,Log-Likelihood:,-250.21
No. Observations:,462,AIC:,520.4
Df Residuals:,452,BIC:,561.8
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.3346,0.210,-1.592,0.112,-0.748,0.079
sbp,0.0013,0.001,1.265,0.206,-0.001,0.003
tobacco,0.0166,0.005,3.412,0.001,0.007,0.026
ldl,0.0332,0.011,3.108,0.002,0.012,0.054
adiposity,0.0023,0.005,0.483,0.629,-0.007,0.012
typea,0.0061,0.002,2.986,0.003,0.002,0.010
obesity,-0.0112,0.007,-1.589,0.113,-0.025,0.003
alcohol,-0.0002,0.001,-0.285,0.776,-0.002,0.001
age,0.0068,0.002,3.445,0.001,0.003,0.011

0,1,2,3
Omnibus:,81.628,Durbin-Watson:,1.984
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29.329
Skew:,0.397,Prob(JB):,4.28e-07
Kurtosis:,2.055,Cond. No.,1730.0


In [72]:
df_depurado = df[['sbp','tobacco','ldl','typea','obesity','age','famhist_Absent']]

In [70]:
m1_depurado = smf.logit('chd ~ tobacco+ldl+typea+age+famhist_Absent',df).fit()

Optimization terminated successfully.
         Current function value: 0.514811
         Iterations 6


In [71]:
concise_summary(m1_depurado)


Goodnes of Fit statistics
           Statistics       Value
2              Date:    512.4990
3  No. Observations:     -237.84
4          Df Model:     -298.05
5      Df Residuals:  2.5537e-24
6         Converged:      1.0000
7    No. Iterations:            

Point Estimates

                    Coef.  Std.Err.
Intercept      -5.538269  0.927815
tobacco         0.080375  0.025880
ldl             0.161992  0.054969
typea           0.037115  0.012167
age             0.050460  0.010206
famhist_Absent -0.908175  0.225758


#regresion logistica,  estadisticos de bondad, r^2(reg lineal), df residual,
b) al depurar se denota con el estadistico residual, este modelo tiene mejor ajuste en base a su residuo
c) algunos interceptos mejoran en un modelo mas acotado

# Desafío 4: Estimación de perfiles

In [83]:
tobacco_mean = df['tobacco'].mean()
ldl_mean = df['ldl'].mean()
typea_mean = df['typea'].mean()
age_mean = df['age'].mean()
famhist_mean = df['famhist_Absent'].mean()
ldl_max = df['ldl'].max()
ldl_min = df['ldl'].min()

estimate_y_depurado = m1_depurado.params['Intercept'] + m1_depurado.params['tobacco']*tobacco_mean +m1_depurado.params['ldl']*ldl_mean + m1_depurado.params['typea']*typea_mean +  m1_depurado.params['age']*age_mean + m1_depurado.params['famhist_Absent']*famhist_mean
estimate_y_ldl_max= m1_depurado.params['Intercept'] + m1_depurado.params['tobacco']*tobacco_mean +m1_depurado.params['ldl']*ldl_max + m1_depurado.params['typea']*typea_mean +  m1_depurado.params['age']*age_mean + m1_depurado.params['famhist_Absent']*famhist_mean
estimate_y_ldl_min = m1_depurado.params['Intercept'] + m1_depurado.params['tobacco']*tobacco_mean +m1_depurado.params['ldl']*ldl_min + m1_depurado.params['typea']*typea_mean +  m1_depurado.params['age']*age_mean + m1_depurado.params['famhist_Absent']*famhist_mean
print(estimate_y_depurado , estimate_y_ldl_max, estimate_y_ldl_min)

-0.8774365278532722 0.838002299485925 -1.4865776727590911


In [84]:
def invlogit(x):
    return 1/(1+ np.exp(-x))
print (invlogit(estimate_y_depurado), invlogit(estimate_y_ldl_max), invlogit(estimate_y_ldl_min))

0.2937092748158695 0.6980443104466211 0.18443595575404645


cuando se tiene altos niveles de ldl, las probabilidades es probable tener una enfermedad