# <h2 align="center">Regressão Logística prevendo doenças</h2>

Data Scientist.: Dr.Eddy Giusepe Chirinos Isidro

Link de estudo:

* [Prever doenças com Dados de Framingham-EUA](https://www.linkedin.com/pulse/regress%25C3%25A3o-log%25C3%25ADstica-prevendo-doen%25C3%25A7as-gabriel-constantin/?trackingId=RsCzK4sjTmGnTp68NQS8wA%3D%3D)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv("HeartDisease.csv", sep=",")
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
df.shape

(4238, 16)

In [5]:
df.isna().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [6]:
df = df.dropna()

In [7]:
df.isna().sum()

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [8]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [9]:
X = df.iloc[:,0:14]
y = df.iloc[:,-1]

In [11]:
X

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4231,1,58,3.0,0,0.0,0.0,0,1,0,187.0,141.0,81.0,24.96,80.0
4232,1,68,1.0,0,0.0,0.0,0,1,0,176.0,168.0,97.0,23.14,60.0
4233,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0
4234,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0


In [12]:
y

0       0
1       0
2       0
3       1
4       0
       ..
4231    0
4232    1
4233    1
4234    0
4237    0
Name: TenYearCHD, Length: 3656, dtype: int64

In [13]:
bestfeatures = SelectKBest(score_func=chi2, k=10)

fit = bestfeatures.fit(X,y)

dfscores = pd.DataFrame(fit.scores_)

dfcolumns = pd.DataFrame(X.columns)

In [14]:
dfscores

Unnamed: 0,0
0,17.12043
1,295.507761
2,7.679797
3,0.686913
4,156.567318
5,28.153003
6,8.497823
7,82.967184
8,31.027987
9,249.153078


In [15]:
dfcolumns

Unnamed: 0,0
0,male
1,age
2,education
3,currentSmoker
4,cigsPerDay
5,BPMeds
6,prevalentStroke
7,prevalentHyp
8,diabetes
9,totChol


In [17]:
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']

print(featureScores.nlargest(11,'Score'))

              Specs       Score
10            sysBP  669.506552
1               age  295.507761
9           totChol  249.153078
4        cigsPerDay  156.567318
11            diaBP  142.878574
7      prevalentHyp   82.967184
8          diabetes   31.027987
5            BPMeds   28.153003
0              male   17.120430
12              BMI   15.730717
6   prevalentStroke    8.497823


In [20]:
# Vamos organizar de forma decrescente: 
featureScores = featureScores.sort_values(by='Score', ascending=False)
featureScores

Unnamed: 0,Specs,Score
10,sysBP,669.506552
1,age,295.507761
9,totChol,249.153078
4,cigsPerDay,156.567318
11,diaBP,142.878574
7,prevalentHyp,82.967184
8,diabetes,31.027987
5,BPMeds,28.153003
0,male,17.12043
12,BMI,15.730717


In [21]:
# Vamos criar um novo dataframe contendo somente as features mais relevantes:
df = df[['sysBP', 'glucose','age','totChol','cigsPerDay','diaBP','prevalentHyp','diabetes','BPMeds','male','TenYearCHD']]

df.head()

Unnamed: 0,sysBP,glucose,age,totChol,cigsPerDay,diaBP,prevalentHyp,diabetes,BPMeds,male,TenYearCHD
0,106.0,77.0,39,195.0,0.0,70.0,0,0,0.0,1,0
1,121.0,76.0,46,250.0,0.0,81.0,0,0,0.0,0,0
2,127.5,70.0,48,245.0,20.0,80.0,0,0,0.0,1,0
3,150.0,103.0,61,225.0,30.0,95.0,1,0,0.0,0,1
4,130.0,85.0,46,285.0,23.0,84.0,0,0,0.0,0,0


In [63]:
X = df.iloc[:,0:9].values
y = df.iloc[:,-1].values

In [64]:
# Instanciaremos nosso modelo:
modelo = LogisticRegression()

In [65]:
# Treinamos o modelo:
modelo.fit(X,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [68]:
# salvar modelo
import pickle

# salvar o modelo de Regressão Logistica
with open('modelo_regress_Log_local.pkl', 'wb') as file:
    pickle.dump(modelo, file)