In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Atividade para trabalhar o pré-processamento dos dados.

Criação de modelo preditivo para diabetes e envio para verificação de peformance
no servidor.

@author: Aydano Machado <aydano.machado@gmail.com>
"""

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import requests

In [3]:
print('\n - Lendo o arquivo com o dataset sobre diabetes')
data = pd.read_csv('diabetes_dataset.csv')


 - Lendo o arquivo com o dataset sobre diabetes


<h1>Atributos do dataset:</h1>
<p><b>Pregnancies</b>: número de vezes grávida</p>
<p><b>Glucose</b>: concentração plasmática de glicose a 2 horas em um teste oral de tolerância à glicose</p>
<p><b>BloodPressure</b>: pressão arterial diastólica (mm Hg)</p>
<p><b>SkinThickness</b>: espessura da dobra da pele do tríceps (mm)</p>
<p><b>Insulin</b>: insulina sérica de 2 horas (mu U/ml)</p>
<p><b>BMI</b>: índice de massa corporal (peso em kg / (altura em m) ^ 2)</p>
<p><b>DiabetesPedigreeFunction</b>: função de pedigree do diabetes</p>
<p><b>Age</b>: idade (anos)</p>
<p><b>Outcome</b>: variável de classe (0 ou 1) para diabetes</p>

In [6]:
data.dropna().corr(method='spearman')

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.106651,0.153842,0.020238,0.024915,-0.149254,-0.0162,0.632766,0.144517
Glucose,0.106651,1.0,0.180647,0.162918,0.686544,0.158235,0.083975,0.217383,0.480197
BloodPressure,0.153842,0.180647,1.0,0.164492,0.088954,0.174816,-0.039403,0.269746,0.048258
SkinThickness,0.020238,0.162918,0.164492,1.0,0.18982,0.667521,0.090249,0.215697,0.24785
Insulin,0.024915,0.686544,0.088954,0.18982,1.0,0.294094,0.153865,0.134011,0.388614
BMI,-0.149254,0.158235,0.174816,0.667521,0.294094,1.0,0.116361,0.077851,0.216644
DiabetesPedigreeFunction,-0.0162,0.083975,-0.039403,0.090249,0.153865,0.116361,1.0,0.088955,0.235171
Age,0.632766,0.217383,0.269746,0.215697,0.134011,0.077851,0.088955,1.0,0.325426
Outcome,0.144517,0.480197,0.048258,0.24785,0.388614,0.216644,0.235171,0.325426,1.0


In [7]:
data.isna().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [8]:
data['BloodPressure'] = data['BloodPressure'].fillna(0)
data['Insulin'] = data['Insulin'].fillna(data['Insulin'].mean())
data['Glucose'] = data['Glucose'].fillna(data['Glucose'].mean())
data['SkinThickness'] = data['SkinThickness'].fillna(data['SkinThickness'].mean())
data['BMI'] = data['BMI'].fillna(0)

In [165]:
#Tratamento de dados faltantes
from sklearn.model_selection import train_test_split #testar o modelo
from sklearn.impute import SimpleImputer             #tratar os valores NaN no df

# Criando X and y par ao algorítmo de aprendizagem de máquina.\
print(' - Criando X e y para o algoritmo de aprendizagem a partir do arquivo diabetes_dataset')
# Caso queira modificar as colunas consideradas basta algera o array a seguir.
#feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 
#                'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

feature_cols = high_corr_cols

X = data[feature_cols]
y = data.Outcome

# separando os dados em treino e teste
X_train,x_test,Y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=0)

#imputer aplica o metodo de substituição dos valores NaN no df pelo metodo passado, por default é a média
imputer = SimpleImputer()

X_train = imputer.fit_transform(X_train,Y_train)
x_test = imputer.transform(x_test)

 - Criando X e y para o algoritmo de aprendizagem a partir do arquivo diabetes_dataset


In [166]:
# padronizando a escala dos valores

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train,Y_train)
x_test = scaler.transform(x_test)

# transformando np arrays em df para facil visualização
X_train = pd.DataFrame(X_train,columns=feature_cols)
x_test = pd.DataFrame(x_test,columns=feature_cols)

In [167]:
# Ciando o modelo preditivo para a base trabalhada
print(' - Criando modelo preditivo')
neigh = KNeighborsClassifier(n_neighbors=3)

neigh.fit(X_train, Y_train)

 - Criando modelo preditivo


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [168]:
print(' - Aplicando modelo e enviando para o servidor')

data_app = pd.read_csv('diabetes_app.csv')
data_app = scaler.transform(data_app[feature_cols])

y_pred = neigh.predict(data_app)

 - Aplicando modelo e enviando para o servidor


In [69]:
# Enviando previsões realizadas com o modelo para o servidor

URL = "https://aydanomachado.com/mlclass/01_Preprocessing.php"

#TODO Substituir pela sua chave aqui
DEV_KEY = "Andre Santos"

# json para ser enviado para o servidor
data = {'dev_key':DEV_KEY,
        'predictions':pd.Series(y_pred).to_json(orient='values')}

# Enviando requisição e salvando o objeto resposta
r = requests.post(url = URL, data = data)

# Extraindo e imprimindo o texto da resposta
pastebin_url = r.text
print(" - Resposta do servidor:\n", r.text, "\n")
'''

 - Resposta do servidor:
 {"status":"success","dev_key":"Andre Santos","accuracy":0.5714285714285714,"old_accuracy":0} 

