In [223]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import warnings
warnings.filterwarnings(action='ignore')

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score


In [224]:
df = pd.read_csv('./diabetes_prediction_dataset.csv')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [225]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [226]:
df.describe().T.round(2)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,100000.0,41.89,22.52,0.08,24.0,43.0,60.0,80.0
hypertension,100000.0,0.07,0.26,0.0,0.0,0.0,0.0,1.0
heart_disease,100000.0,0.04,0.19,0.0,0.0,0.0,0.0,1.0
bmi,100000.0,27.32,6.64,10.01,23.63,27.32,29.58,95.69
HbA1c_level,100000.0,5.53,1.07,3.5,4.8,5.8,6.2,9.0
blood_glucose_level,100000.0,138.06,40.71,80.0,100.0,140.0,159.0,300.0
diabetes,100000.0,0.08,0.28,0.0,0.0,0.0,0.0,1.0


In [227]:
df['gender'].value_counts()

gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64

In [228]:
# Mapeamento dos valores
gender_map = {'Male': 0, 'Female': 1, 'Other': 2}

# Aplicando o mapeamento à coluna 'gender'
df['gender'] = df['gender'].map(gender_map)


In [229]:
df['gender'].value_counts()

gender
1    58552
0    41430
2       18
Name: count, dtype: int64

In [230]:
df['smoking_history'].value_counts()

smoking_history
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64

In [231]:
smoking_map = {
    'No Info': 0,        # Sem informação
    'never': 1,          # Nunca fumou
    'former': 2,         # Ex-fumante
    'not current': 2,    # Não é fumante atualmente (pode ser similar a "former")
    'ever': 3,           # Já fumou em algum momento (pode ser entre "former" e "current")
    'current': 4         # Fumante atual
}

df['smoking_history'] = df['smoking_history'].map(smoking_map)


In [232]:
df['smoking_history'].value_counts()

smoking_history
0    35816
1    35095
2    15799
4     9286
3     4004
Name: count, dtype: int64

In [233]:
df['diabetes'].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [234]:
# Separando as features (X) e a variável alvo (y)
X = df.drop(columns=['diabetes'], axis=1)
y = df['diabetes']

# Dividindo os dados em treino e teste
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y,
                                                test_size=0.1,
                                                stratify=y,
                                                random_state=42)

# Aplicar SMOTE no conjunto de treino
smote = SMOTE(random_state=42)
Xtrain_SMOTE, ytrain_SMOTE = smote.fit_resample(Xtrain, ytrain)

# Aplicando a padronização
scaler = StandardScaler()

# Padronizando os dados de treino e teste
Xtrain_SMOTE = scaler.fit_transform(Xtrain_SMOTE)
Xtest = scaler.transform(Xtest)

# Criando o modelo de Regressão Logística com class_weight='balanced'
lr = LogisticRegression(class_weight='balanced', max_iter=500)

# Treinando o modelo com os dados padronizados
lr.fit(Xtrain_SMOTE, ytrain_SMOTE)

# Avaliando o modelo
train_score = lr.score(Xtrain_SMOTE, ytrain_SMOTE)
test_score = lr.score(Xtest, ytest)

print(f'Acurácia nos dados de treino: {train_score}')
print(f'Acurácia nos dados de teste: {test_score}')

# Gerar o relatório de classificação para os dados de teste
ypred = lr.predict(Xtest)
report = classification_report(ytest, ypred, target_names=['Non-Diabetic', 'Diabetic'])
print(report)

Acurácia nos dados de treino: 0.8931511839708561
Acurácia nos dados de teste: 0.8902
              precision    recall  f1-score   support

Non-Diabetic       0.99      0.89      0.94      9150
    Diabetic       0.43      0.86      0.57       850

    accuracy                           0.89     10000
   macro avg       0.71      0.88      0.75     10000
weighted avg       0.94      0.89      0.91     10000



In [235]:
def result_test(new_pred):
  result = lr.predict(new_pred)[0]
  result_prob = round(lr.predict_proba(new_pred)[:,1][0], 4)
  
  if result == 1:
    print(f'Resultado: {result} | Existe {result_prob*100}% chance de o(a) pasciente ser diabetico.')
  else:
    print(f'Resultado: {result} | Existe {result_prob*100}% chance de o(a) pasciente não ser diabetico.')

In [236]:
# Exemplo 1
new_pred1 = np.array([0, 45.0, 1, 0, 1, 30.50, 6.0, 150]).reshape(1, -1)

# Exemplo 2
new_pred2 = np.array([1, 55.0, 0, 1, 4, 28.70, 7.5, 170]).reshape(1, -1)

# Exemplo 3
new_pred3 = np.array([0, 35.0, 0, 0, 0, 24.60, 5.2, 110]).reshape(1, -1)

# Exemplo 4
new_pred4 = np.array([2, 25.0, 0, 0, 2, 26.50, 4.8, 95]).reshape(1, -1)

# Exemplo 5
new_pred5 = np.array([1, 65.0, 1, 1, 3, 29.80, 6.8, 210]).reshape(1, -1)


In [237]:
result_test(new_pred1)
print()
result_test(new_pred2)
print()
result_test(new_pred3)
print()
result_test(new_pred4)
print()
result_test(new_pred5)

Resultado: 1 | Existe 100.0% chance de o(a) pasciente ser diabetico.

Resultado: 1 | Existe 100.0% chance de o(a) pasciente ser diabetico.

Resultado: 1 | Existe 100.0% chance de o(a) pasciente ser diabetico.

Resultado: 1 | Existe 100.0% chance de o(a) pasciente ser diabetico.

Resultado: 1 | Existe 100.0% chance de o(a) pasciente ser diabetico.
