In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# REGRESSION MODEL
# Read the exam CSV (path relative to the notebook's working directory)
# into the DataFrame that the following cells operate on.
df_exames = pd.read_csv(filepath_or_buffer='./datasets/exame_diabetes.csv')

In [None]:
# Inspect the frame's structure: prints a summary with column names,
# non-null counts, dtypes and memory usage.
df_exames.info()

In [4]:
# Remove features that are not relevant to the model (id_paciente).
# The result is reassigned instead of using inplace=True, and
# errors='ignore' skips the column when it is already gone, so this cell
# can be re-run without raising KeyError. `columns=` already selects the
# column axis, which makes an extra axis=1 argument redundant.
df_exames = df_exames.drop(columns=['id_paciente'], errors='ignore')

In [5]:
# One-hot encode the categorical 'genero' column: it is replaced by one
# int64 indicator column per category.
# NOTE(review): drop_first is left at its default, so every category keeps
# its own column — confirm this is what the downstream model expects.
df_exames = pd.get_dummies(
    data=df_exames,
    columns=['genero'],
    dtype='int64',
)

In [None]:
# Correlation heat map: check whether two features are strongly correlated,
# in which case one of them can be removed.
# vmin/vmax pin the colour scale to the full [-1, 1] correlation range;
# annot=True writes the correlation value inside each cell.
correlations = df_exames.corr()
sns.heatmap(correlations, vmin=-1, vmax=1, annot=True)

In [None]:
# Heat map restricted to the target feature (resultado): a single column
# holding its correlation with every feature, sorted from the highest
# correlation to the lowest.
# cmap chooses the colour palette used in the plot.
target_corr = df_exames.corr()[['resultado']]
target_corr = target_corr.sort_values(by='resultado', ascending=False)
sns.heatmap(target_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')

In [None]:
# Scatter-plot matrix: pairwise scatter plots of every column, with each
# column's distribution on the diagonal.
#   alpha    - point transparency
#   figsize  - size of the whole figure
#   diagonal - what to draw on the diagonal ('kde' = density estimate)
pd.plotting.scatter_matrix(
    frame=df_exames,
    alpha=0.2,
    figsize=(6, 6),
    diagonal='kde',
)

In [None]:
# Derive a new feature from existing ones to see whether it correlates
# with the target.
# IMC = weight (kg) / height (m) ** 2
# 'altura' is divided by 100 first — presumably stored in centimetres and
# converted to metres here (TODO confirm units in the dataset).
altura_m = df_exames['altura'] / 100
df_exames['imc'] = df_exames['peso'] / altura_m ** 2