In [3]:
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [5]:
# Load the heart-failure clinical records, drop every row that has a missing
# value and store the `sex` column as integers.
df = (
    pd.read_csv("heart_failure_clinical_records_dataset.csv")
    .dropna()
    .astype({"sex": "int"})
)

In [7]:
# overview of the dataset: column names, non-null counts and the dtype of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 32.7 KB


In [8]:
# the first five rows of the dataset
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [9]:
df.describe()
# summary statistics for the numeric columns of the dataset
# count = number of non-null values, mean = average, std = standard deviation,
# min / max = smallest / largest value.
# 25%, 50% and 75% are percentiles (quartiles): the value below which that share of the
# observations falls. They are NOT the mean of the first 25% / 50% / 75% of the values.

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [10]:
df.shape
# (rows, columns): the dataset has 299 rows and 13 columns

(299, 13)

In [11]:
# number of distinct values found in each column of the dataset
df.nunique()

age                          47
anaemia                       2
creatinine_phosphokinase    208
diabetes                      2
ejection_fraction            17
high_blood_pressure           2
platelets                   176
serum_creatinine             40
serum_sodium                 27
sex                           2
smoking                       2
time                        148
DEATH_EVENT                   2
dtype: int64

In [12]:
# summary statistics of the `time` column only
df["time"].describe()

count    299.000000
mean     130.260870
std       77.614208
min        4.000000
25%       73.000000
50%      115.000000
75%      203.000000
max      285.000000
Name: time, dtype: float64

In [6]:
# Age distribution of the patients, with the bars coloured by the age value.
fig = px.histogram(
    df,
    x="age",
    color="age",
    labels={"age": "Idade"},
    template="plotly_dark",
)
fig.show()

# Original author's note: rows 187 and 190 of the dataset have the fractional age 60.667.

In [13]:
# number of male and female patients (row count for each value of the `sex` column)
df["sex"].value_counts()

1    194
0    105
Name: sex, dtype: int64

In [7]:
# Proportion of each sex among the heart-failure patients in the dataset.
fig1 = px.pie(
    df,
    names="sex",
    color="sex",
    labels={"sex": "Gender"},
    title="Distribuição de Insuficiência Cardíaca por Sexo",
    template="plotly_dark",
)
fig1.update_layout(title_x=0.5)  # horizontally centre the title
fig1.show()

In [14]:
# number of patients with and without diabetes (row count for each value of `diabetes`)
df["diabetes"].value_counts()

0    174
1    125
Name: diabetes, dtype: int64

In [11]:
# Proportion of patients with and without diabetes.
# NOTE(review): df_agrupado7 (per-diabetes-group column means) is not used by the chart
# below, which is built from `df` directly.
df_agrupado7 = (
    df.groupby("diabetes")
    .mean()
    .reset_index()
)
fig8 = px.pie(
    df,
    names="diabetes",
    color="diabetes",
    title="Distribuição de Diabetes entre os Pacientes",
    template="plotly_dark",
)
fig8.update_layout(title_x=0.5)  # horizontally centre the title
fig8.show()

In [15]:
# number of smoking and non-smoking patients (row count for each value of `smoking`)
df["smoking"].value_counts()

0    203
1     96
Name: smoking, dtype: int64

In [12]:
# Proportion of smoking and non-smoking patients.
# NOTE(review): df_agrupado6 (per-smoking-group column means) is not used by the chart
# below, which is built from `df` directly.
df_agrupado6 = (
    df.groupby("smoking")
    .mean()
    .reset_index()
)
fig7 = px.pie(
    df,
    names="smoking",
    color="smoking",
    title="Distribuição de Fumantes entre os Pacientes",
    template="plotly_dark",
)
fig7.update_layout(title_x=0.5)  # horizontally centre the title
fig7.show()

In [8]:
# Mean ejection fraction for each patient age.
# df_agrupado holds the mean of every column per distinct age; later plotting cells
# read from this frame as well, so its name must stay as is.
df_agrupado = (
    df.groupby("age")
    .mean()
    .reset_index()
)
fig2 = px.line(
    df_agrupado,
    x="age",
    y="ejection_fraction",
    template="plotly_dark",
)
fig2.show()

In [9]:
# Mean serum sodium level for each patient age.
# NOTE(review): df_agrupado2 (means grouped by serum_sodium) is not used by this chart;
# the line is drawn from df_agrupado, the per-age means built in an earlier cell.
df_agrupado2 = (
    df.groupby("serum_sodium")
    .mean()
    .reset_index()
)
fig3 = px.line(
    df_agrupado,
    x="age",
    y="serum_sodium",
    template="plotly_dark",
)
fig3.show()

In [10]:
# Mean platelet count for each patient age, drawn as horizontal bars.
# NOTE(review): df_agrupado3 (means grouped by platelets) is not used by this chart;
# the bars are drawn from df_agrupado, the per-age means built in an earlier cell.
df_agrupado3 = (
    df.groupby("platelets")
    .mean()
    .reset_index()
)
fig4 = px.bar(
    df_agrupado,
    x="platelets",
    y="age",
    orientation="h",
    color="age",
    color_continuous_scale="sunset",
    title="Quantidade de Plaquetas por Idade",
    template="plotly_dark",
)
fig4.update_layout(title_x=0.5)  # horizontally centre the title
fig4.show()

In [14]:
# Mean serum creatinine for each patient age.
# The chart plots the `serum_creatinine` column, so the title must name serum creatinine.
# Creatinine phosphokinase (CPK) is a different column of the dataset
# (`creatinine_phosphokinase`) and is not what is drawn here.
# NOTE(review): df_agrupado4 (means grouped by serum_creatinine) is not used by this
# chart; it is kept only in case other cells reference it. The line is drawn from
# df_agrupado, the per-age means built in an earlier cell.
df_agrupado4 = df.groupby("serum_creatinine").mean().reset_index()
fig5 = px.line(
    df_agrupado,
    x="age",
    y="serum_creatinine",
    template="plotly_dark",
    title="Quantidade de Creatinina Sérica por Idade",
)
fig5.update_layout(title_x=0.5)  # horizontally centre the title
fig5.show()

In [15]:
# Percentage of deaths for each patient age.
# NOTE(review): df_agrupado5 (means grouped by DEATH_EVENT) is not used by this chart;
# it is kept only in case other cells reference it.
df_agrupado5 = df.groupby("DEATH_EVENT").mean().reset_index()

# DEATH_EVENT only takes the values 0 and 1, so its mean within an age group is the
# fraction of patients of that age who died; multiply by 100 to get a percentage.
# Computed from `df` directly so this cell does not depend on another cell's frame.
mortes_por_idade = (
    df.groupby("age")["DEATH_EVENT"]
    .mean()
    .mul(100)
    .reset_index()
)

# Age goes on the x axis and the death percentage on the y axis. With the axes the
# other way round (and no horizontal orientation) the bars were positioned by death
# rate and their height was the age, which does not match the title.
fig6 = px.bar(
    mortes_por_idade,
    x="age",
    y="DEATH_EVENT",
    labels={"age": "Idade", "DEATH_EVENT": "Mortes (%)"},
    color="age",
    color_continuous_scale="sunset",
    title="Porcentagem de Mortes por Idade",
    template="plotly_dark",
)
fig6.update_layout(title_x=0.5)  # horizontally centre the title
fig6.show()