# Objetivo
___

Detectar logo nas primeiras horas de internação se o paciente necessitará de internação em UTI.

A análise proposta nesse estudo é usar a primeira janela de dados e verificar se com os primeiros exames é possível prever se o paciente precisará de UTI, pra saber se o paciente necessitou de internação em UTI foram pegos os dados da última janela "ICU" que diz se o paciente foi internado ou não.

# Importações

In [1]:
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report,plot_confusion_matrix
import matplotlib.pyplot as plt

In [2]:
path = 'https://raw.githubusercontent.com/Ederson-Branco/Data_Science/main/Datasets/covid_19_sirio_libanes.csv'
dados_raw = pd.read_csv(path,sep=';')

# Tratamento dos dados

In [3]:
dados_raw.head()

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,AGE_PERCENTIL,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,...,TEMPERATURE_DIFF,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,WINDOW,ICU
0,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0-2,0
1,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2-4,0
2,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,...,,,,,,,,,4-6,0
3,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.0,-1.0,,,,,-1.0,-1.0,6-12,0
4,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-238095238.0,-818181818.0,-389966555.0,407557994.0,-23046165.0,96774194.0,-242281528.0,-81443299.0,ABOVE_12,1


In [4]:
dados_raw.shape

(1925, 231)

In [5]:
dados_raw.isna().sum()

PATIENT_VISIT_IDENTIFIER        0
AGE_ABOVE65                     0
AGE_PERCENTIL                   0
GENDER                          0
DISEASE GROUPING 1              5
                             ... 
RESPIRATORY_RATE_DIFF_REL     748
TEMPERATURE_DIFF_REL          694
OXYGEN_SATURATION_DIFF_REL    686
WINDOW                          0
ICU                             0
Length: 231, dtype: int64

Preenchimento dos valores nulos

In [6]:
'''
Utilizado o método "bfill" pois o "ffill" irá pegar os dados do paciente anterior 
quando a primeira janela possuir valores nulos 
'''

dados_raw.iloc[1919:1922][['PATIENT_VISIT_IDENTIFIER','ALBUMIN_MEDIAN']]

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,ALBUMIN_MEDIAN
1919,383,210526316.0
1920,384,
1921,384,605263158.0


In [7]:
dados_bfill = dados_raw.copy()
dados_bfill.fillna(method='bfill',inplace=True) 
dados_bfill.iloc[1919:1922][['PATIENT_VISIT_IDENTIFIER','ALBUMIN_MEDIAN']]

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,ALBUMIN_MEDIAN
1919,383,210526316
1920,384,605263158
1921,384,605263158


Seleção dos dados, janela 0-2

In [8]:
dados = dados_bfill.query('WINDOW == "0-2"').reset_index().drop(columns='index')
dados

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,AGE_PERCENTIL,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,...,TEMPERATURE_DIFF,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,WINDOW,ICU
0,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1,-1,-1,-1,-1,-1,-1,-1,0-2,0
1,1,1,90th,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,0-2,1
2,2,0,10th,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-095959596,-051552795,-0351327692,-0747001091,-0756272401,-1,-0961262106,0-2,0
3,3,0,40th,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,0-2,0
4,4,0,10th,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0952380952,-097979798,-1,-0883668904,-0956805064,-0870967742,-0953536131,-0980333069,0-2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,380,0,40th,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,0-2,0
381,381,1,Above 90th,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0619047619,-1,-1,-1,-1,-1,-0612627073,-1,0-2,0
382,382,0,50th,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,0-2,0
383,383,0,40th,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1,-1,-1,-1,-1,-1,-1,-1,0-2,0


Seleção do target, onde a escolha foi o último dado por paciente do campo "ICU".

Conforme orientação do hospital Sírio-Líbanês os dados da janela "ABOVE_12" não devem ser usados no modelo, nesse caso foram pegos os dados da janela "6-12"

In [9]:
target = dados_raw.query('WINDOW == "6-12"')[['ICU']]
target = target.reset_index().drop(columns='index')
target

Unnamed: 0,ICU
0,0
1,1
2,0
3,0
4,0
...,...
380,0
381,0
382,0
383,0


In [10]:
frequencia = target.value_counts()
percentual = target.value_counts(normalize=True) * 100
dist_frequencia = pd.DataFrame({'Frequencia':frequencia,'Percentual (%)':percentual.round(2)})
dist_frequencia

AttributeError: 'DataFrame' object has no attribute 'value_counts'

In [None]:
frequencia.plot(kind='bar')
plt.title('Frequencia de internação em UTI')
plt.xticks(rotation=0)
plt.xlabel('Internação')
plt.show()

In [None]:
dados = dados.drop(columns=['WINDOW','ICU','AGE_PERCENTIL'])
dados['target'] = target
dados.set_index('PATIENT_VISIT_IDENTIFIER',inplace=True,drop=True)
dados

In [None]:
dados.info()

convertendo object para float

In [None]:
selecao = dados.select_dtypes(include='object').columns

for i in selecao:
  dados[i] = dados[i].str.replace(',','.').astype(float)


dados.info()

In [None]:
dados

# Separação dos dados para treino, teste e validação

In [None]:
X = dados.iloc[:300,0:-1]
y = dados['target'][0:300]
validacao = dados[300:]

# Seleção das features

Seleção automática através do feature_selection.RFECV

In [None]:
modelo = RandomForestClassifier(random_state=42,n_estimators=100)

In [None]:
treino_x, teste_x, treino_y, teste_y = train_test_split(X,y, random_state=42, test_size=0.3)

In [None]:
modelo.fit(treino_x,treino_y)

In [None]:
# modelo + RFECV
selecionador = RFECV(estimator=modelo,cv=5,step=1, scoring='accuracy')

# .fit RFECV
selecionador.fit(treino_x,treino_y)
treino_rfecv = selecionador.transform(treino_x)
teste_rfecv = selecionador.transform(teste_x)

# .fit modelo
modelo.fit(treino_rfecv,treino_y)

# Score
modelo.score(teste_rfecv,teste_y)

In [None]:
selecao = X.columns[selecionador.support_]

In [None]:
validacao = validacao[selecao]
dados = dados[selecao]

In [None]:
X.columns[selecionador.support_]

Colunas restantes após a seleção automática de features

In [None]:
dados.info()

In [None]:
X = dados.iloc[:300]

# Seleção do melhor modelo

In [None]:
X_treino,X_teste,y_treino,y_teste = train_test_split(X, y, random_state=42,test_size=0.3)

In [None]:
# Função para fazer o comparativo no resultado dos diferentes modelos

def verifica_acurácia(modelo):

  modelo.fit(X_treino,y_treino)
  predict = modelo.predict(X_teste)

  print(classification_report(y_teste,predict))

Para comparação foi utilizado um modelos svm, uma árvore de decisão e floresta aleatória
____

In [None]:
modelo_svm = SVC(random_state=42)
modelo_tree = DecisionTreeClassifier(random_state=42)
modelo_ensemble = RandomForestClassifier(random_state=42)

In [None]:
verifica_acurácia(modelo_tree)

In [None]:
verifica_acurácia(modelo_svm)

In [None]:
verifica_acurácia(modelo_ensemble)

# Validação

In [None]:
validacao

In [None]:
target[300:]

In [None]:
validacao_predict = modelo_ensemble.predict(validacao)

In [None]:
print(classification_report(target[300:],validacao_predict))

In [None]:
plot_confusion_matrix(modelo_ensemble,validacao,target[300:])

# Conclusão
____

Usando como base de estudo a primeira janela de internação e o último dado de ICU que nos diz se o paciente foi internado ou não,<br>
em conjunto com a função de seleção automática de features do scikit learn, obteve-se uma taxa de acerto de 75%.

Na minha opinião, por se tratar de pacientes, é necessário um novo estudo a fim de obter uma acurácia maior na predição de pacientes que necessitarão de internação em UTI