INGESTÃO DE DADOS


In [20]:
import pandas as pd
import os

# Location of every raw CSV, keyed by the table name used throughout the notebook.
file_paths = {
    table: f'rawcsvs/{csv_name}'
    for table, csv_name in (
        ('admissions', 'ADMISSIONS.csv'),
        ('microbiology_events', 'MICROBIOLOGYEVENTS.csv'),
        ('prescriptions', 'PRESCRIPTIONS.csv'),
        ('labevents', 'LABEVENTS.csv'),
        ('diagnoses_icd', 'DIAGNOSES_ICD.csv'),
        ('patients', 'PATIENTS.csv'),
        ('transl-labitems', 'D_LABITEMS.csv'),
        ('transl-diag', 'D_ICD_DIAGNOSES.csv'),
    )
}

# Columns to read from each table (same keys as file_paths).
cols_to_use = {
    'admissions': ['subject_id', 'admittime', 'ethnicity', 'admission_type'],
    'microbiology_events': [
        'row_id', 'subject_id', 'chartdate', 'charttime',
        'spec_itemid', 'spec_type_desc',
        'org_itemid', 'org_name', 'isolate_num',
        'ab_itemid', 'ab_name',
        'dilution_text', 'dilution_comparison', 'dilution_value',
        'interpretation',
    ],
    'prescriptions': ['subject_id', 'hadm_id', 'drug', 'enddate'],
    'labevents': ['subject_id', 'itemid', 'valuenum', 'valueuom', 'flag'],
    # TODO: collapse all ICD-9 codes of a subject into a single list per row.
    'diagnoses_icd': ['subject_id', 'icd9_code'],
    # Lookup table: use together with labevents (shared 'itemid') to get a readable label.
    'transl-labitems': ['itemid', 'label'],
    'patients': ['subject_id', 'gender'],
    'transl-diag': ['icd9_code', 'short_title', 'long_title'],
}

# The parquet export below fails if its target directory is missing, so create it up front.
os.makedirs('cleanedparquets', exist_ok=True)

# files maps each table name to its DataFrame, restricted to the columns in cols_to_use.
files = {}
for table_name, csv_path in file_paths.items():
    frame = pd.read_csv(csv_path, usecols=cols_to_use[table_name])

    # Lookup ('transl-*') tables are loaded without a subject_id column, so they
    # are kept exactly as read and are not exported.
    if not table_name.startswith('transl'):
        # Drop rows without a subject and exact duplicate rows, then persist the result.
        frame = frame.dropna(subset=['subject_id']).drop_duplicates()
        frame.to_parquet(
            f'cleanedparquets/{table_name}.parquet',
            index=False,
            engine='pyarrow',
            compression='zstd',
        )

    files[table_name] = frame

EXTRAÇÃO DE DADOS IMPORTANTES PARA A BUSCA


In [2]:
import Levenshtein
# Suppose this is the search term typed by the user.
search = 'aspirose'

# The goal is to list every organism the drug was tested against. The query may
# be misspelled, so first find the antibiotic name that most likely was meant.
all_drugs = files['microbiology_events']['ab_name'].dropna().unique()

# closest_drug is a (name, edit_distance) tuple, or None when there are no
# antibiotic names to compare against. The comparison is case-insensitive so
# that letter casing alone never counts as an edit; on ties the first name wins.
search_folded = search.casefold()
closest_drug = min(
    ((drug, Levenshtein.distance(search_folded, drug.casefold())) for drug in all_drugs),
    key=lambda candidate: candidate[1],
    default=None,
)

# With the most likely antibiotic identified, the resistance results can be listed.



Coletando os dados de resistência


In [63]:
import plotly.express as px

# TODO: sort the organisms; bars currently appear in order of first occurrence.

# Restrict to the selected antibiotic once, instead of re-filtering the whole
# table for every organism.
micro_events = files['microbiology_events']
drug_events = micro_events[micro_events['ab_name'] == closest_drug[0]]

# Organisms tested against the selected antibiotic. Rows with a missing organism
# are skipped: they can never match an equality filter and would only add an
# all-zero bar to the chart.
diseases = drug_events['org_name'].dropna().unique()

# dis_res_count maps organism -> {interpretation: count}; 'R', 'I' and 'S' are
# always present (0 when absent) and any other interpretation value is kept too.
dis_res_count = {}
for disease, interpretations in drug_events.groupby('org_name', sort=False)['interpretation']:
    counts = {'R': 0, 'I': 0, 'S': 0}
    counts.update(interpretations.value_counts().to_dict())
    dis_res_count[disease] = counts

# One bar per organism showing how many results were interpreted as 'R'.
plot = px.bar(x=list(dis_res_count.keys()), y=[v['R'] for v in dis_res_count.values()])
plot.update_layout(
    title=f"Resultados 'R' para {closest_drug[0]} por organismo",
    xaxis_title='Organismo',
    yaxis_title="Quantidade de resultados 'R'",
)
plot.show()





MODELO PREDITIVO (DESCARTADO)


In [None]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook_connected'  # loads plotly.js from a CDN (needs internet); use 'notebook' for offline support

# Sunburst of the events, nested as specimen type -> organism -> antibiotic.
# NOTE(review): all_events is not defined in any cell visible here — presumably it
# is the microbiology events table; confirm and define it before running this cell.
plot = px.sunburst(all_events, path=['spec_type_desc', 'org_name', 'ab_name'], title='Eventos de microbiologia')
plot.show()

SUGESTÃO


In [66]:
def drug_suggestion(org_name, resistance):
    """Return the suggestion text for one susceptibility result.

    Args:
        org_name: Organism name. Accepted for the caller's convenience but not
            used to choose the text.
        resistance: Interpretation code of the result; only equality with 'R'
            is checked.

    Returns:
        The "alternative antibiotic" message when ``resistance`` equals 'R',
        otherwise the "standard antibiotic" message.
    """
    is_resistant = resistance == 'R'
    return "Sugestão de antibiótico alternativo" if is_resistant else "Sugestão de antibiótico padrão"

# NOTE(review): merged_data is not defined in the cells visible here — confirm it
# is created earlier and has 'org_name' and 'interpretation' columns.
# Walk the two needed columns directly rather than using a row-wise apply: no
# per-row Series is built, and an empty frame simply yields an empty column.
merged_data['sugestao'] = [
    drug_suggestion(org_name, interpretation)
    for org_name, interpretation in zip(merged_data['org_name'], merged_data['interpretation'])
]


VALIDAÇÃO DA FERRAMENTA


In [None]:
from sklearn.metrics import accuracy_score

# Evaluate the tool on a validation dataset.
# NOTE(review): merged_data1 is not defined in the cells visible here (the
# 'sugestao' column is written to merged_data) — confirm this is the intended frame.
# NOTE(review): drug_suggestion returns one of two fixed sentences, so comparing
# 'sugestao' with 'antibiotico_real' presumably only makes sense if that column
# holds the same sentences — confirm before trusting the score.
resultados_reais = merged_data1['antibiotico_real']
sugestoes = merged_data1['sugestao']

# Score the suggestions against the real values and report the result.
acuracia = accuracy_score(resultados_reais, sugestoes)
print(f"Acurácia da ferramenta: {acuracia}")
