INGESTÃO DE DADOS

In [11]:
import pandas as pd
import os


# Map each short table name to the location of its raw CSV file.
# The pair order below is the dict's iteration order, which the loading
# loop follows, so it mirrors the original listing exactly.
_RAW_CSV_STEMS = (
    ('admissions', 'ADMISSIONS'),
    ('microbiology_events', 'MICROBIOLOGYEVENTS'),
    ('prescriptions', 'PRESCRIPTIONS'),
    ('labevents', 'LABEVENTS'),
    ('diagnoses_icd', 'DIAGNOSES_ICD'),
    ('procedures_icd', 'PROCEDURES_ICD'),
    ('patients', 'PATIENTS'),
    ('icustays', 'ICUSTAYS'),
)
file_paths = {table: f'rawcsvs/{stem}.csv' for table, stem in _RAW_CSV_STEMS}

# Columns to load from each CSV, keyed by the same table names as
# ``file_paths``. Each list is handed to ``pd.read_csv(usecols=...)``.
cols_to_use = dict(
    admissions=['subject_id', 'admittime', 'dischtime', 'ethnicity', 'admission_type'],
    microbiology_events=[
        'row_id', 'subject_id', 'chartdate', 'charttime',
        'spec_itemid', 'spec_type_desc',
        'org_itemid', 'org_name', 'isolate_num',
        'ab_itemid', 'ab_name',
        'dilution_text', 'dilution_comparison', 'dilution_value',
        'interpretation',
    ],
    prescriptions=['subject_id', 'hadm_id', 'drug', 'enddate'],
    labevents=['subject_id', 'itemid', 'valuenum', 'valueuom', 'flag'],
    diagnoses_icd=['subject_id', 'icd9_code', 'seq_num'],
    procedures_icd=['subject_id', 'icd9_code', 'seq_num'],
    patients=['subject_id', 'gender', 'dob', 'dod'],
    icustays=['subject_id', 'los', 'first_careunit', 'last_careunit'],
)

# Loaded tables, keyed by table name; filled by the loading loop below.
files = dict()

def both_na(row):
    """Return True when a row has neither a subject nor an admission identifier.

    Parameters
    ----------
    row : mapping-like (e.g. ``pandas.Series`` or ``dict``)
        One record. ``subject_id`` must be present; ``hadm_id`` is optional.

    Returns
    -------
    bool
        True only if ``subject_id`` is NA and ``hadm_id`` is NA. An absent
        ``hadm_id`` key counts as NA.

    Raises
    ------
    KeyError
        If ``row`` has no ``subject_id`` key.
    """
    # .get() rather than [] for hadm_id: most tables are loaded without that
    # column, and indexing would raise KeyError as soon as subject_id is NA
    # (the `and` only short-circuits when subject_id is present).
    return bool(pd.isna(row['subject_id']) and pd.isna(row.get('hadm_id')))
# Load each table, keeping only the columns listed for it in cols_to_use.
for table_name, csv_path in file_paths.items():
    files[table_name] = pd.read_csv(csv_path, usecols=cols_to_use[table_name])

# pyarrow does not create missing parent directories, so make sure the
# output folder exists before the first write.
os.makedirs('cleanedparquets', exist_ok=True)

# list(...) snapshots the items so entries can be reassigned while looping.
for table_name, table in list(files.items()):
    # Vectorised form of the row-wise both_na check: a row is dropped when it
    # has no subject_id and no hadm_id. Tables loaded without an hadm_id
    # column are filtered on subject_id alone instead of raising KeyError.
    missing_ids = table['subject_id'].isna()
    if 'hadm_id' in table.columns:
        missing_ids &= table['hadm_id'].isna()

    # Build a new frame instead of mutating in place, then store it back so
    # `files` holds the cleaned tables for later cells.
    cleaned = table.loc[~missing_ids].drop_duplicates()
    files[table_name] = cleaned

    cleaned.to_parquet(f'cleanedparquets/{table_name}.parquet', index=False, engine='pyarrow', compression='zstd')

In [None]:
import os
# List the column names of every raw CSV.
# The mapping defined in this notebook is `file_paths`; the previous name
# `paths` does not exist and raised NameError on a fresh kernel.
for file_name, path in file_paths.items():
    # nrows=0 parses only the header line, so large tables are not loaded
    # in full just to inspect their columns.
    header = pd.read_csv(path, nrows=0)
    print(f"Columns in {file_name}: {header.columns.tolist()}")

In [10]:
import dask.dataframe as dd

# Inputs: the previously merged tables and the cleaned lab events.
prevmerge = dd.read_parquet('merges/admin-micro-diag-icu.parquet')
newmerge = dd.read_parquet('cleanedparquets/labevents.parquet')

# Index both frames on the join key. sorted=True tells Dask the rows are
# already ordered by subject_id so it can derive divisions without a shuffle.
# NOTE(review): that ordering is assumed, not checked here — confirm it, or
# drop sorted=True and let Dask sort the data.
prevmerge = prevmerge.set_index('subject_id', sorted=True)
newmerge = newmerge.set_index('subject_id', sorted=True)

# Let Dask perform the join across the whole index.
# The earlier manual loop paired partition i of one frame with partition i of
# the other, and row-slice i with row-slice i. Neither pairing is aligned on
# subject_id, so matching rows that fell in different partitions/slices were
# silently dropped. It also wrote every chunk to the same output path.
# No persist(): materialising both tables in memory is unnecessary because
# the join is evaluated lazily, partition by partition, during the write.
merged = dd.merge(prevmerge, newmerge, how='inner', left_index=True, right_index=True)

# Written as a Parquet dataset directory with one part file per partition.
merged.to_parquet('merges/admin-micro-diag-icu-lab.parquet', engine='pyarrow', compression='zstd')


KeyboardInterrupt: 

 16.135.738

In [None]:
import dask.dataframe as dd

# Per-column non-null counts for the first part file of the merged dataset.
# Forward slashes work on every OS; the previous backslashes formed invalid
# string escapes (e.g. "\p") and only resolved as a path on Windows.
dd.read_parquet('merges/part_0_admin-micro-diag-icu-lab-patients.parquet/part.0.parquet').count().compute()

TREINO DO MODELO PREDITIVO

In [None]:
import tensorflow as tf
import tensorflow_recommenders as trfs

SUGESTÃO

In [66]:
def drug_suggestion(org_name, resistance):
    """Pick the suggestion text for one susceptibility result.

    Parameters
    ----------
    org_name
        Organism name. Accepted but not consulted by the current logic.
    resistance
        Interpretation code. Only the exact value ``'R'`` selects the
        alternative-antibiotic text; anything else selects the standard one.

    Returns
    -------
    str
        One of the two fixed suggestion messages.
    """
    is_resistant = resistance == 'R'
    return "Sugestão de antibiótico alternativo" if is_resistant else "Sugestão de antibiótico padrão"

def _suggest_for_row(row):
    """Call ``drug_suggestion`` with one row's ``org_name`` and ``interpretation``."""
    return drug_suggestion(row['org_name'], row['interpretation'])

# NOTE(review): `merged_data` is not created in any cell visible here —
# confirm it is loaded before this cell runs on a fresh kernel.
merged_data['sugestao'] = merged_data.apply(_suggest_for_row, axis=1)


VALIDAÇÃO DA FERRAMENTA

In [None]:
from sklearn.metrics import accuracy_score

# Score the tool on a validation set: compare the 'sugestao' column against
# the 'antibiotico_real' column and report the share of exact matches.
# NOTE(review): `merged_data1` is not defined in any cell visible here —
# confirm where it comes from before relying on this cell.
resultados_reais, sugestoes = merged_data1['antibiotico_real'], merged_data1['sugestao']

acuracia = accuracy_score(y_true=resultados_reais, y_pred=sugestoes)
print(f"Acurácia da ferramenta: {acuracia}")
