In [28]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Explorando os dados

In [20]:
# Recursively search the scenario-01 raw-data folder for CSV files.
# rglob is lazy: nothing is listed until the result is iterated.
datapaths = Path("../data/raw/scenario01").rglob("*.csv")

In [21]:
# Materialize the path iterable into a concrete list so it can be inspected
# (and re-used) and show it as the cell output.
datapaths = [*datapaths]
datapaths

[PosixPath('../data/raw/scenario01/csv/medications.csv')]

In [35]:
def get_dataframe(scenario, dataframe, format = "pandas"):
    """Load one raw CSV table of a scenario as a pandas or dask dataframe.

    Files are searched recursively under ``../data/raw/scenario0{scenario}``
    (relative to the current working directory) with the glob
    ``*{dataframe}.csv``.

    Parameters
    ----------
    scenario : int or str
        Value appended to the ``scenario0`` folder prefix.
    dataframe : str
        Table name, i.e. the CSV file name without the ``.csv`` extension.
    format : {"pandas", "dask"}, default "pandas"
        Backend used to read the file(s).

    Returns
    -------
    pandas.DataFrame or dask.dataframe.DataFrame
        For ``"pandas"``, the single selected file. For ``"dask"``, a lazy
        frame over every matching file.

    Raises
    ------
    ValueError
        If ``format`` is neither ``"pandas"`` nor ``"dask"``.
    FileNotFoundError
        If no CSV file matches the requested table name.
    """
    # Reject a bad backend before touching the filesystem.
    if format not in ("pandas", "dask"):
        raise ValueError(f"format should be one of ['pandas', 'dask'], got {format}")

    root = Path(rf"../data/raw/scenario0{scenario}")
    # Sort the matches: directory traversal order is not guaranteed, and the
    # pandas branch picks a single file from this list.
    matches = sorted(root.rglob(rf"*{dataframe}.csv"))
    if not matches:
        raise FileNotFoundError(
            f"no CSV matching '*{dataframe}.csv' found under {root}"
        )

    if format == "pandas":
        # The glob also matches names that merely end with the table name;
        # prefer the file named exactly "<dataframe>.csv" when it exists.
        exact = [p for p in matches if p.name == f"{dataframe}.csv"]
        return pd.read_csv((exact or matches)[0])

    # dask reads all matching files as one lazy frame; pass plain strings.
    return dd.read_csv([str(p) for p in matches])

In [36]:
# Load every scenario-1 table with the default ("pandas") format, one
# module-level name per CSV. The loads are independent of each other and
# are listed alphabetically so a table is easy to locate.
allergies = get_dataframe(1, "allergies")
careplans = get_dataframe(1, "careplans")
claims = get_dataframe(1, "claims")
claims_transactions = get_dataframe(1, "claims_transactions")
conditions = get_dataframe(1, "conditions")
devices = get_dataframe(1, "devices")
encounters = get_dataframe(1, "encounters")
imaging_studies = get_dataframe(1, "imaging_studies")
immunizations = get_dataframe(1, "immunizations")
medications = get_dataframe(1, "medications")
observations = get_dataframe(1, "observations")
organizations = get_dataframe(1, "organizations")
patients = get_dataframe(1, "patients")
payer_transitions = get_dataframe(1, "payer_transitions")
payers = get_dataframe(1, "payers")
procedures = get_dataframe(1, "procedures")
providers = get_dataframe(1, "providers")
supplies = get_dataframe(1, "supplies")

In [39]:
# List the column labels of the patients table.
patients.columns

Index(['Id', 'BIRTHDATE', 'DEATHDATE', 'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX',
       'FIRST', 'LAST', 'SUFFIX', 'MAIDEN', 'MARITAL', 'RACE', 'ETHNICITY',
       'GENDER', 'BIRTHPLACE', 'ADDRESS', 'CITY', 'STATE', 'COUNTY', 'ZIP',
       'LAT', 'LON', 'HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE'],
      dtype='object')

In [56]:
# Fraction of patient rows whose DEATHDATE is missing
# (mean of the boolean is-NaN mask).
patients["DEATHDATE"].isna().mean()

0.8517887563884157

In [57]:
# Render the patients frame as the cell output.
# NOTE(review): the frame has identifier-like columns (SSN, DRIVERS,
# PASSPORT, names, addresses) - confirm the data is safe to display
# before sharing the rendered notebook.
patients

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,...,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE
0,7b3c738d-3f86-58e3-450e-4018521d192f,2021-08-11,,999-71-7790,,,,Svetlana462,O'Hara248,,...,Boston Massachusetts US,971 Ullrich Grove Suite 53,Boston,Massachusetts,Suffolk County,2135.0,42.304260,-71.045262,2.295091e+04,0.0000
1,c40c2c75-13c9-8e4a-047f-573ae1330157,2020-03-03,,999-49-5505,,,,Rosalyn434,Christiansen251,,...,Lawrence Massachusetts US,485 Senger Route Apt 34,Waltham,Massachusetts,Middlesex County,2451.0,42.340439,-71.206815,5.476973e+04,2624.1225
2,39e76039-522c-61f1-d961-e03dce5f0bb2,1998-11-21,2003-07-17,999-36-5991,,,,Lashawnda573,Daniel959,,...,North Reading Massachusetts US,944 Witting Passage,Everett,Massachusetts,Middlesex County,2149.0,42.412900,-71.009282,1.391483e+05,932.7450
3,ff981e00-4004-44b4-48ba-0bfa62440bb3,2009-07-29,,999-39-5303,,,,Ulysses632,Rice937,,...,Scituate Massachusetts US,414 Rempel Harbor,Cohasset,Massachusetts,Norfolk County,,42.214770,-70.814553,3.393595e+05,90.6000
4,648dbf91-4334-a7e2-2cfe-72e88dbd4c15,2000-12-04,,999-77-8366,S99996514,X32384432X,Mr.,Tory770,Harvey63,,...,Boston Massachusetts US,368 Pouros Ramp Apt 46,Haverhill,Massachusetts,Essex County,1832.0,42.779225,-71.055842,5.432674e+05,2173.6425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1169,c815ffa4-2917-2c09-1569-e90d5a89eeb2,1933-05-23,1948-05-19,999-84-8049,,,,Olen518,Hammes673,,...,Arlington Massachusetts US,541 Lakin Promenade,Northampton,Massachusetts,Hampshire County,1053.0,42.284637,-72.589066,4.060467e+04,7632.8415
1170,b7fda58e-8eb1-a2d0-8fbf-57afefce34c6,1933-05-23,2003-12-13,999-57-9348,S99975005,X56712306X,Mr.,Edward499,Cole117,,...,Montague Massachusetts US,498 Shields Trafficway,Northampton,Massachusetts,Hampshire County,,42.354777,-72.593976,1.164761e+06,163012.4570
1171,1b46a593-3b14-53dd-1d2b-b59866e09a18,1933-05-23,2021-08-23,999-98-9276,S99942794,X86239063X,Mr.,Antoine384,Brekke496,,...,Andover Massachusetts US,1093 McClure Village Apt 71,Northampton,Massachusetts,Hampshire County,1062.0,42.405194,-72.755758,5.698680e+05,151240.1710
1172,5eeeb1e8-73c4-f7fa-cbb6-be359624f38e,1933-05-23,1975-02-25,999-47-8049,S99934661,X24881672X,Mr.,Britt177,Breitenberg711,,...,Hopkinton Massachusetts US,572 Sipes Ramp Apt 94,Northampton,Massachusetts,Hampshire County,1053.0,42.288684,-72.671682,1.087981e+05,128412.6970


In [47]:
# List the column labels of the encounters table.
encounters.columns

Index(['Id', 'START', 'STOP', 'PATIENT', 'ORGANIZATION', 'PROVIDER', 'PAYER',
       'ENCOUNTERCLASS', 'CODE', 'DESCRIPTION', 'BASE_ENCOUNTER_COST',
       'TOTAL_CLAIM_COST', 'PAYER_COVERAGE', 'REASONCODE',
       'REASONDESCRIPTION'],
      dtype='object')

In [49]:
# Render the encounters frame as the cell output.
encounters

Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION
0,38058d5e-b387-3bf4-7e8e-652963121539,2021-08-11T21:01:16Z,2021-08-11T21:16:16Z,7b3c738d-3f86-58e3-450e-4018521d192f,5b73b984-a934-3cb2-b181-be8a4e7d6c49,74ae6031-4392-398b-b2ba-5661afd90b7d,047f6ec3-6215-35eb-9608-f9dda363a44c,wellness,410620009,Well child visit (procedure),136.80,347.38,0.0,,
1,7ce90b77-dcf4-7ef7-6f2e-222b8f8e53af,2021-09-15T21:01:16Z,2021-09-15T21:16:16Z,7b3c738d-3f86-58e3-450e-4018521d192f,5b73b984-a934-3cb2-b181-be8a4e7d6c49,74ae6031-4392-398b-b2ba-5661afd90b7d,047f6ec3-6215-35eb-9608-f9dda363a44c,wellness,410620009,Well child visit (procedure),136.80,272.80,0.0,,
2,3e712884-0b4b-a72a-691b-d81127ed15f7,2021-11-17T21:01:16Z,2021-11-17T21:16:16Z,7b3c738d-3f86-58e3-450e-4018521d192f,5b73b984-a934-3cb2-b181-be8a4e7d6c49,74ae6031-4392-398b-b2ba-5661afd90b7d,047f6ec3-6215-35eb-9608-f9dda363a44c,wellness,410620009,Well child visit (procedure),136.80,1400.05,0.0,,
3,ea214644-3c0a-a8fc-27de-ce4d61fb6707,2022-01-19T21:01:16Z,2022-01-19T21:16:16Z,7b3c738d-3f86-58e3-450e-4018521d192f,5b73b984-a934-3cb2-b181-be8a4e7d6c49,74ae6031-4392-398b-b2ba-5661afd90b7d,047f6ec3-6215-35eb-9608-f9dda363a44c,wellness,410620009,Well child visit (procedure),136.80,1213.88,0.0,,
4,fc30a3b9-b200-6424-206b-9a6bca68c575,2022-04-20T21:01:16Z,2022-04-20T21:16:16Z,7b3c738d-3f86-58e3-450e-4018521d192f,5b73b984-a934-3cb2-b181-be8a4e7d6c49,74ae6031-4392-398b-b2ba-5661afd90b7d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,410620009,Well child visit (procedure),136.80,816.80,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59476,7ae9b7bd-033b-8f49-06ef-9f6ee958541c,2021-10-05T12:03:18Z,2021-10-05T12:18:18Z,87770b59-ee92-450f-57fb-53fc8a07a0ef,49318f80-bd8b-3fc7-a096-ac43088b0c12,235e3ce9-e4ef-324f-8471-6ebf833243cc,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,ambulatory,390906007,Follow-up encounter,85.55,234.72,0.0,55822004.0,Hyperlipidemia
59477,c6a819f2-49e0-8f80-43fa-391952c8acc7,2022-03-01T12:03:18Z,2022-03-01T12:18:18Z,87770b59-ee92-450f-57fb-53fc8a07a0ef,8e9209b3-bf8b-3953-81d8-d51004897c4c,778038d5-ca48-3ae3-ba18-4195e63c13c3,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,162673000,General examination of patient (procedure),136.80,1344.39,0.0,,
59478,5741ca05-0507-9d45-9fe0-85ee82689ab6,2022-03-02T11:03:18Z,2022-03-02T11:18:18Z,87770b59-ee92-450f-57fb-53fc8a07a0ef,49318f80-bd8b-3fc7-a096-ac43088b0c12,235e3ce9-e4ef-324f-8471-6ebf833243cc,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,ambulatory,185345009,Encounter for symptom,85.55,85.55,0.0,444814009.0,Viral sinusitis (disorder)
59479,e2c926ff-a30d-ee81-b6f6-3d7a930a6552,2022-03-15T12:03:18Z,2022-03-15T12:18:18Z,87770b59-ee92-450f-57fb-53fc8a07a0ef,108ccece-277a-396f-8bf2-1527f74458eb,99d36afd-fb09-3bf0-b1d9-030228b53e37,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,urgentcare,702927004,Urgent care clinic (procedure),142.58,142.58,0.0,,


In [42]:
# List the column labels of the conditions table.
conditions.columns

Index(['START', 'STOP', 'PATIENT', 'ENCOUNTER', 'CODE', 'DESCRIPTION'], dtype='object')

In [50]:
# Render the conditions frame as the cell output.
conditions

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
0,2021-12-26,2022-01-07,c40c2c75-13c9-8e4a-047f-573ae1330157,862287d1-a2eb-8116-de42-e76de6163260,195662009,Acute viral pharyngitis (disorder)
1,1999-05-12,,39e76039-522c-61f1-d961-e03dce5f0bb2,00681d7e-e9bb-392b-6ccb-209cf46e2ec3,128613002,Seizure disorder
2,1999-05-12,,39e76039-522c-61f1-d961-e03dce5f0bb2,00681d7e-e9bb-392b-6ccb-209cf46e2ec3,703151001,History of single seizure (situation)
3,2003-03-04,2003-03-17,39e76039-522c-61f1-d961-e03dce5f0bb2,7ac71954-b14b-f259-f032-bc07d5f4eaa6,43878008,Streptococcal sore throat (disorder)
4,2003-07-17,,39e76039-522c-61f1-d961-e03dce5f0bb2,705cabcc-6c94-1804-ee22-0e0b944ed62f,262574004,Bullet wound
...,...,...,...,...,...,...
36391,2020-02-18,2020-03-03,87770b59-ee92-450f-57fb-53fc8a07a0ef,0b742aee-7e0d-afe4-aa31-4b027c2c2330,73595000,Stress (finding)
36392,2020-07-21,2021-02-23,87770b59-ee92-450f-57fb-53fc8a07a0ef,17d8b6ac-b49d-4db4-a6b9-cd38aa6a3a7b,73595000,Stress (finding)
36393,2020-08-13,2020-08-21,87770b59-ee92-450f-57fb-53fc8a07a0ef,8d66c5f9-28c5-01b1-3e4d-eb6c8171fff3,10509002,Acute bronchitis (disorder)
36394,2021-05-26,2021-06-07,87770b59-ee92-450f-57fb-53fc8a07a0ef,7138c553-cbe2-03c7-c02e-bf16e49ab056,10509002,Acute bronchitis (disorder)


In [52]:
# Cast CODE to strings before describing it, so the summary reports
# count / unique / top / freq instead of numeric statistics.
conditions["CODE"].astype(str).describe()

count         36396
unique          222
top       160903007
freq          12486
Name: CODE, dtype: object

In [72]:
# Boolean row masks over the condition descriptions: True where the
# DESCRIPTION text contains the literal substring "finding" / "disorder".
# regex=False: the needle is plain text, not a pattern.
# na=False: a missing DESCRIPTION yields False instead of NaN, so the masks
# stay strictly boolean and can be negated with ~ and used for indexing.
finding_msk = conditions["DESCRIPTION"].str.contains("finding", regex=False, na=False)
disorder_msk = conditions["DESCRIPTION"].str.contains("disorder", regex=False, na=False)

In [88]:
# Keep only the condition rows that are NOT flagged by finding_msk.
condition_subset = conditions.loc[~finding_msk]

# Reverse-cumulative share per description: value_counts() sorts descriptions
# from most to least frequent; the counts are accumulated starting from the
# rarest one and divided by the subset size, then flipped back. Each value is
# the fraction of subset rows carrying that description or a less frequent
# one, so the most frequent description sits at the top.
cumulative_descriptions = (
    condition_subset["DESCRIPTION"]
    .value_counts()
    .iloc[::-1]
    .cumsum()
    .div(len(condition_subset))
    .iloc[::-1]
)


In [91]:
# Display the cumulative_descriptions series as the cell output.
cumulative_descriptions

Viral sinusitis (disorder)                                  1.000000
Acute viral pharyngitis (disorder)                          0.855066
Acute bronchitis (disorder)                                 0.773023
Normal pregnancy                                            0.707530
Anemia (disorder)                                           0.653978
                                                              ...   
Attempted suicide - suffocation                             0.000591
Acute deep venous thrombosis (disorder)                     0.000473
Fracture of the vertebral column with spinal cord injury    0.000355
Primary malignant neoplasm of colon                         0.000236
Injury of kidney (disorder)                                 0.000118
Name: DESCRIPTION, Length: 174, dtype: float64

In [76]:
# Show only the condition rows selected by finding_msk.
conditions[finding_msk]

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
8,2020-08-12,,ff981e00-4004-44b4-48ba-0bfa62440bb3,7151063e-a9ea-734f-76ec-6e64401835d1,162864005,Body mass index 30+ - obesity (finding)
10,2015-01-05,,648dbf91-4334-a7e2-2cfe-72e88dbd4c15,5cc85f4f-af8c-a469-18c4-89afa9413153,162864005,Body mass index 30+ - obesity (finding)
11,2018-01-22,,648dbf91-4334-a7e2-2cfe-72e88dbd4c15,4815d728-8cdd-24f3-acb3-1f7fb94d1356,408512008,Body mass index 40+ - severely obese (finding)
12,2018-01-22,,648dbf91-4334-a7e2-2cfe-72e88dbd4c15,4815d728-8cdd-24f3-acb3-1f7fb94d1356,160968000,Risk activity involvement (finding)
14,2019-01-09,,648dbf91-4334-a7e2-2cfe-72e88dbd4c15,16d338c5-15de-4737-f0f9-575c5650771c,278860009,Chronic low back pain (finding)
...,...,...,...,...,...,...
36387,2015-01-20,2016-01-26,87770b59-ee92-450f-57fb-53fc8a07a0ef,f3935dcb-812a-5dfe-83b7-279f993e1bae,424393004,Reports of violence in the environment (finding)
36388,2017-01-31,2019-02-12,87770b59-ee92-450f-57fb-53fc8a07a0ef,a385fec2-2e64-eaf2-c40f-1ab62217a5f7,73595000,Stress (finding)
36389,2018-02-06,2022-03-01,87770b59-ee92-450f-57fb-53fc8a07a0ef,edb43c1e-2848-dcf8-ca89-fb9b6a90dcd6,423315002,Limited social contact (finding)
36391,2020-02-18,2020-03-03,87770b59-ee92-450f-57fb-53fc8a07a0ef,0b742aee-7e0d-afe4-aa31-4b027c2c2330,73595000,Stress (finding)


# Princípios para engenharia de features

1. uma entidade será representada por uma incidência de condição em um paciente
2. em cada fotografia do paciente dada uma condição, será calculada a diferença entre a ocorrência da condição e a data de óbito do paciente
3. em cada entidade, serão verificadas as condições anteriores e quantas vezes elas ocorreram historicamente e em uma janela predefinida de tempo