In [None]:
# Install the third-party packages this notebook imports below.
# The %pip magic installs into the environment of the running kernel.
%pip install boto3
%pip install pandas
%pip install awscli

import os
import awscli
import boto3
import time
import pandas as pd
import csv

%env region=us-east-1

#Create an S3 client for Python
s3_client = boto3.client(
    "s3",
    region_name=os.environ['region'],
)

#Create an Athena client for Python
athena = boto3.client(
    'athena', 
    region_name=os.environ['region'],
)

In [None]:
# Run an Athena query that selects 10 discharge summaries from MIMIC-III.
# The result rows are not fetched here: Athena writes them as a CSV object
# under `athena_output_bucket`, named after the query execution id.
athena_output_bucket = 's3://anelpere-duke-datathon-2024/team2/athena-output/'

# Seconds to wait between status checks while the query is running.
ATHENA_POLL_SECONDS = 1

response = athena.start_query_execution(
    QueryString="""
    SELECT category, text 
    FROM "mimiciii"."noteevents"
    WHERE category = 'Discharge summary'
    LIMIT 10
    """,
    QueryExecutionContext={
        'Database': 'mimiciii'
    },
    ResultConfiguration={
        'OutputLocation': athena_output_bucket
    }
)

# Block until the query reaches a terminal state. Polling the execution status
# avoids fetching results just to detect completion and avoids matching on the
# wording of an error message.
while True:
    query_status = athena.get_query_execution(
        QueryExecutionId=response["QueryExecutionId"]
    )["QueryExecution"]["Status"]
    query_state = query_status["State"]

    if query_state == "SUCCEEDED":
        break
    if query_state in ("FAILED", "CANCELLED"):
        # Surface Athena's own explanation instead of failing later on a
        # missing result file.
        raise RuntimeError(
            f"Athena query {response['QueryExecutionId']} ended in state "
            f"{query_state}: {query_status.get('StateChangeReason', 'no reason given')}"
        )

    # Still QUEUED or RUNNING: wait before asking again.
    time.sleep(ATHENA_POLL_SECONDS)

In [None]:
# Fetch the CSV that Athena produced for the query and load it into pandas.
S3_BUCKET_NAME = "anelpere-duke-datathon-2024"
S3_OUTPUT_DIRECTORY = "team2/athena-output"
temp_file_location: str = "athena_10_text_results.csv"

# The result object key is "<output directory>/<query execution id>.csv".
s3_client.download_file(
    Bucket=S3_BUCKET_NAME,
    Key=f"{S3_OUTPUT_DIRECTORY}/{response['QueryExecutionId']}.csv",
    Filename=temp_file_location,
)

df = pd.read_csv(temp_file_location)

In [None]:
# Format the Athena output for analysis by Comprehend Medical:
#   1. drop the `category` column so only the note text remains,
#   2. write each note to its own file output/output<N>.csv (N starts at 1),
#      each file consisting of the header row followed by one data row.
upload_bucket = "s3://anelpere-duke-datathon-2024/team2/formatted/"

df = pd.read_csv('athena_10_text_results.csv')
df = df.drop('category', axis=1)
df.to_csv('athena_10_text_formatted.csv', index=False)

# The target directory does not exist on a fresh checkout / fresh kernel.
os.makedirs('output', exist_ok=True)

# Split the rows into separate files.
# newline='' is what the csv module requires so that line breaks embedded in
# the quoted note text are passed through untouched; utf-8 matches the
# encoding pandas used when writing the formatted file.
with open('athena_10_text_formatted.csv', newline='', encoding='utf-8') as infile:
    reader = csv.reader(infile)
    header = next(reader)

    # A single loop numbers the files from 1 and simply writes nothing when
    # the query returned no rows.
    for file_number, row in enumerate(reader, start=1):
        with open(f'output/output{file_number}.csv', 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(header)
            writer.writerow(row)


In [None]:
# Upload the per-note files to S3 and make sure no Jupyter checkpoint files
# end up under the prefix that Comprehend Medical will read.
upload_bucket = "s3://anelpere-duke-datathon-2024/team2/formatted/"
upload_bucket_prefix = "team2/formatted/.ipynb_checkpoints/"
local_dir = "output/"

# Normalise a bare 'UTF-8' LC_CTYPE before the AWS CLI driver is created.
# NOTE(review): presumably works around a locale-handling failure in awscli
# on some platforms -- confirm before removing.
if os.environ.get('LC_CTYPE', '') == 'UTF-8':
    os.environ['LC_CTYPE'] = 'en_US.UTF-8'

from awscli.clidriver import create_clidriver
driver = create_clidriver()

# Equivalent to: aws s3 sync output/ <upload_bucket> --exclude ".ipynb_checkpoints/*"
# The arguments are passed as a list (no shell involved), and checkpoint
# files are excluded so they are not uploaded in the first place.
sync_exit_code = driver.main(
    ['s3', 'sync', local_dir, upload_bucket, '--exclude', '.ipynb_checkpoints/*']
)
if sync_exit_code:
    # A non-zero exit code means the sync did not fully succeed; stop here
    # rather than start a job on an incomplete input set.
    raise RuntimeError(f"aws s3 sync exited with code {sync_exit_code}")

# Delete checkpoint files left in the bucket by earlier uploads.
# Listing is paginated, and delete_objects is only called for non-empty
# pages because S3 rejects a delete request with an empty object list.
keys = []
checkpoint_pages = s3_client.get_paginator('list_objects_v2').paginate(
    Bucket=S3_BUCKET_NAME, Prefix=upload_bucket_prefix
)
for page in checkpoint_pages:
    page_keys = [obj['Key'] for obj in page.get('Contents', [])]
    if page_keys:
        s3_client.delete_objects(
            Bucket=S3_BUCKET_NAME,
            Delete={'Objects': [{'Key': key} for key in page_keys]},
        )
        keys.extend(page_keys)

In [None]:
# Start a Comprehend Medical ICD-10-CM inference job over the uploaded notes.
# The client is pinned to the same region as the S3 and Athena clients so the
# notebook does not depend on whatever default region the environment has.
comprehendmedical = boto3.client(
    'comprehendmedical',
    region_name=os.environ['region'],
)

# Input is read from the 'team2/formatted' prefix and results are written
# under 'team2/output', both in S3_BUCKET_NAME. `job` holds the API response
# for the submitted job.
job = comprehendmedical.start_icd10_cm_inference_job(
    InputDataConfig={
        'S3Bucket': S3_BUCKET_NAME,
        'S3Key': 'team2/formatted'
    },
    OutputDataConfig={
        'S3Bucket': S3_BUCKET_NAME,
        'S3Key': 'team2/output'
    },
    # IAM role used for the job's data access (passed as DataAccessRoleArn).
    DataAccessRoleArn='arn:aws:iam::610912512102:role/comprehend-medical-role',
    JobName='Mimic-test-job',
    LanguageCode='en'
)