# explore_clinicaldata

# What is a "concept ID"?

In the AIREADI dataset, OMOP concept IDs are assigned to various variables such as HbA1c, visual acuity, and answers to questionnaires for standardization purposes. For more details, see the OMOP Clinical Data Structure documentation: https://docs.aireadi.org/docs/1/dataset/clinical-data/OMOP-Clinical-Data-Structure/. Finding concept IDs is essential when searching for specific values.

To explore which values are included, you can refer to the OMOP Mapping Table for clinical data (https://docs.aireadi.org/v1-omopTable) or the Data Domain Table for clinical lab tests (https://docs.aireadi.org/v1-dataDomainTable).

# How to find concept IDs?

To identify concept IDs, you can use files such as condition_occurrence.csv, measurement.csv, and observation.csv in the clinical_data folder of the dataset.

The following examples show how to find and use concept IDs to filter the data.

In [None]:
import os
import stat

from tqdm import tqdm

import time
import pandas as pd

In [None]:
# Root folder of the downloaded dataset release; the files loaded by this notebook are located relative to it.
# Change it to match your own file structure. If your path contains spaces, a raw string works, e.g.:
# data_root =  r'path/to/your/aireadi release/'
data_root = "/Volumes/data/datasets/AIREADI/YEAR2/" # change to your path

In [None]:
# Load clinical data from TSV and CSV files.
#
# Every path is built with os.path.join so that data_root works with or without a trailing separator.
participants_df = pd.read_csv(os.path.join(data_root, "participants.tsv"), sep="\t")

# Reading measurement.csv with the default settings emits
# "DtypeWarning: Columns (24) have mixed types. Specify dtype option on import or set low_memory=False.",
# so low_memory=False is passed to avoid the warning.
measurement_df = pd.read_csv(os.path.join(data_root, "clinical_data", "measurement.csv"), low_memory=False)
condition_occurrence_df = pd.read_csv(os.path.join(data_root, "clinical_data", "condition_occurrence.csv"))
observation_df = pd.read_csv(os.path.join(data_root, "clinical_data", "observation.csv"))


In [None]:
# Name of the participant identifier column used by the rest of this notebook.
# Year-2 data calls this column "participant_id"; year-3 data calls it "person_id" to make table joins easier.
# The participants table is therefore relabelled so that the cells below can always use "person_id".
id_field = "person_id"
participants_df = participants_df.rename({'participant_id': id_field}, axis="columns")

print(f"In this notebook, the id_field is: {id_field}")

In [None]:
# Find concept IDs in measurement.csv
#
# Prints one "'<source value>': <concept id>," line per distinct measurement_source_value, in sorted order,
# using the concept ID of the first row in which that source value appears.
# Missing source values are dropped first: they cannot be sorted together with strings and have no label to print.
measurement_unique_values = measurement_df['measurement_source_value'].dropna().unique()
measurement_sorted_list = sorted(measurement_unique_values)

# Build the source value -> concept ID lookup once instead of re-scanning the whole table for every value.
measurement_first_concept = (
    measurement_df.dropna(subset=['measurement_source_value'])
    .drop_duplicates(subset='measurement_source_value', keep='first')
    .set_index('measurement_source_value')['measurement_concept_id']
)
for value in measurement_sorted_list:
    concept_id = measurement_first_concept.loc[value]
    print(f"'{value}': {concept_id},")

In [None]:
# Find concept IDs in condition_occurrence.csv and observation.csv
#
# Observation rows whose qualifier_concept_id also occurs as a condition_concept_id go to
# matching_observations; all other observation rows go to observation_remaining.
# If condition_occurrence.csv has no condition_concept_id column, nothing can match, so every observation
# row ends up in observation_remaining (rather than leaving both names undefined for the cells below).
if "condition_concept_id" in condition_occurrence_df.columns:
    condition_concept_ids = condition_occurrence_df["condition_concept_id"]
else:
    condition_concept_ids = []

# One boolean mask drives both halves of the split, so the two frames always partition observation_df.
used_rows = observation_df["qualifier_concept_id"].isin(condition_concept_ids)
matching_observations = observation_df[used_rows]
observation_remaining = observation_df[~used_rows]


In [None]:
# Prints one "'<source value>': <concept id>," line per distinct observation_source_value found in
# matching_observations, in sorted order, using the qualifier_concept_id of the first row with that value.
# Missing source values are dropped first: they cannot be sorted together with strings and have no label to print.
condition_unique_values = matching_observations['observation_source_value'].dropna().unique()
condition_sorted_list = sorted(condition_unique_values)

# Build the source value -> concept ID lookup once instead of re-scanning the whole table for every value.
condition_first_concept = (
    matching_observations.dropna(subset=['observation_source_value'])
    .drop_duplicates(subset='observation_source_value', keep='first')
    .set_index('observation_source_value')['qualifier_concept_id']
)
for value in condition_sorted_list:
    concept_id = condition_first_concept.loc[value]
    print(f"'{value}': {concept_id},")

In [None]:
# Prints one "'<source value>': <concept id>," line per distinct observation_source_value found in
# observation_remaining, in sorted order, using the observation_concept_id of the first row with that value.
# Missing source values are dropped first: they cannot be sorted together with strings and have no label to print.
observation_unique_values = observation_remaining['observation_source_value'].dropna().unique()
observation_sorted_list = sorted(observation_unique_values)

# Build the source value -> concept ID lookup once instead of re-scanning the whole table for every value.
observation_first_concept = (
    observation_remaining.dropna(subset=['observation_source_value'])
    .drop_duplicates(subset='observation_source_value', keep='first')
    .set_index('observation_source_value')['observation_concept_id']
)
for value in observation_sorted_list:
    concept_id = observation_first_concept.loc[value]
    print(f"'{value}': {concept_id},")

# How to make CSV files for analysis

### Example of making a csv file by using variables listed in measurement.csv

In [None]:
# concept_ids maps a human-readable variable label to its concept ID.
# The IDs listed here are matched against the "measurement_concept_id" column,
# and each label becomes the name of the value column in the table built below.
concept_ids = {
    'bmi_vsorres, BMI': 4245997,
    'viaodplog, VA Letter Score - Photopic VA - OD': 2005200042,
    'viaosplog, VA Letter Score - Photopic VA - OS': 2005200043,
}

def parse_measurement(df, concept_id):
    """Return the rows of *df* whose ``measurement_concept_id`` equals *concept_id*.

    All columns and the original index labels are kept; the result is empty
    when no row matches.
    """
    is_requested_concept = df['measurement_concept_id'].eq(concept_id)
    return df.loc[is_requested_concept]


# Start from the identifier and demographic columns of participants_df, then left-join one value column
# per entry of concept_ids, so rows without a value for a concept are kept (with NaN in that column).
# NOTE(review): if a participant has several rows for one concept, the left join repeats that participant
# once per row -- confirm whether that is intended for your analysis.
final_df = participants_df[[id_field, 'age', 'study_group', 'clinical_site']].copy()

for column_label, wanted_concept in concept_ids.items():
    concept_rows = parse_measurement(measurement_df, wanted_concept)
    # 'participant_id' is the year-2 name of the id column; renaming it is a no-op when it is absent.
    concept_rows = concept_rows.rename(
        columns={'value_as_number': column_label, 'participant_id': id_field}
    )
    final_df = final_df.merge(concept_rows[[id_field, column_label]], on=id_field, how='left')

final_df

### Example of making a csv file by using variables listed in observation.csv (and condition_occurrence.csv)
The variables in condition_occurrence.csv overlap with those in observation.csv.

In [None]:
# concept_ids maps a human-readable variable label to its concept ID.
# The IDs listed here are matched against the "qualifier_concept_id" column,
# and each label becomes the name of the value column in the table built below.
concept_ids = {
    'mhoccur_ms, Multiple sclerosis': 374919,
    'mhoccur_oa, Osteoporosis': 80502,
    'mhoccur_obs, Obesity': 433736,
}


def parse_qualifier(df, concept_id):
    """Return the rows of *df* whose ``qualifier_concept_id`` equals *concept_id*.

    All columns and the original index labels are kept; the result is empty
    when no row matches.
    """
    is_requested_concept = df["qualifier_concept_id"].eq(concept_id)
    return df.loc[is_requested_concept]

# Start from the identifier and demographic columns of participants_df, then left-join one value column
# per entry of concept_ids, so rows without a value for a concept are kept (with NaN in that column).
# NOTE(review): if a participant has several rows for one concept, the left join repeats that participant
# once per row -- confirm whether that is intended for your analysis.
final_df = participants_df[[id_field, 'age', 'study_group', 'clinical_site']].copy()

for column_label, wanted_concept in concept_ids.items():
    concept_rows = parse_qualifier(observation_df, wanted_concept)
    # 'participant_id' is the year-2 name of the id column; renaming it is a no-op when it is absent.
    concept_rows = concept_rows.rename(
        columns={'value_as_number': column_label, 'participant_id': id_field}
    )
    final_df = final_df.merge(concept_rows[[id_field, column_label]], on=id_field, how='left')

final_df

In [None]:
# concept_ids maps a human-readable variable label to its concept ID.
# The IDs listed here are matched against the 'observation_concept_id' column,
# and each label becomes the name of the value column in the table built below.
concept_ids = {
    'paidscore, PAID score': 2005200049,
    'years_of_education': 42528764,
}


def parse_observation(df, concept_id):
    """Return the rows of *df* whose ``observation_concept_id`` equals *concept_id*.

    All columns and the original index labels are kept; the result is empty
    when no row matches.
    """
    is_requested_concept = df['observation_concept_id'].eq(concept_id)
    return df.loc[is_requested_concept]

# Start from the identifier and demographic columns of participants_df, then left-join one value column
# per entry of concept_ids, so rows without a value for a concept are kept (with NaN in that column).
# NOTE(review): if a participant has several rows for one concept, the left join repeats that participant
# once per row -- confirm whether that is intended for your analysis.
final_df = participants_df[[id_field, 'age', 'study_group', 'clinical_site']].copy()

for column_label, wanted_concept in concept_ids.items():
    concept_rows = parse_observation(observation_df, wanted_concept)
    # 'participant_id' is the year-2 name of the id column; renaming it is a no-op when it is absent.
    concept_rows = concept_rows.rename(
        columns={'value_as_number': column_label, 'participant_id': id_field}
    )
    final_df = final_df.merge(concept_rows[[id_field, column_label]], on=id_field, how='left')

final_df


In [None]:
# Final cell: prints a marker so that a top-to-bottom run visibly reaches the end of the notebook.
print('done')