In [1]:
#To remove all variables from the namespace
%reset -f

#Creating a log file to record the commands and outputs
%logstop
%logstart -t -o "E:/Python Clinical Course/ADMH log.txt"

Logging hadn't been started.
Activating auto-logging. Current session state plus future input saved.
Filename       : E:/Python Clinical Course/ADMH log.txt
Mode           : backup
Output logging : True
Raw input log  : False
Timestamping   : True
State          : active


In [2]:
import pandas as pd
import numpy as np
import os
import pyreadstat
from datetime import datetime
from pathlib import Path

# Define paths (adjust these to your actual paths)
adam_path = r"E:\Python Clinical Course\ADAM datasets\ADaM Datasets"
sdtm_path = r"E:\Python Clinical Course\SDTM"
raw_path = r"E:\Python Clinical Course\RAW"


In [4]:
sdtm_datasets = {}

for file in os.listdir(sdtm_path):
    if file.endswith(".sas7bdat"):
        dataset_name = file.replace(".sas7bdat", "")
        file_path = os.path.join(sdtm_path, file)
        df, meta = pyreadstat.read_sas7bdat(file_path)
        sdtm_datasets[dataset_name] = df

In [5]:
# 1) Read the main SDTM dataset
mh1 = sdtm_datasets.get("mh")
suppmh = sdtm_datasets.get("suppmh")
mh1 = mh1.drop(columns=['ARM', 'ACTARM'], errors='ignore')
mh1 = mh1.sort_values(['USUBJID', 'MHSEQ']).reset_index(drop=True)

In [6]:

    
# Convert IDVARVAL to numeric for MHSEQ
suppmh['MHSEQ'] = pd.to_numeric(suppmh['IDVARVAL'], errors='coerce')
    
# Sort by USUBJID and MHSEQ
suppmh = suppmh.sort_values(['USUBJID', 'MHSEQ']).reset_index(drop=True)
    
# Transpose SUPPMH data
suppmh_trans = suppmh.pivot_table(
        index=['USUBJID', 'MHSEQ'], 
        columns='QNAM', 
        values='QVAL', 
        aggfunc='first'
    ).reset_index()


In [7]:

# 3) Merge MH with transposed SUPPMH
mh2 = pd.merge(mh1, suppmh_trans, on=['USUBJID', 'MHSEQ'], how='left')


In [8]:
adam_datasets = {}

for file in os.listdir(adam_path):
    if file.endswith(".sas7bdat"):
        dataset_name = file.replace(".sas7bdat", "")
        file_path = os.path.join(adam_path, file)
        df, meta = pyreadstat.read_sas7bdat(file_path)
        adam_datasets[dataset_name] = df

In [9]:
# 4) Merge with ADSL
adsl = adam_datasets.get("adsl")
mh3 = pd.merge(mh2, adsl, on='USUBJID', how='inner')


In [10]:
# Create MHPRIOR variable based on MHENRF
def create_mhprior(mhenrf):
    if pd.isna(mhenrf):
        return ''
    elif mhenrf == 'BEFORE':
        return 'Past'
    elif mhenrf == 'ONGOING':
        return 'Current'
    else:
        return ''

mh3['MHPRIOR'] = mh3['MHENRF'].apply(create_mhprior)


In [11]:

# 6) Select and keep specific columns
columns_to_keep = [
    'STUDYID',
    'USUBJID', 
    'SUBJID',
    'SITEID',
    'AGE',
    'AGEU',
    'SEX',
    'RACE',
    'ETHNIC',
    'COUNTRY',
    'SAFFL',
    'TRT01P',
    'TRT01PN',
    'TRT01A',
    'TRT01AN',
    'TRTSDT',
    'TRTEDT',
    'MHSEQ',
    'MHCAT',
    'MHREL',
    'MHPRIOR',
    'MHTERM',
    'MHDECOD',
    'MHBODSYS',
    'MHSTDTC',
    'MHSTDY',
    'MHENDTC',
    'MHENDY',
    'MHENRF',
    'MHONGO'
]


In [12]:

# Keep only columns that exist in the dataset
existing_columns = [col for col in columns_to_keep if col in mh3.columns]


In [13]:

mh4 = mh3[existing_columns].copy()

# Column labels dictionary for documentation
column_labels = {
    'STUDYID': 'Study Identifier',
    'USUBJID': 'Unique Subject Identifier',
    'SUBJID': 'Subject Identifier for the Study',
    'SITEID': 'Study Site Identifier',
    'AGE': 'Age',
    'AGEU': 'Age Units',
    'SEX': 'Sex',
    'RACE': 'Race',
    'ETHNIC': 'Ethnicity',
    'COUNTRY': 'Country',
    'SAFFL': 'Safety Population Flag',
    'TRT01P': 'Planned Treatment for Period 01',
    'TRT01PN': 'Planned Treatment for Period 01 (N)',
    'TRT01A': 'Actual Treatment for Period 01',
    'TRT01AN': 'Actual Treatment for Period 01 (N)',
    'TRTSDT': 'Date of First Exposure to Treatment',
    'TRTEDT': 'Date of Last Exposure to Treatment',
    'MHSEQ': 'Sequence Number',
    'MHCAT': 'Category for Medical History',
    'MHREL': 'Is the condition related to COVID-19?',
    'MHPRIOR': 'Past/Current Event',
    'MHTERM': 'Reported Term for the Medical History',
    'MHDECOD': 'Dictionary-Derived Term',
    'MHBODSYS': 'Body System or Organ Class',
    'MHSTDTC': 'Start Date/Time of Medical History Event',
    'MHSTDY': 'Study Day of Start of Observation',
    'MHENDTC': 'End Date/Time of Medical History Event',
    'MHENDY': 'Study Day of End of Observation',
    'MHENRF': 'End Relative to Reference Period',
    'MHONGO': 'Ongoing?'
}

# Create the final ADMH dataset
admh = mh4.copy()


In [14]:

# Add dataset label as an attribute (for documentation)
admh.attrs['label'] = 'Medical History Analysis Dataset'
admh.attrs['column_labels'] = column_labels


In [15]:
for col in admh.columns:
    label = admh.attrs.get('column_labels', {}).get(col, '')
    print(f"{col}: {label}")

USUBJID: Unique Subject Identifier
SUBJID: Subject Identifier for the Study
SITEID: Study Site Identifier
AGE: Age
AGEU: Age Units
SEX: Sex
RACE: Race
ETHNIC: Ethnicity
COUNTRY: Country
SAFFL: Safety Population Flag
TRT01P: Planned Treatment for Period 01
TRT01PN: Planned Treatment for Period 01 (N)
TRT01A: Actual Treatment for Period 01
TRT01AN: Actual Treatment for Period 01 (N)
TRTSDT: Date of First Exposure to Treatment
TRTEDT: Date of Last Exposure to Treatment
MHSEQ: Sequence Number
MHCAT: Category for Medical History
MHREL: Is the condition related to COVID-19?
MHPRIOR: Past/Current Event
MHTERM: Reported Term for the Medical History
MHDECOD: Dictionary-Derived Term
MHBODSYS: Body System or Organ Class
MHSTDTC: Start Date/Time of Medical History Event
MHSTDY: Study Day of Start of Observation
MHENDTC: End Date/Time of Medical History Event
MHENDY: Study Day of End of Observation
MHENRF: End Relative to Reference Period
MHONGO: Ongoing?


In [16]:

#  Save the final dataset
output = "E:/Python Clinical Course/ADAM datasets"
output_path = f"{output}/ADMH.csv"
admh.to_csv(output_path, index=False)

print(f"ADMH dataset created successfully with {len(admh)} subjects and {len(admh.columns)} variables.")
print(f"Dataset saved to: {output_path}")

# Display basic info about the dataset
print("\nDataset Info:")
print(f"Shape: {admh.shape}")
print(f"Columns: {list(admh.columns)}")


ADMH dataset created successfully with 121 subjects and 29 variables.
Dataset saved to: E:/Python Clinical Course/ADAM datasets/ADMH.csv

Dataset Info:
Shape: (121, 29)
Columns: ['USUBJID', 'SUBJID', 'SITEID', 'AGE', 'AGEU', 'SEX', 'RACE', 'ETHNIC', 'COUNTRY', 'SAFFL', 'TRT01P', 'TRT01PN', 'TRT01A', 'TRT01AN', 'TRTSDT', 'TRTEDT', 'MHSEQ', 'MHCAT', 'MHREL', 'MHPRIOR', 'MHTERM', 'MHDECOD', 'MHBODSYS', 'MHSTDTC', 'MHSTDY', 'MHENDTC', 'MHENDY', 'MHENRF', 'MHONGO']
