In [48]:
# Clear every user-defined name from the namespace without prompting (-f),
# so the cells below do not pick up variables left over from earlier runs.
%reset -f

# Record this session's commands and outputs in a log file.
# %logstop first closes any log that is already active; %logstart then opens
# the log with timestamps (-t) and cell output (-o) enabled.
%logstop
%logstart -t -o "E:/Python Clinical Course/MH log.txt"

Activating auto-logging. Current session state plus future input saved.
Filename       : E:/Python Clinical Course/MH log.txt
Mode           : backup
Output logging : True
Raw input log  : False
Timestamping   : True
State          : active


In [49]:
import pandas as pd
import pyreadstat
import os
import numpy as np

In [50]:
# Input folders: raw (CRF) SAS datasets first, already-built SDTM datasets second.
raw_path, sdtm_path = (
    r"E:\Python Clinical Course\RAW",
    r"E:\Python Clinical Course\SDTM",
)

In [51]:
# Load every SAS dataset found in raw_path into a dict keyed by the file name
# without its extension.
raw_datasets = {}

# sorted() makes the load order (and so the dict order) independent of the
# arbitrary order in which os.listdir returns directory entries.
for file in sorted(os.listdir(raw_path)):
    # splitext removes only the trailing extension; str.replace would also
    # strip ".sas7bdat" from the middle of a file name.
    dataset_name, extension = os.path.splitext(file)
    # Compare case-insensitively so "MH.SAS7BDAT" is not silently skipped.
    if extension.lower() != ".sas7bdat":
        continue
    file_path = os.path.join(raw_path, file)
    # read_sas7bdat returns the data frame plus a metadata object; only the
    # frame is kept.
    df, meta = pyreadstat.read_sas7bdat(file_path)
    raw_datasets[dataset_name] = df

In [52]:
# Load every SAS dataset found in sdtm_path into a dict keyed by the file name
# without its extension.
sdtm_datasets = {}

# sorted() makes the load order (and so the dict order) independent of the
# arbitrary order in which os.listdir returns directory entries.
for file in sorted(os.listdir(sdtm_path)):
    # splitext removes only the trailing extension; str.replace would also
    # strip ".sas7bdat" from the middle of a file name.
    dataset_name, extension = os.path.splitext(file)
    # Compare case-insensitively so "DM.SAS7BDAT" is not silently skipped.
    if extension.lower() != ".sas7bdat":
        continue
    file_path = os.path.join(sdtm_path, file)
    # read_sas7bdat returns the data frame plus a metadata object; only the
    # frame is kept.
    df, meta = pyreadstat.read_sas7bdat(file_path)
    sdtm_datasets[dataset_name] = df

In [None]:
# Pick the two input frames for the MH build: the raw MH dataset and the SDTM
# DM dataset (DM supplies RFSTDTC for the study-day derivation further down).
# A missing dataset is reported here, together with the names that were
# loaded, instead of surfacing later as an AttributeError on None.
# NOTE(review): the DM key is "dm_" (with a trailing underscore) - confirm it
# matches the actual file name in the SDTM folder.
if "mh" not in raw_datasets:
    raise KeyError(
        f"Raw dataset 'mh' not found; loaded raw datasets: {sorted(raw_datasets)}"
    )
if "dm_" not in sdtm_datasets:
    raise KeyError(
        f"SDTM dataset 'dm_' not found; loaded SDTM datasets: {sorted(sdtm_datasets)}"
    )

mh_raw = raw_datasets["mh"]
dm_raw = sdtm_datasets["dm_"]

In [56]:
# Build the MH working frame (counterpart of the SAS step DATA MH1).
# The raw MHTERM column is renamed to MHTERMX; MHTERM itself is re-created
# from it in a later cell.
mh1 = (
    mh_raw.copy()
    .rename(columns={'MHTERM': 'MHTERMX'})
    .assign(
        # Constant identifiers for this study and domain.
        STUDYID='AAA-2022',
        DOMAIN='MH',
        # Site number cast to float and then to int.
        # NOTE(review): the original comment cites RIGHT(SITENUM,3); an int
        # cast drops leading zeros - confirm that is the intended SITEID.
        SITEID=lambda frame: frame['SITENUM'].astype(float).astype(int),
        # SUBNUM as text with its first three characters removed
        # (SAS SUBSTR(SUBNUM,4)).
        SUBJID=lambda frame: frame['SUBNUM'].astype(str).str[3:],
    )
)


In [57]:

# USUBJID is the text form of STUDYID, SITEID and SUBJID joined with "-".
study_txt, site_txt, subj_txt = (
    mh1[part].astype(str) for part in ('STUDYID', 'SITEID', 'SUBJID')
)
mh1['USUBJID'] = study_txt + "-" + site_txt + "-" + subj_txt

# Fixed category, and the reported term restored from the renamed raw column.
mh1['MHCAT'] = 'Medical History'
mh1['MHTERM'] = mh1['MHTERMX']

# MHENRF is 'ONGOING' where the raw MHONGO field holds 'X', otherwise blank.
ongoing_flag = mh1['MHONGO'] == 'X'
mh1['MHENRF'] = np.where(ongoing_flag, 'ONGOING', '')


In [58]:

# Function to process date strings
def process_date(date_col):
    """Turn a raw '-'-separated date with 'UNK' placeholders into an ISO 8601 partial date.

    The value is split on '-'. Components equal to 'UNK' (any case, surrounding
    blanks ignored) are treated as unknown:

    * unknown components at the end are truncated
      ('2020-05-UNK' -> '2020-05', '2020-UNK-UNK' -> '2020');
    * an unknown component that is followed by a known one is written as a
      single '-', so the known components keep their position
      ('2020-UNK-15' -> '2020---15', 'UNK-05-12' -> '--05-12').
      Simply dropping it would give '2020-15', which reads as year-month.

    Empty components in the input are discarded.

    Parameters
    ----------
    date_col : str or missing value
        One raw date value (despite the name, a single cell, not a column).

    Returns
    -------
    str
        The cleaned date string, or '' when the input is missing, empty, or
        has no known component.
    """
    if pd.isna(date_col) or date_col == '':
        return ''

    components = [part for part in str(date_col).split('-') if part != '']

    # None marks an unknown component; known components are kept verbatim.
    known = [None if part.strip().upper() == 'UNK' else part for part in components]

    # Trailing unknowns are truncated rather than represented.
    while known and known[-1] is None:
        known.pop()

    # Remaining unknowns sit before a known component and become a lone '-'.
    return '-'.join('-' if part is None else part for part in known)

# Derive the start and end --DTC variables from the matching raw date columns.
for dtc_col, raw_date_col in (('MHSTDTC', 'MHSTDAT'), ('MHENDTC', 'MHENDAT')):
    mh1[dtc_col] = mh1[raw_date_col].apply(process_date)


In [59]:

# Copy the coded terms into MHDECOD and MHBODSYS, then order the rows by
# USUBJID with a fresh 0..n-1 index.
mh1 = (
    mh1.assign(MHDECOD=mh1['PT_TERM'], MHBODSYS=mh1['SOC_TERM'])
    .sort_values('USUBJID')
    .reset_index(drop=True)
)

# Only the subject key and the reference start date are needed from DM for
# the study-day derivation.
dm1 = dm_raw.loc[:, ['USUBJID', 'RFSTDTC']].copy()


In [60]:

# Date-only version of RFSTDTC; values pandas cannot parse become NaT
# (errors="coerce"). Assumes RFSTDTC holds ISO 8601 text.
reference_dates = pd.to_datetime(dm1["RFSTDTC"], errors="coerce").dt.date
dm1 = dm1.assign(RFSTDTC_N=reference_dates)

# Attach the reference date to each MH record by USUBJID.
# NOTE(review): how='inner' drops MH records whose USUBJID has no DM row -
# confirm that is intended.
mh2 = mh1.merge(dm1, how='inner', on='USUBJID')


In [90]:

# Convert date columns to datetime for study day calculation
def convert_or_keep(val):
    """Convert a complete date to ``datetime.date``; return ``pd.NaT`` otherwise.

    Only complete calendar dates are converted. Partial dates such as '2020'
    or '2020-05' are NOT imputed to the first day of the year or month (which
    is what ``pd.to_datetime`` does), because a study day computed from an
    imputed date would be fabricated. Values that cannot be converted come
    back as ``pd.NaT`` rather than as the original string, so the result can
    always be used in date arithmetic or tested with ``pd.isna``.

    Parameters
    ----------
    val : str, datetime.date, datetime.datetime or missing value
        A date string starting with YYYY-MM-DD (optionally followed by a 'T'
        or a space and a time part), or an existing date/datetime object.

    Returns
    -------
    datetime.date or pd.NaT
    """
    # Imported here because the notebook's import cell does not load datetime.
    import datetime as _dt

    # Existing date objects pass through; datetimes (including Timestamps)
    # are reduced to their date part.
    if isinstance(val, _dt.date):
        if pd.isna(val):
            return pd.NaT
        return val.date() if isinstance(val, _dt.datetime) else val

    if not isinstance(val, str):
        return pd.NaT

    text = val.strip()
    # Anything after the date must be a time part introduced by 'T' or a space.
    if len(text) > 10 and text[10] not in 'T ':
        return pd.NaT

    try:
        return _dt.datetime.strptime(text[:10], '%Y-%m-%d').date()
    except ValueError:
        # Empty, partial or otherwise malformed date.
        return pd.NaT

# Helper columns holding the converted start (STN) and end (ENN) dates.
mh2 = mh2.assign(
    STN=mh2['MHSTDTC'].apply(convert_or_keep),
    ENN=mh2['MHENDTC'].apply(convert_or_keep),
)

In [91]:

# Calculate study days
def calculate_study_day(event_date, ref_date):
    """Return the study day of ``event_date`` relative to ``ref_date``.

    There is no day 0: the reference date itself is day 1, later dates are
    ``difference + 1`` and earlier dates are the plain (negative) difference.

    Parameters
    ----------
    event_date, ref_date : datetime.date, datetime.datetime or other
        Datetimes (including pandas Timestamps) are reduced to their date
        part before subtracting.

    Returns
    -------
    int or float
        The study day, or ``np.nan`` when either value is missing or is not a
        date at all (for example an unparsed date string), so that a single
        bad value cannot abort a row-wise ``apply`` with a TypeError.
    """
    # Imported here because the notebook's import cell does not load datetime.
    import datetime as _dt

    def _as_date(value):
        """Return value as a plain date, or None when it is not a usable date."""
        if not isinstance(value, _dt.date) or pd.isna(value):
            return None
        return value.date() if isinstance(value, _dt.datetime) else value

    event_day = _as_date(event_date)
    ref_day = _as_date(ref_date)
    if event_day is None or ref_day is None:
        return np.nan

    diff = (event_day - ref_day).days
    # Dates before the reference keep the raw difference; all others shift by one.
    return diff if diff < 0 else diff + 1

# Study day of start and of end, each measured against RFSTDTC_N row by row.
study_day_sources = {'MHSTDY': 'STN', 'MHENDY': 'ENN'}
for dy_col, date_col in study_day_sources.items():
    mh2[dy_col] = mh2.apply(
        # date_col is bound as a default so each lambda reads its own column.
        lambda row, date_col=date_col: calculate_study_day(row[date_col], row['RFSTDTC_N']),
        axis=1,
    )


In [79]:

# Order the records by study, subject and dictionary term, then number them
# 1..n within each subject to form MHSEQ.
seq_sort_keys = ['STUDYID', 'USUBJID', 'MHDECOD']
mh2 = mh2.sort_values(by=seq_sort_keys, ignore_index=True)
mh2['MHSEQ'] = mh2.groupby('USUBJID').cumcount().add(1)


In [80]:

# Variables kept in the final MH dataset, in output order.
final_columns = [
    'STUDYID',
    'DOMAIN',
    'USUBJID',
    'MHSEQ',
    'MHTERM',
    'MHDECOD',
    'MHCAT',
    'MHBODSYS',
    'MHSTDTC',
    'MHENDTC',
    'MHENRF',
]

# Independent copy, so later edits to mh_final do not touch mh2.
mh_final = mh2.loc[:, final_columns].copy()


In [81]:
# Variable name -> label lookup for MH variables. It holds more names than
# the final dataset keeps; consumers look up only the columns they have.
column_labels = dict([
    ('STUDYID', "Study Identifier"),
    ('DOMAIN', "Domain Abbreviation"),
    ('USUBJID', "Unique Subject Identifier"),
    ('MHSEQ', "Sequence Number"),
    ('MHCAT', "Category for Medical History"),
    ('MHTERM', "Reported Term for the Medical History"),
    ('MHLLT', "Lowest Level Term"),
    ('MHLLTCD', "Lowest Level Term Code"),
    ('MHDECOD', "Dictionary-Derived Term"),
    ('MHHLT', "High Level Term"),
    ('MHHLTCD', "High Level Term Code"),
    ('MHHLGT', "High Level Group Term"),
    ('MHHLGTCD', "High Level Group Term Code"),
    ('MHPTCD', "Preferred Term Code"),
    ('MHBODSYS', "Body System or Organ Class"),
    ('MHBDSYCD', "Body System or Organ Class Code"),
    ('MHSOC', "Primary System Organ Class"),
    ('MHSOCCD', "Primary System Organ Class Code"),
    ('MHSTDTC', "Start Date/Time of Medical History Event"),
    ('MHENDTC', "End Date/Time of Medical History Event"),
    ('MHENRF', "End Relative to Reference Period"),
    ('MHSTDY', "Study Day of Start of Medical History Event"),
    ('MHENDY', "Study Day of End of Medical History Event"),
    ('MHDUR', "Duration of Medical History Event"),
])

In [82]:
# Keep the label lookup with the frame, under the 'column_labels' key of its attrs dict.
mh_final.attrs.update(column_labels=column_labels)

In [83]:
# List each final column with its label; columns without a label print an
# empty label.
labels_by_column = mh_final.attrs.get('column_labels', {})
for col in mh_final.columns:
    print(f"{col}: {labels_by_column.get(col, '')}")

STUDYID: Study Identifier
DOMAIN: Domain Abbreviation
USUBJID: Unique Subject Identifier
MHSEQ: Sequence Number
MHTERM: Reported Term for the Medical History
MHDECOD: Dictionary-Derived Term
MHCAT: Category for Medical History
MHBODSYS: Body System or Organ Class
MHSTDTC: Start Date/Time of Medical History Event
MHENDTC: End Date/Time of Medical History Event
MHENRF: End Relative to Reference Period


In [84]:
# Write the final MH dataset to CSV and report what was written.
output = "E:/Python Clinical Course/SDTM/SDTM CSV"
# Create the export folder if needed; to_csv does not create directories.
os.makedirs(output, exist_ok=True)
output_path = f"{output}/MH.csv"
mh_final.to_csv(output_path, index=False)

# len(mh_final) is the number of rows (medical-history records), not the
# number of subjects, so the message says "records".
print(f"MH dataset created successfully with {len(mh_final)} records and {len(mh_final.columns)} variables.")
print(f"Dataset saved to: {output_path}")

# Display basic info about the dataset
print("\nDataset Info:")
print(f"Shape: {mh_final.shape}")
print(f"Columns: {list(mh_final.columns)}")

MH dataset created successfully with 236 subjects and 11 variables.
Dataset saved to: E:/Python Clinical Course/SDTM/SDTM CSV/MH.csv

Dataset Info:
Shape: (236, 11)
Columns: ['STUDYID', 'DOMAIN', 'USUBJID', 'MHSEQ', 'MHTERM', 'MHDECOD', 'MHCAT', 'MHBODSYS', 'MHSTDTC', 'MHENDTC', 'MHENRF']


In [85]:
# Build SUPPMH: every MH record yields one supplemental-qualifier row per
# entry below, in this order (MHREL first, then MHONGO).
supp_qualifiers = (
    ('MHREL', 'Is the condition related to COVID-19?'),
    ('MHONGO', 'Ongoing'),
)

# NOTE(review): a row is written even when the source value is blank, and a
# qualifier column missing from mh2 falls back to '' - confirm whether rows
# with an empty QVAL should be kept.
smh_data = [
    {
        'STUDYID': record['STUDYID'],
        'RDOMAIN': 'MH',
        'USUBJID': record['USUBJID'],
        # The parent MH record is identified by its sequence number.
        'IDVAR': 'MHSEQ',
        'IDVARVAL': record['MHSEQ'],
        'QORIG': 'CRF',
        'QEVAL': '',
        'QNAM': qnam,
        'QLABEL': qlabel,
        'QVAL': record.get(qnam, ''),
    }
    for _, record in mh2.iterrows()
    for qnam, qlabel in supp_qualifiers
]

# One DataFrame built from all collected rows at once.
suppmh = pd.DataFrame(smh_data)


In [86]:

# Write the SUPPMH dataset to CSV and report what was written.
output = "E:/Python Clinical Course/SDTM/SDTM CSV"
# Create the export folder if needed; to_csv does not create directories.
os.makedirs(output, exist_ok=True)
output_path = f"{output}/SUPPMH.csv"
suppmh.to_csv(output_path, index=False)

# len(suppmh) is the number of rows (supplemental-qualifier records), not the
# number of subjects, so the message says "records".
print(f"SUPPMH dataset created successfully with {len(suppmh)} records and {len(suppmh.columns)} variables.")
print(f"Dataset saved to: {output_path}")

# Display basic info about the dataset
print("\nDataset Info:")
print(f"Shape: {suppmh.shape}")
print(f"Columns: {list(suppmh.columns)}")

SUPPMH dataset created successfully with 472 subjects and 10 variables.
Dataset saved to: E:/Python Clinical Course/SDTM/SDTM CSV/SUPPMH.csv

Dataset Info:
Shape: (472, 10)
Columns: ['STUDYID', 'RDOMAIN', 'USUBJID', 'IDVAR', 'IDVARVAL', 'QORIG', 'QEVAL', 'QNAM', 'QLABEL', 'QVAL']
