In [None]:
# Clear all variables from the namespace; -f skips the confirmation prompt
%reset -f


In [7]:
# Stop any active logging session so a new log of commands and outputs can be started below
%logstop

In [None]:
# Start logging this session to a file: -t adds timestamps, -o also logs cell output.
%logstart -t -o "E:/Python Clinical Course/ADCM log.txt"

Activating auto-logging. Current session state plus future input saved.
Filename       : E:/Python Clinical Course/ADCM log.txt
Mode           : backup
Output logging : True
Raw input log  : False
Timestamping   : True
State          : active


In [10]:
import pandas as pd
import numpy as np
import os
import pyreadstat
from datetime import datetime
from pathlib import Path

# Define paths
# Every location is derived from base_path so the project can be relocated
# by editing a single line.
base_path = Path("E:/Python Clinical Course")
raw_path = base_path / "RAW"
sdtm_path = base_path / "SDTM"
# Built as a Path (previously a separately hard-coded string) for consistency
# with the other locations; os.listdir and os.path.join accept Path objects.
adam_path = base_path / "ADAM datasets" / "ADaM Datasets"


In [11]:
# Read every .sas7bdat file in the SDTM folder into a dict keyed by the file
# name with the extension removed; only the data frame part of the reader's
# (frame, metadata) result is kept.
sdtm_datasets = {
    entry.replace(".sas7bdat", ""): pyreadstat.read_sas7bdat(os.path.join(sdtm_path, entry))[0]
    for entry in os.listdir(sdtm_path)
    if entry.endswith(".sas7bdat")
}

In [12]:

# Load datasets
# Index the dict directly: a missing dataset now raises KeyError here, naming
# the absent key, instead of yielding None and failing later with an
# AttributeError when the frame is first used.
cm = sdtm_datasets["cm"]
suppcm = sdtm_datasets["suppcm"]


In [13]:
# Read every .sas7bdat file in the ADaM folder into a dict keyed by the file
# name with the extension removed; the metadata returned by the reader is
# not retained.
adam_datasets = {
    entry.replace(".sas7bdat", ""): pyreadstat.read_sas7bdat(os.path.join(adam_path, entry))[0]
    for entry in os.listdir(adam_path)
    if entry.endswith(".sas7bdat")
}


In [14]:
# Load ADaM datasets
# Direct indexing raises KeyError immediately if ADSL was not loaded, rather
# than binding None and failing later when adsl is first used.
adsl = adam_datasets["adsl"]

In [15]:

# 1. Drop ARM, ACTARM
# Remove the arm columns when present, then order the records by subject and
# sequence number. Both steps return new frames, so `cm` itself is untouched.
cm1 = (
    cm
    .drop(columns=['ARM', 'ACTARM'], errors='ignore')
    .sort_values(by=['USUBJID', 'CMSEQ'])
)


In [16]:

# 2. Handle SUPPCM
# IDVARVAL is converted to a numeric CMSEQ (unparseable values become NaN) so
# SUPPCM can be joined to CM on USUBJID/CMSEQ.
# NOTE(review): this treats every SUPPCM row as keyed by CMSEQ — confirm that
# IDVAR is always 'CMSEQ' in this study.
# assign() and sort_values() build a new frame, so the raw SUPPCM frame still
# referenced from sdtm_datasets is no longer modified in place.
suppcm = (
    suppcm
    .assign(CMSEQ=pd.to_numeric(suppcm['IDVARVAL'], errors='coerce'))
    .sort_values(by=['USUBJID', 'CMSEQ'])
)


In [17]:

# Pivot QNAM/QVAL to wide
# One row per USUBJID/CMSEQ, one column per QNAM, holding the first QVAL found.
suppcm_trans = (
    suppcm
    .pivot_table(index=['USUBJID', 'CMSEQ'], columns='QNAM', values='QVAL', aggfunc='first')
    .reset_index()
)

# Drop STUDYID from adsl (ignored if the column is absent)
# presumably so the merge below does not produce a duplicated STUDYID — confirm.
adsl = adsl.drop(columns=['STUDYID'], errors='ignore')

# 3. Merge CM and SUPPCM
# The left join keeps every CM record even when it has no supplemental qualifiers.
cm2 = cm1.merge(suppcm_trans, on=['USUBJID', 'CMSEQ'], how='left')
# The inner join keeps only CM records whose subject is present in ADSL.
cm3 = cm2.merge(adsl, on='USUBJID', how='inner')


In [18]:

# 5. Date parsing and cleaning
def parse_date(date_str):
    """Parse a complete or partial date string, imputing the missing parts.

    Only the first 10 characters of the value are considered, so anything
    after the date portion (for example a time) is ignored.

    Parameters
    ----------
    date_str : str or missing value
        Text of the form ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` (optionally
        followed by further characters), or a missing value.

    Returns
    -------
    tuple
        ``(timestamp, flag, date_text)``:

        * ``timestamp`` -- the parsed ``pd.Timestamp``, or ``pd.NaT`` when the
          value is missing, has an unsupported length or cannot be parsed.
        * ``flag`` -- ``"M"`` when month and day were imputed (year only),
          ``"D"`` when only the day was imputed, ``""`` otherwise.
        * ``date_text`` -- the completed ``YYYY-MM-DD`` text that was parsed,
          or ``""`` whenever ``timestamp`` is ``pd.NaT``.
    """
    if pd.isna(date_str):
        # pd.NaT (not float NaN) keeps the resulting column datetime-like.
        return pd.NaT, '', ''

    dt = str(date_str)[:10]

    # Text appended to complete the date, and the imputation flag, keyed by
    # the length of the supplied date portion.
    imputation = {4: ("-01-01", "M"), 7: ("-01", "D"), 10: ("", "")}
    if len(dt) not in imputation:
        # Unsupported shape: return nothing parseable, so a later
        # pd.to_datetime on the text cannot produce an unflagged date.
        return pd.NaT, '', ''

    suffix, flag = imputation[len(dt)]
    completed = dt + suffix
    parsed = pd.to_datetime(completed, errors='coerce')
    if pd.isna(parsed):
        # Do not report an imputation flag for a value that is not a date.
        return pd.NaT, '', ''
    return parsed, flag, completed


# Run parse_date on the start and end date text of every record and spread
# the three returned values over three new columns per source column.
for dtc_col, derived_cols in (
    ('CMSTDTC', ['CMSTDTC_C', 'ASTDTF', 'ASTDT']),
    ('CMENDTC', ['CMENDTC_C', 'AENDTF', 'AENDT']),
):
    # rec.get() yields None when the source column is absent from cm3.
    cm3[derived_cols] = cm3.apply(
        lambda rec, source=dtc_col: pd.Series(parse_date(rec.get(source))), axis=1)

# Convert the analysis dates and the treatment start date to datetimes;
# values that cannot be converted become NaT.
for date_col in ('ASTDT', 'AENDT', 'TRTSDT'):
    cm3[date_col] = pd.to_datetime(cm3[date_col], errors='coerce')



In [19]:

# 6. ATC4 handling
# Copy CODE4/TEXT4 into the ATC4 code/text columns; cm3.get() returns None
# when the source column does not exist.
# presumably CODE4/TEXT4 are QNAMs pivoted out of SUPPCM — confirm.
for atc_col, source_col in (('ATC4', 'CODE4'), ('ATC4TXT', 'TEXT4')):
    cm3[atc_col] = cm3.get(source_col)


In [20]:

# 7. Calculate ASTDY, AENDY, ONTRTFL
# Relative day = date - TRTSDT in days, plus 1 when the date is on or after
# TRTSDT (so there is no day 0); NaN when either date is missing.
for day_col, date_col in (('ASTDY', 'ASTDT'), ('AENDY', 'AENDT')):
    both_known = cm3[date_col].notna() & cm3['TRTSDT'].notna()
    before_treatment = cm3[date_col] < cm3['TRTSDT']
    elapsed_days = (cm3[date_col] - cm3['TRTSDT']).dt.days
    cm3[day_col] = np.where(
        both_known,
        np.where(before_treatment, elapsed_days, elapsed_days + 1),
        np.nan)

# Flag 'Y' when the medication started on/after treatment start, or started
# earlier but is recorded as ONGOING; otherwise an empty string.
# NOTE(review): TRTEDT and AENDT are not considered here — confirm against the spec.
started_on_or_after = cm3['ASTDT'] >= cm3['TRTSDT']
started_before = cm3['ASTDT'] < cm3['TRTSDT']
recorded_ongoing = cm3['CMENRF'] == 'ONGOING'
cm3['ONTRTFL'] = np.where(
    started_on_or_after | (started_before & recorded_ongoing),
    'Y', '')


In [21]:

# 8. Keep final variables
# Ordered list of the columns wanted in the final dataset, assembled group by group.
# NOTE(review): ASTDTF/AENDTF are derived earlier in the notebook but are not
# listed here — confirm that dropping them is intended.
keep_vars = (
    # subject-level variables
    ['STUDYID', 'USUBJID', 'SUBJID', 'SITEID', 'AGE', 'AGEU', 'SEX', 'RACE', 'ETHNIC', 'COUNTRY']
    # population flags and treatment variables
    + ['SAFFL', 'RANDFL', 'TRT01P', 'TRT01PN', 'TRT01A', 'TRT01AN', 'TRTSDT', 'TRTEDT']
    # medication record variables
    + ['CMSEQ', 'CMTRT', 'CMDECOD', 'CMINDC', 'CMDOSE', 'CMDOSU', 'CMDOSFRQ', 'CMROUTE',
       'CMSTDTC', 'CMENDTC', 'CMENRF']
    # additional qualifiers and ATC coding
    + ['CMINDSPE', 'ATC4', 'ATC4TXT', 'CMFRQOTH', 'CMONGO', 'CMDSEOTH', 'CMAENUM', 'CMMHNUM']
    # analysis dates, relative days and on-treatment flag
    + ['ASTDT', 'ASTDY', 'AENDT', 'AENDY', 'ONTRTFL']
)


In [22]:
# Keep the requested variables that are actually present in cm3, preserving
# the keep_vars order; requested variables that are absent are skipped silently.
existing_columns = []
for wanted in keep_vars:
    if wanted in cm3.columns:
        existing_columns.append(wanted)

In [23]:
# Select the retained columns into an independent copy so later changes to
# cm4 do not touch cm3.
cm4 = cm3.loc[:, existing_columns].copy()

In [24]:
# Descriptive label for each variable of the final dataset, keyed by variable name.
column_labels = {
    # subject-level variables
    'STUDYID': "Study Identifier",
    'USUBJID': "Unique Subject Identifier",
    'SUBJID': "Subject Identifier for the Study",
    'SITEID': "Study Site Identifier",
    'AGE': "Age",
    'AGEU': "Age Units",
    'SEX': "Sex",
    'RACE': "Race",
    'ETHNIC': "Ethnicity",
    'COUNTRY': "Country",

    # population flags and treatment variables
    'SAFFL': "Safety Population Flag",
    'RANDFL': "Randomized population Flag",
    'TRT01P': "Planned Treatment for Period 01",
    'TRT01PN': "Planned Treatment for Period 01 (N)",
    'TRT01A': "Actual Treatment for Period 01",
    'TRT01AN': "Actual Treatment for Period 01 (N)",
    'TRTSDT': "Date of First Exposure to Treatment",
    'TRTEDT': "Date of Last Exposure to Treatment",

    # medication record variables
    'CMSEQ': "Sequence Number",
    'CMTRT': "Reported Name of Drug, Med, or Therapy",
    'CMDECOD': "Standardized Medication Name",
    'CMINDC': "Indication",
    'CMDOSE': "Dose per Administration",
    'CMDOSU': "Dose Units",
    'CMDOSFRQ': "Dosing Frequency per Interval",
    'CMROUTE': "Route of Administration",
    'CMSTDTC': "Start Date/Time of Medication",
    'CMENDTC': "End Date/Time of Medication",
    'CMENRF': "End Relative to Reference Period",

    # additional qualifiers and ATC coding
    'CMINDSPE': "Indication Specify",
    'ATC4': "ATC Level 4 Code",
    'ATC4TXT': "ATC Level 4 Text",
    'CMFRQOTH': "Other Frequency, Specify",
    'CMONGO': "Ongoing",
    'CMDSEOTH': "Other Dose Unit, Specify",
    'CMAENUM': "Related to AE",
    'CMMHNUM': "Related to MH",

    # analysis dates, relative days and on-treatment flag
    'ASTDT': "Analysis Start Date",
    'ASTDY': "Analysis Start Relative Day",
    'AENDT': "Analysis End Date",
    'AENDY': "Analysis End Relative Day",
    'ONTRTFL': "On Treatment Record Flag",
}


In [25]:
# Attach the label mapping to the frame's metadata dict under 'column_labels'.
cm4.attrs.update(column_labels=column_labels)

In [26]:
# List each column with its stored label; a column without a label prints
# an empty label.
stored_labels = cm4.attrs.get('column_labels', {})
for col in cm4.columns:
    print(f"{col}: {stored_labels.get(col, '')}")

STUDYID: Study Identifier
USUBJID: Unique Subject Identifier
SUBJID: Subject Identifier for the Study
SITEID: Study Site Identifier
AGE: Age
AGEU: Age Units
SEX: Sex
RACE: Race
ETHNIC: Ethnicity
COUNTRY: Country
SAFFL: Safety Population Flag
RANDFL: Randomized population Flag
TRT01P: Planned Treatment for Period 01
TRT01PN: Planned Treatment for Period 01 (N)
TRT01A: Actual Treatment for Period 01
TRT01AN: Actual Treatment for Period 01 (N)
TRTSDT: Date of First Exposure to Treatment
TRTEDT: Date of Last Exposure to Treatment
CMSEQ: Sequence Number
CMTRT: Reported Name of Drug, Med, or Therapy
CMDECOD: Standardized Medication Name
CMINDC: Indication
CMDOSE: Dose per Administration
CMDOSU: Dose Units
CMDOSFRQ: Dosing Frequency per Interval
CMROUTE: Route of Administration
CMSTDTC: Start Date/Time of Medication
CMENDTC: End Date/Time of Medication
CMENRF: End Relative to Reference Period
CMINDSPE: Indication Specify
ATC4: ATC Level 4 Code
ATC4TXT: ATC Level 4 Text
CMFRQOTH: Other Frequen

In [27]:

#  Save the final dataset
# Write cm4 as a CSV file (without the index column) into the ADaM output folder.
output = "E:/Python Clinical Course/ADAM datasets"
output_path = f"{output}/ADCM.csv"
cm4.to_csv(output_path, index=False)

# len(cm4) is the number of rows (medication records), not the number of
# subjects, so the message reports "records".
print(f"ADCM dataset created successfully with {len(cm4)} records and {len(cm4.columns)} variables.")
print(f"Dataset saved to: {output_path}")

# Display basic info about the dataset
print("\nDataset Info:")
print(f"Shape: {cm4.shape}")
print(f"Columns: {list(cm4.columns)}")


ADCM dataset created successfully with 384 subjects and 42 variables.
Dataset saved to: E:/Python Clinical Course/ADAM datasets/ADCM.csv

Dataset Info:
Shape: (384, 42)
Columns: ['STUDYID', 'USUBJID', 'SUBJID', 'SITEID', 'AGE', 'AGEU', 'SEX', 'RACE', 'ETHNIC', 'COUNTRY', 'SAFFL', 'RANDFL', 'TRT01P', 'TRT01PN', 'TRT01A', 'TRT01AN', 'TRTSDT', 'TRTEDT', 'CMSEQ', 'CMTRT', 'CMDECOD', 'CMINDC', 'CMDOSE', 'CMDOSU', 'CMDOSFRQ', 'CMROUTE', 'CMSTDTC', 'CMENDTC', 'CMENRF', 'CMINDSPE', 'ATC4', 'ATC4TXT', 'CMFRQOTH', 'CMONGO', 'CMDSEOTH', 'CMAENUM', 'CMMHNUM', 'ASTDT', 'ASTDY', 'AENDT', 'AENDY', 'ONTRTFL']
