In [20]:
#To remove all variables from the namespace
%reset -f

#Creating a log file to record the commands and outputs
%logstop
%logstart -t -o "E:/Python Clinical Course/CM log.txt"

Activating auto-logging. Current session state plus future input saved.
Filename       : E:/Python Clinical Course/CM log.txt
Mode           : backup
Output logging : True
Raw input log  : False
Timestamping   : True
State          : active


In [21]:
import pandas as pd
import pyreadstat
import os
import numpy as np

In [22]:
raw_path = r"E:\Python Clinical Course\RAW"
sdtm_path = r"E:\Python Clinical Course\SDTM"

In [23]:
raw_datasets = {}

for file in os.listdir(raw_path):
    if file.endswith(".sas7bdat"):
        dataset_name = file.replace(".sas7bdat", "")
        file_path = os.path.join(raw_path, file)
        df, meta = pyreadstat.read_sas7bdat(file_path)
        raw_datasets[dataset_name] = df

In [24]:
sdtm_datasets = {}

for file in os.listdir(sdtm_path):
    if file.endswith(".sas7bdat"):
        dataset_name = file.replace(".sas7bdat", "")
        file_path = os.path.join(sdtm_path, file)
        df, meta = pyreadstat.read_sas7bdat(file_path)
        sdtm_datasets[dataset_name] = df

In [25]:
# Load raw datasets (replace with actual paths or DataFrame reads)
cm1 = raw_datasets.get("cm")
cmp = raw_datasets.get("cmpd")
dm = sdtm_datasets.get("dm_")
se= sdtm_datasets.get("se")

In [26]:
def parse_partial_date(date_str, time_str=None):
    if pd.isna(date_str) or date_str in ["", "UNK-UNK-UNK"]:
        return ""
    
    #convert to string if not already
    date_str = str(date_str).strip()
    parts = date_str.split('-')
    y, m, d = parts + [''] * (3 - len(parts))
    y = '' if y == 'UNK' else y
    m = '' if m == 'UNK' else m
    d = '' if d == 'UNK' else d
    date = '-'.join(filter(None, [y, m, d]))
    if time_str and time_str != 'U':
        return f"{date}T{time_str}"
    return date


In [27]:

# Load datasets
cm1.rename(columns={
    "CMTRT": "CMTRTX", "CMINDC": "CMINDCX", "CMDOSE": "CMDOSEX",
    "CMDOSEU": "CMDOSEUX", "CMFREQ": "CMFREQX", "CMROUTE": "CMROUTEX"
}, inplace=True)
cm1["CMTRT"] = cm1["CMTRTX"]
cm1["CMINDC"] = cm1["CMINDCX"]
cm1["CMDOSE"] = cm1["CMDOSEX"]
cm1["CMDOSTXT"] = cm1["CMDOSEX"].astype(str).where(cm1["CMDOSE"].isna(), '')


In [29]:
cm1["CMSTDTC"] = cm1.apply(lambda row: parse_partial_date(row["CMSTDAT"], row["CMSTTIM"]), axis=1)
cm1["CMENDTC"] = cm1.apply(lambda row: parse_partial_date(row["CMENDAT"], row["CMENTIM"]), axis=1)
cm1["CMENRF"] = np.where(cm1["CMONGO"] == "X", "ONGOING", "")
cm1["CMDECOD"] = cm1["PREFERRED_NAME"]
cm1["CMCAT"] = "PRIOR AND CONCOMITANT MEDICATIONS"
cm1["CMDOSU"] = cm1["CMDOSEUX"]
cm1["CMDOSFRQ"] = cm1["CMFREQX"]
cm1["CMROUTE"] = cm1["CMROUTEX"]
cm1["STUDYID"] = "AA-2020"
cm1["DOMAIN"] = "CM"
cm1["SITENUM"] = cm1["SITENUM"].astype(str)
cm1["SITEID"] = cm1["SITENUM"].str[0:3]
cm1["SUBJID"] = cm1["SUBNUM"].str[3:]
cm1["USUBJID"] = "AA-2020-" + cm1["SITEID"].astype(str) + "-" + cm1["SUBJID"]


In [30]:
cmp.rename(columns={"CMTRT": "CMTRTX", "CMINDC": "CMINDCX"}, inplace=True)
cmp["CMTRT"] = cmp["CMTRTX"]
cmp["CMINDC"] = cmp["CMINDCX"]


In [None]:
cmp["CMSTDTC"] = cmp["CMSTDAT"].apply(lambda x: parse_partial_date(x))
cmp["CMENDTC"] = cmp["CMENDAT"].apply(lambda x: parse_partial_date(x))
cmp["CMENRF"] = np.where(cmp["CMONGO"] == "X", "ONGOING", "")
cmp["STUDYID"] = "AA-2020"
cmp["DOMAIN"] = "CM"
cmp["SITENUM"] = cmp["SITENUM"].astype(str)
cmp["SITEID"] = cmp["SITENUM"].str[0:3]@
cmp["SUBJID"] = cmp["SUBNUM"].str[3:]
cmp["USUBJID"] = "AA-2020-" + cmp["SITEID"].astype(str) + "-" + cmp["SUBJID"]

cm_all = pd.concat([cm1, cmp], ignore_index=True)
cm_all = cm_all[cm_all["CMTRT"].notna()]


In [32]:

se["USUBJID"] = se["USUBJID"].apply(lambda x: x[:7] + "-" + x[11:14] + "-" + x[14:])
se_scrn = se[se["TAETORD"] == 1][["USUBJID", "SESTDTC", "SEENDTC"]].rename(columns={"SESTDTC": "SCRNST", "SEENDTC": "SCRNEND"})
se_trt = se[se["TAETORD"] == 2][["USUBJID", "SESTDTC", "SEENDTC"]].rename(columns={"SESTDTC": "CYCLE1ST", "SEENDTC": "CYCLE1END"})
se_fup = se[se["TAETORD"] == 3][["USUBJID", "SESTDTC", "SEENDTC"]].rename(columns={"SESTDTC": "LTFUPST", "SEENDTC": "LTFUPEND"})
cm_epoch = cm_all.merge(se_scrn, on="USUBJID", how="left").merge(se_trt, on="USUBJID", how="left").merge(se_fup, on="USUBJID", how="left")


In [33]:

def derive_epoch(row):
    date = row["CMSTDTC"][:10]
    for epoch, start, end in [
        ("FOLLOW-UP", "LTFUPST", "LTFUPEND"),
        ("TREATMENT", "CYCLE1ST", "CYCLE1END"),
        ("SCREENING", "SCRNST", "SCRNEND")
    ]:
        if pd.notna(row[start]) and pd.notna(row[end]):
            if row[start][:10] <= date <= row[end][:10]:
                return epoch
    return ""

cm_epoch["EPOCH"] = cm_epoch.apply(derive_epoch, axis=1)


In [34]:

dm["USUBJID"] = dm["USUBJID"].str.replace("AAA-2022", "AA-2020")
dm["RFSTDTC_N"] = pd.to_datetime(dm["RFSTDTC"], errors="coerce").dt.date

cm_epoch = cm_epoch.merge(dm[["USUBJID", "RFSTDTC", "RFSTDTC_N"]], on="USUBJID", how="left")
cm_epoch["CMSTDY"] = (pd.to_datetime(cm_epoch["CMSTDTC"].str[:10], errors='coerce') - pd.to_datetime(cm_epoch["RFSTDTC_N"], errors='coerce')).dt.days + 1
cm_epoch["CMENDY"] = (pd.to_datetime(cm_epoch["CMENDTC"].str[:10], errors='coerce') - pd.to_datetime(cm_epoch["RFSTDTC_N"], errors='coerce')).dt.days + 1

cm_epoch = cm_epoch.sort_values(by=["STUDYID", "USUBJID", "CMTRT", "CMSTDTC"])
cm_epoch["CMSEQ"] = cm_epoch.groupby("USUBJID").cumcount() + 1


In [35]:

columns = [
    "STUDYID", "DOMAIN", "USUBJID", "CMSEQ", "CMTRT", "CMDECOD", "CMCAT",
    "CMINDC", "CMDOSE", "CMDOSTXT", "CMDOSU", "CMDOSFRQ", "CMROUTE", "EPOCH",
    "CMSTDTC", "CMENDTC", "CMSTDY", "CMENDY", "CMENRF"
]
final_cm = cm_epoch[columns]


In [36]:
column_labels = {
'STUDYID': "Study Identifier",
'SUBJID': "Subject Identifier for the Study",
'SITEID': "Study Site Identifier",
'DOMAIN': "Domain Abbreviation",
'USUBJID': "Unique Subject Identifier",
'CMSEQ': "Sequence number",
'CMTRT': "Reported Name of Drug, Med or Therapy",
'CMDECOD': "Standardized Medicaion Name",
'CMCAT': "Category for Medication",
'CMINDC': "Indication",
'CMDOSE': "Dose per Administration",
'CMDOSTXT': "Dose Description",
'CMDOSU': "Dose Units",
'CMDOSFRQ': "Dosing Frequency per interval",
'CMROUTE': "Route of Administration",
'EPOCH': "Epoch",
'CMSTDTC': "Start Date/Time of Medication",
'CMENDTC': "End Date/Time of Medication",
'CMSTDY': "Study Day of Start of Medication",
'CMENDY': "End Date/Time of Medication",
'CMENRF': "End Relative to Reference period",
}

In [37]:
final_cm.attrs['column_labels'] = column_labels

In [38]:
for col in final_cm.columns:
    label = final_cm.attrs.get('column_labels', {}).get(col, '')
    print(f"{col}: {label}")

STUDYID: Study Identifier
DOMAIN: Domain Abbreviation
USUBJID: Unique Subject Identifier
CMSEQ: Sequence number
CMTRT: Reported Name of Drug, Med or Therapy
CMDECOD: Standardized Medicaion Name
CMCAT: Category for Medication
CMINDC: Indication
CMDOSE: Dose per Administration
CMDOSTXT: Dose Description
CMDOSU: Dose Units
CMDOSFRQ: Dosing Frequency per interval
CMROUTE: Route of Administration
EPOCH: Epoch
CMSTDTC: Start Date/Time of Medication
CMENDTC: End Date/Time of Medication
CMSTDY: Study Day of Start of Medication
CMENDY: End Date/Time of Medication
CMENRF: End Relative to Reference period


In [39]:
output = "E:/Python Clinical Course/SDTM/SDTM CSV"
output_path = f"{output}/CM.csv"
final_cm.to_csv(output_path, index=False)

print(f"CM dataset created successfully with {len(final_cm)} subjects and {len(final_cm.columns)} variables.")
print(f"Dataset saved to: {output_path}")

# Display basic info about the dataset
print("\nDataset Info:")
print(f"Shape: {final_cm.shape}")
print(f"Columns: {list(final_cm.columns)}")


CM dataset created successfully with 552 subjects and 19 variables.
Dataset saved to: E:/Python Clinical Course/SDTM/SDTM CSV/CM.csv

Dataset Info:
Shape: (552, 19)
Columns: ['STUDYID', 'DOMAIN', 'USUBJID', 'CMSEQ', 'CMTRT', 'CMDECOD', 'CMCAT', 'CMINDC', 'CMDOSE', 'CMDOSTXT', 'CMDOSU', 'CMDOSFRQ', 'CMROUTE', 'EPOCH', 'CMSTDTC', 'CMENDTC', 'CMSTDY', 'CMENDY', 'CMENRF']
