In [7]:
# Session setup: start from a clean namespace and record the session to a log file.
# `%reset -f` removes all user-defined variables without asking for confirmation.
%reset -f

# Log the commands and their outputs to a file.
# `%logstop` is issued first, presumably so that re-running this cell does not
# collide with a log that is already active -- TODO confirm.
# `-t` timestamps the logged input and `-o` also logs outputs, matching the
# "Timestamping : True" / "Output logging : True" status printed by this cell.
%logstop
%logstart -t -o "E:/Python Clinical Course/DM log.txt"

Activating auto-logging. Current session state plus future input saved.
Filename       : E:/Python Clinical Course/DM log.txt
Mode           : backup
Output logging : True
Raw input log  : False
Timestamping   : True
State          : active


In [8]:
import pandas as pd
import pyreadstat
import os

In [9]:
# Folder scanned for the raw (source) .sas7bdat files.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
raw_path = r"E:\Python Clinical Course\RAW"
# Folder scanned for the SDTM .sas7bdat files.
sdtm_path = r"E:\Python Clinical Course\SDTM"

In [10]:
# Read every SAS dataset in raw_path into a dict keyed by the file name
# with the ".sas7bdat" extension removed (e.g. "dm.sas7bdat" -> "dm").
raw_datasets = {}

raw_sas_files = [name for name in os.listdir(raw_path) if name.endswith(".sas7bdat")]
for raw_sas_file in raw_sas_files:
    # read_sas7bdat returns (DataFrame, metadata); only the DataFrame is kept.
    raw_frame, _raw_meta = pyreadstat.read_sas7bdat(os.path.join(raw_path, raw_sas_file))
    raw_datasets[raw_sas_file.replace(".sas7bdat", "")] = raw_frame

In [11]:
# Read every SAS dataset in sdtm_path into a dict keyed by the file name
# with the ".sas7bdat" extension removed.
sdtm_datasets = {}

sdtm_sas_files = [name for name in os.listdir(sdtm_path) if name.endswith(".sas7bdat")]
for sdtm_sas_file in sdtm_sas_files:
    # read_sas7bdat returns (DataFrame, metadata); only the DataFrame is kept.
    sdtm_frame, _sdtm_meta = pyreadstat.read_sas7bdat(os.path.join(sdtm_path, sdtm_sas_file))
    sdtm_datasets[sdtm_sas_file.replace(".sas7bdat", "")] = sdtm_frame

In [12]:
# Helper functions
def iso_date(date):
    """Format a Series of dates as ISO 8601 'YYYY-MM-DD' strings.

    Parameters
    ----------
    date : pandas.Series
        Values convertible by ``pd.to_datetime``. Values that cannot be
        parsed are coerced to missing rather than raising.

    Returns
    -------
    pandas.Series
        Date strings, with missing values where the input was missing or
        unparseable.
    """
    parsed = pd.to_datetime(date, errors='coerce')
    return parsed.dt.strftime('%Y-%m-%d')

def iso_datetime(date, time):
    """Combine date and time columns into ISO 8601 date/time strings.

    Parameters
    ----------
    date : pandas.Series
        Values convertible by ``pd.to_datetime``; unparseable values are
        treated as missing.
    time : pandas.Series
        Time values in ``HH:MM:SS`` form; anything else is treated as missing.

    Returns
    -------
    list
        One entry per input row:
        * ``'YYYY-MM-DDTHH:MM:SS'`` when both date and time are present,
        * ``'YYYY-MM-DD'`` when only the date is present (ISO 8601 allows the
          time part to be omitted, so a known date is not thrown away just
          because the time was not collected),
        * ``None`` when the date is missing.
    """
    dates = pd.to_datetime(date, errors='coerce')
    times = pd.to_datetime(time, format='%H:%M:%S', errors='coerce').dt.time

    combined = []
    for d, t in zip(dates, times):
        if pd.isnull(d):
            # Without a date there is nothing meaningful to report.
            combined.append(None)
        elif pd.isnull(t):
            # Date known, time unknown: keep the date-only form.
            combined.append(f"{d.date()}")
        else:
            combined.append(f"{d.date()}T{t}")
    return combined


In [13]:

# Pick the raw source tables needed to build DM out of raw_datasets.
# Every one of them is used unconditionally further down, so a missing
# dataset is reported here by name instead of surfacing later as an
# AttributeError on None.
required_raw = ("dm", "ic", "ds", "ex", "dummy_rnd", "dat_sub")
missing_raw = [name for name in required_raw if name not in raw_datasets]
if missing_raw:
    raise KeyError(f"Raw dataset(s) not found in raw_datasets: {missing_raw}")

raw_dm = raw_datasets["dm"]
raw_ic = raw_datasets["ic"]
raw_ds = raw_datasets["ds"]
raw_ex = raw_datasets["ex"]
raw_trt = raw_datasets["dummy_rnd"]
raw_scf = raw_datasets["dat_sub"]

In [24]:

# === DM1 dataset ===
# Core demographics from the raw DM table, reduced to one row per USUBJID.

# Raw ethnicity code -> reported ethnicity value.
eth_map = {
    'HISP': 'HISPANIC OR LATINO',
    'NHISP': 'NOT HISPANIC OR LATINO',
    'U': 'UNKNOWN'
}

dm1 = (
    raw_dm
    .assign(
        STUDYID='AAA-2022',
        DOMAIN='DM',
        # SITEID is assumed to be the first three characters of SITENUM.
        SITEID=lambda d: d['SITENUM'].astype(str).str[0:3],
        # SUBJID is SUBNUM with its first three characters removed.
        SUBJID=lambda d: d['SUBNUM'].str[3:],
        USUBJID=lambda d: d['STUDYID'] + '-' + d['SITEID'] + '-' + d['SUBJID'],
        BRTHDTC=lambda d: iso_date(d['BRTHDAT']),
        AGEU='YEARS',
        # Codes not in eth_map (including missing) become 'DECLINED TO ANSWER'.
        ETHNIC=lambda d: d['ETHNIC'].map(eth_map).fillna('DECLINED TO ANSWER'),
    )
    # AGE, SEX and RACE are carried over from the raw table unchanged.
    .loc[:, ['STUDYID', 'DOMAIN', 'USUBJID', 'SUBJID', 'SITEID', 'BRTHDTC',
             'AGE', 'AGEU', 'SEX', 'RACE', 'ETHNIC']]
    .drop_duplicates(subset='USUBJID')
)


In [25]:

# === Informed Consent (IC) ===
# Informed-consent date per subject (RFICDTC), keyed by USUBJID.
ic = (
    raw_ic
    .assign(
        STUDYID='AAA-2022',
        # SITEID is assumed to be the first three characters of SITENUM.
        SITEID=lambda d: d['SITENUM'].astype(str).str[0:3],
        SUBJID=lambda d: d['SUBNUM'].str[3:],
        USUBJID=lambda d: d['STUDYID'] + '-' + d['SITEID'] + '-' + d['SUBJID'],
        RFICDTC=lambda d: iso_date(d['ICDAT']),
    )
    .loc[:, ['USUBJID', 'RFICDTC']]
    .drop_duplicates()
)


In [26]:

# === Death Info (DS) ===
# Death date/flag and end-of-participation date per subject, keyed by USUBJID.
ds = (
    raw_ds
    .assign(
        STUDYID='AAA-2022',
        # SITEID is assumed to be the first three characters of SITENUM.
        SITEID=lambda d: d['SITENUM'].astype(str).str[0:3],
        SUBJID=lambda d: d['SUBNUM'].str[3:],
        USUBJID=lambda d: d['STUDYID'] + '-' + d['SITEID'] + '-' + d['SUBJID'],
        DTHDTC=lambda d: iso_date(d['DSDTHDAT']),
        # Flag is 'Y' only where a death date exists; otherwise left empty.
        DTHFL=lambda d: d['DTHDTC'].apply(lambda dtc: None if pd.isnull(dtc) else 'Y'),
        RFPENDTC=lambda d: iso_date(d['DSLVDAT']),
    )
    .loc[:, ['USUBJID', 'DTHDTC', 'DTHFL', 'RFPENDTC']]
    .drop_duplicates()
)


In [27]:

# === Exposure (EX) ===
# Treatment reference dates per subject, keyed by USUBJID.
ex = raw_ex.copy()
ex['STUDYID'] = 'AAA-2022'
ex['DOMAIN'] = 'DM'
ex['SITENUM'] = ex['SITENUM'].astype(str)
ex['SITEID'] = ex['SITENUM'].str[0:3]  # Assuming SITEID is the first three characters of SITENUM
ex['SUBJID'] = ex['SUBNUM'].str[3:]
ex['USUBJID'] = ex['STUDYID'] + '-' + ex['SITEID'].astype(str) + '-' + ex['SUBJID']

# ISO 8601 start date/time of each individual exposure record.
ex['EXSTDTC'] = iso_datetime(ex['EXSTDAT'], ex['EXSTTIM'])

# Collapse to exactly one row per subject. Without this, a subject with
# several exposure records keeps several rows and is duplicated when merged
# into DM. ISO 8601 strings sort chronologically, so min/max on the strings
# give the earliest/latest record.
# NOTE(review): the last-treatment date is taken from the latest exposure
# START date/time because that is the only exposure timestamp used here; if
# raw EX carries an end date/time, RFXENDTC should be derived from it -- confirm.
ex = (
    ex.dropna(subset=['EXSTDTC'])
      .groupby('USUBJID', as_index=False)
      .agg(RFXSTDTC=('EXSTDTC', 'min'), RFXENDTC=('EXSTDTC', 'max'))
)
# Subject reference start/end follow first/last study treatment.
ex['RFSTDTC'] = ex['RFXSTDTC']
ex['RFENDTC'] = ex['RFXENDTC']
ex = ex[['USUBJID', 'RFXSTDTC', 'RFSTDTC', 'RFXENDTC', 'RFENDTC']]


In [22]:

# === Treatment Assignment ===
# Planned/actual arm per subject from the randomisation table, keyed by USUBJID.
trt = (
    raw_trt
    .assign(
        STUDYID='AAA-2022',
        # Site and subject parts are sliced out of the raw USUBJID by position.
        # NOTE(review): positions 11-13 / 14+ depend on the raw USUBJID layout -- confirm.
        SITEID=lambda d: d['USUBJID'].str[11:14],
        SUBJID=lambda d: d['USUBJID'].str[14:],
        # Rebuilt last so the slices above still read the raw USUBJID.
        USUBJID=lambda d: d['STUDYID'] + '-' + d['SITEID'] + '-' + d['SUBJID'],
        # All four arm variables take the upper-cased treatment code.
        ARMCD=lambda d: d['trtcd'].str.upper(),
        ARM=lambda d: d['trtcd'].str.upper(),
        ACTARMCD=lambda d: d['trtcd'].str.upper(),
        ACTARM=lambda d: d['trtcd'].str.upper(),
    )
    .loc[:, ['USUBJID', 'ARMCD', 'ARM', 'ACTARMCD', 'ACTARM']]
    .drop_duplicates()
)


In [28]:

# === Screen Failure (SCF) ===
# Marks subjects whose STATUSID is 15 as screen failures, keyed by USUBJID.
scf = raw_scf.assign(
    STUDYID='AAA-2022',
    # SITEID is assumed to be the first three characters of SITENUM.
    SITEID=lambda d: d['SITENUM'].astype(str).str[0:3],
    SUBJID=lambda d: d['SUBNUM'].str[3:],
    USUBJID=lambda d: d['STUDYID'] + '-' + d['SITEID'] + '-' + d['SUBJID'],
)

# Only the flagged rows receive a value; all other rows stay empty.
is_screen_failure = scf['STATUSID'] == 15
for arm_col in ('ARMNRS', 'ACTARMUD'):
    scf.loc[is_screen_failure, arm_col] = 'SCREEN FAILURE'

scf = scf[['USUBJID', 'ARMNRS', 'ACTARMUD']].drop_duplicates()


In [29]:

# === Merge All ===
# Left-join each per-subject lookup onto the demographics base so that every
# subject in dm1 is kept even when a lookup has no record for them.
final = dm1
for lookup in (ic, ds, ex, trt, scf):
    final = final.merge(lookup, on='USUBJID', how='left')

# Same country for every subject.
final['COUNTRY'] = 'USA'


In [30]:

# === Column Order ===
# Final variable order of the DM dataset, written group by group.
columns_order = (
    ['STUDYID', 'DOMAIN', 'USUBJID', 'SUBJID']                                  # identifiers
    + ['RFSTDTC', 'RFENDTC', 'RFXSTDTC', 'RFXENDTC', 'RFICDTC', 'RFPENDTC']     # reference dates
    + ['DTHDTC', 'DTHFL']                                                       # death
    + ['SITEID', 'BRTHDTC', 'AGE', 'AGEU', 'SEX', 'RACE', 'ETHNIC']             # demographics
    + ['ARMCD', 'ARM', 'ACTARMCD', 'ACTARM', 'ARMNRS', 'ACTARMUD']              # arms
    + ['COUNTRY']
)
# Keep exactly these columns, in this order.
final = final.loc[:, columns_order]


In [32]:
# Descriptive label for each DM variable, keyed by variable name.
column_labels = dict(
    STUDYID="Study Identifier",
    DOMAIN="Domain Abbreviation",
    USUBJID="Unique Subject Identifier",
    SUBJID="Subject Identifier for the Study",
    SITEID="Study Site Identifier",
    RFSTDTC="Subject Reference Start Date/Time",
    RFENDTC="Subject Reference End Date/Time",
    RFXSTDTC="Date/Time of First Study Treatment",
    RFXENDTC="Date/Time of Last Study Treatment",
    RFICDTC="Date/Time of Informed Consent",
    RFPENDTC="Date/Time of End of Participation",
    DTHDTC="Date/Time of Death",
    DTHFL="Subject Death Flag",
    BRTHDTC="Date/Time of Birth",
    AGE="Age",
    AGEU="Age Units",
    SEX="Sex",
    RACE="Race",
    ETHNIC="Ethnicity",
    ARMCD="Planned Arm Code",
    ARM="Description of Planned Arm",
    ACTARMCD="Actual Arm Code",
    ACTARM="Description of Actual Arm",
    ARMNRS="Reason Arm and/or Actual Arm is Null",
    ACTARMUD="Description of Unplanned Actual Arm",
    COUNTRY="Country",
)


In [33]:
# Attach the variable labels to the DataFrame as metadata under 'column_labels'.
final.attrs.update({'column_labels': column_labels})

In [32]:
# List each column with its label; columns without a label print an empty label.
labels_by_column = final.attrs.get('column_labels', {})
for col in final.columns:
    print(f"{col}: {labels_by_column.get(col, '')}")

STUDYID: Study Identifier
DOMAIN: Domain Abbreviation
USUBJID: Unique Subject Identifier
SUBJID: Subject Identifier for the Study
RFSTDTC: Subject Reference Start Date/Time
RFENDTC: Subject Reference End Date/Time
RFXSTDTC: Date/Time of First Study Treatment
RFXENDTC: Date/Time of Last Study Treatment
RFICDTC: Date/Time of Informed Consent
RFPENDTC: Date/Time of End of Participation
DTHDTC: Date/Time of Death
DTHFL: Subject Death Flag
SITEID: Study Site Identifier
BRTHDTC: Date/Time of Birth
AGE: Age
AGEU: Age Units
SEX: Sex
RACE: Race
ETHNIC: Ethnicity
ARMCD: Planned Arm Code
ARM: Description of Planned Arm
ACTARMCD: Actual Arm Code
ACTARM: Description of Actual Arm
ARMNRS: Reason Arm and/or Actual Arm is Null
ACTARMUD: Description of Unplanned Actual Arm
COUNTRY: Country


In [34]:
# Save the final DM dataset as CSV and print a short summary.
output = "E:/Python Clinical Course/SDTM/SDTM CSV"
output_path = f"{output}/DM.csv"
# Create the target folder if needed; otherwise to_csv raises OSError when
# the folder does not exist yet (e.g. first run on a new machine).
os.makedirs(output, exist_ok=True)
final.to_csv(output_path, index=False)

# NOTE(review): len(final) counts rows; it equals the number of subjects only
# if USUBJID is unique in `final`.
print(f"DM dataset created successfully with {len(final)} subjects and {len(final.columns)} variables.")
print(f"Dataset saved to: {output_path}")

# Display basic info about the dataset
print("\nDataset Info:")
print(f"Shape: {final.shape}")
print(f"Columns: {list(final.columns)}")

DM dataset created successfully with 384 subjects and 26 variables.
Dataset saved to: E:/Python Clinical Course/SDTM/SDTM CSV/DM.csv

Dataset Info:
Shape: (384, 26)
Columns: ['STUDYID', 'DOMAIN', 'USUBJID', 'SUBJID', 'RFSTDTC', 'RFENDTC', 'RFXSTDTC', 'RFXENDTC', 'RFICDTC', 'RFPENDTC', 'DTHDTC', 'DTHFL', 'SITEID', 'BRTHDTC', 'AGE', 'AGEU', 'SEX', 'RACE', 'ETHNIC', 'ARMCD', 'ARM', 'ACTARMCD', 'ACTARM', 'ARMNRS', 'ACTARMUD', 'COUNTRY']
