In [1]:
# Remove all variables from the namespace so the notebook starts from a clean
# state; -f forces the reset without asking for confirmation.
%reset -f


In [2]:
# Stop any logging that is still active before a new log is started in the
# next cell. When no logging is active, IPython only reports that logging
# had not been started.
%logstop


Logging hadn't been started.


In [3]:
# Start logging the session to a file: -t timestamps the entries and -o also
# records the outputs, matching the status summary printed below.
%logstart -t -o "E:/Python Clinical Course/ADSL log.txt"

Activating auto-logging. Current session state plus future input saved.
Filename       : E:/Python Clinical Course/ADSL log.txt
Mode           : backup
Output logging : True
Raw input log  : False
Timestamping   : True
State          : active


In [4]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
import pyreadstat

# Set up file paths (equivalent to SAS libname statements).
# sdtm_path is the folder the SDTM datasets are loaded from below.
# adam_path and raw_path are not referenced by the other cells visible here.
# NOTE(review): these are absolute paths on a local drive; the notebook will
# only run on a machine with the same folder layout.
adam_path = "E:/Python Clinical Course/ADAM datasets"
sdtm_path = "E:/Python Clinical Course/SDTM"
raw_path = "E:/Python Clinical Course/RAW"

In [5]:
# Load every SAS dataset found in the SDTM folder into a dict of DataFrames.
# Keys are the file names without extension, lower-cased, because the later
# cells look the datasets up by lower-case name ("dm", "vs", ...). The
# extension is matched case-insensitively and removed with splitext so that
# only a real trailing ".sas7bdat" is stripped.
sdtm_datasets = {}

for file in os.listdir(sdtm_path):
    stem, extension = os.path.splitext(file)
    if extension.lower() != ".sas7bdat":
        continue
    dataset_name = stem.lower()
    file_path = os.path.join(sdtm_path, file)
    # pyreadstat returns the data and a metadata object; only the data is kept.
    df, meta = pyreadstat.read_sas7bdat(file_path)
    sdtm_datasets[dataset_name] = df


In [6]:
#df, meta = pyreadstat.read_sas7bdat(f"{sdtm_path}/DM.sas7bdat")

In [7]:
# Pull the SDTM domains used below out of the loaded datasets.
# All six are used unconditionally by the later cells, so fail here with the
# names of the missing domains instead of letting a None value surface as an
# unrelated error further down.
required_domains = ("dm", "suppdm", "vs", "ds", "sv", "face")
missing_domains = [name for name in required_domains if name not in sdtm_datasets]
if missing_domains:
    raise KeyError(f"SDTM dataset(s) not loaded: {missing_domains}")

dm1 = sdtm_datasets["dm"]
suppdm = sdtm_datasets["suppdm"]
vs = sdtm_datasets["vs"]
ds = sdtm_datasets["ds"]
sv = sdtm_datasets["sv"]
face = sdtm_datasets["face"]


In [8]:
# Reshape SUPPDM from long to wide (the pandas counterpart of PROC TRANSPOSE):
# one row per USUBJID and one column per QNAM, filled with the first QVAL of
# each USUBJID/QNAM pair.
suppdm_trans = pd.pivot_table(
    suppdm,
    values='QVAL',
    index='USUBJID',
    columns='QNAM',
    aggfunc='first',
).reset_index()

In [9]:
# Left-join the transposed SUPPDM columns onto DM so every DM record is kept.
dm2 = dm1.merge(suppdm_trans, how='left', on='USUBJID')


In [10]:
# Work on an independent copy so the derivations below leave dm2 untouched.
dm3 = dm2.copy(deep=True)

In [11]:
# Age group: missing AGE stays missing, otherwise split at 40 years.
dm3['AGEGR1'] = np.where(
    dm3['AGE'].isna(),
    None,
    np.where(dm3['AGE'] >= 40, '>= 40 years old', '< 40 years old'),
)

In [12]:
# Numeric sex code: M -> 1, F -> 2; any other value maps to missing.
dm3['SEXN'] = dm3['SEX'].map(dict(F=2, M=1))


In [13]:

# Race numeric conversion: the listed races are numbered 1-7 in this order.
race_mapping = {
    race: code
    for code, race in enumerate(
        (
            "AMERICAN INDIAN OR ALASKA NATIVE",
            "ASIAN",
            "BLACK OR AFRICAN AMERICAN",
            "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER",
            "WHITE",
            "OTHER",
            "NOT REPORTED",
        ),
        start=1,
    )
}
# An empty string and None are both given code 8.
race_mapping[""] = 8
race_mapping[None] = 8
dm3['RACEN'] = dm3['RACE'].map(race_mapping)


In [14]:

# Ethnicity numeric conversion: the listed values are numbered 1-4 in this
# order; values not listed map to missing.
ethnic_mapping = {
    ethnicity: code
    for code, ethnicity in enumerate(
        (
            "HISPANIC OR LATINO",
            "NOT HISPANIC OR LATINO",
            "UNKNOWN",
            "NOT REPORTED",
        ),
        start=1,
    )
}
dm3['ETHNICN'] = dm3['ETHNIC'].map(ethnic_mapping)


In [15]:

# Treatment assignments: planned (from ARMCD) first, then actual (from ACTARMCD).
# 'TQ' -> 'Tafenoquine' / 1, 'PLACEBO' -> 'Placebo' / 2, any other code -> None.
for arm_code_col, trt_text_col, trt_num_col in (
    ('ARMCD', 'TRT01P', 'TRT01PN'),
    ('ACTARMCD', 'TRT01A', 'TRT01AN'),
):
    on_tq = dm3[arm_code_col] == 'TQ'
    on_placebo = dm3[arm_code_col] == 'PLACEBO'
    dm3[trt_text_col] = np.where(on_tq, 'Tafenoquine',
                                 np.where(on_placebo, 'Placebo', None))
    dm3[trt_num_col] = np.where(on_tq, 1,
                                np.where(on_placebo, 2, None))


In [16]:

# Date/time conversions
def parse_iso8601_datetime(date_str):
    """Parse an ISO 8601 date/time string into a pandas Timestamp.

    The strict ``YYYY-MM-DDTHH:MM:SS`` layout is tried first; if the text does
    not match it, pandas' general parser is used as a fallback.

    Parameters
    ----------
    date_str : str or missing value
        Text to parse.

    Returns
    -------
    pandas.Timestamp or None
        ``None`` when the input is missing, empty or cannot be parsed.
    """
    if pd.isna(date_str) or date_str == '':
        return None
    # Only parsing failures count as "no value". A bare except would also
    # swallow KeyboardInterrupt and SystemExit, which must propagate.
    unparseable = (ValueError, TypeError, OverflowError)
    try:
        return pd.to_datetime(date_str, format='%Y-%m-%dT%H:%M:%S')
    except unparseable:
        pass
    try:
        return pd.to_datetime(date_str)
    except unparseable:
        return None

def parse_date_only(date_str):
    """Parse the date part of an ISO 8601 string into a ``datetime.date``.

    Only the first ten characters (``YYYY-MM-DD``) are used, so any time part
    is ignored.

    Parameters
    ----------
    date_str : str or missing value
        Text to parse.

    Returns
    -------
    datetime.date or None
        ``None`` when the input is missing, empty, not a string, or does not
        start with a complete ``YYYY-MM-DD`` date (for example a partial date
        such as ``2020-05``).
    """
    if pd.isna(date_str) or date_str == '':
        return None
    try:
        return pd.to_datetime(date_str[:10], format='%Y-%m-%d').date()
    except (ValueError, TypeError, OverflowError):
        # Parsing failures only; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        return None

# Treatment start/end: a datetime and a date-only variable are derived from
# each of the RFXSTDTC and RFXENDTC text columns.
for dtc_col, datetime_col, date_col in (
    ('RFXSTDTC', 'TRTSDTM', 'TRTSDT'),
    ('RFXENDTC', 'TRTEDTM', 'TRTEDT'),
):
    dm3[datetime_col] = dm3[dtc_col].apply(parse_iso8601_datetime)
    dm3[date_col] = dm3[dtc_col].apply(parse_date_only)


In [17]:

# Treatment duration in days, counting both the first and the last day;
# None when either date is missing.
trt_first_day = pd.to_datetime(dm3['TRTSDT'])
trt_last_day = pd.to_datetime(dm3['TRTEDT'])
both_dates_known = dm3['TRTEDT'].notna() & dm3['TRTSDT'].notna()
dm3['TRTDURD'] = np.where(
    both_dates_known,
    (trt_last_day - trt_first_day).dt.days + 1,
    None,
)


In [18]:

# Flags: 'Y' when the source date/time text is present and non-empty,
# otherwise None. SCRNFL is driven by RFICDTC and SAFFL by RFXSTDTC.
for flag_col, dtc_col in (('SCRNFL', 'RFICDTC'), ('SAFFL', 'RFXSTDTC')):
    dtc_text = dm3[dtc_col]
    dm3[flag_col] = np.where(dtc_text.notna() & (dtc_text != ''), 'Y', None)


In [19]:

# 4) Process Vital Signs for BMI calculation
#vs = pd.read_csv(f"{sdtm_path}/VS.csv")
# Keep only the height and weight results recorded at the screening visit.
is_height_or_weight = vs['VSTESTCD'].isin(['HEIGHT', 'WEIGHT'])
is_screening_visit = vs['VISIT'] == 'Screening/Day -4 to -1'
vs_filtered = vs.loc[
    is_height_or_weight & is_screening_visit,
    ['USUBJID', 'VSTESTCD', 'VSSTRESN'],
]


In [20]:

# Transpose vital signs: one row per USUBJID with one column per test code,
# holding the first result of each.
vs_trans = pd.pivot_table(
    vs_filtered,
    values='VSSTRESN',
    index='USUBJID',
    columns='VSTESTCD',
    aggfunc='first',
).reset_index()
# BMI is derived, and the columns renamed, only when both measurements exist.
if {'HEIGHT', 'WEIGHT'}.issubset(vs_trans.columns):
    height_squared = vs_trans['HEIGHT'] ** 2
    # NOTE(review): the factor 10000 presumably converts cm^2 to m^2 - confirm
    # that VSSTRESN holds height in cm and weight in kg.
    vs_trans['BBMISI'] = (vs_trans['WEIGHT'] / height_squared) * 10000
    vs_trans = vs_trans.rename(columns={'HEIGHT': 'BHGHTSI', 'WEIGHT': 'BWGHTSI'})


In [21]:

# 5) Process Disposition dataset
#ds = pd.read_csv(f"{sdtm_path}/DS.csv")

# End of study status: disposition events in the end-of-study subcategory.
is_disposition_event = ds['DSCAT'].str.upper().eq('DISPOSITION EVENT')
is_end_of_study = ds['DSSCAT'].eq('END OF STUDY/EARLY TERMINATION')
ds_eos = ds.loc[is_disposition_event & is_end_of_study].copy()

# Anything other than COMPLETED counts as discontinued.
ds_eos['EOSSTT'] = np.where(
    ds_eos['DSDECOD'] != 'COMPLETED', 'Discontinued', 'Completed'
)
# The decoded term is carried over as the reason; the verbatim term is kept
# only when the decoded reason is OTHER.
ds_eos['DCSREAS'] = ds_eos['DSDECOD']
reason_is_other = ds_eos['DSDECOD'].eq('OTHER')
ds_eos['DCSREASP'] = np.where(reason_is_other, ds_eos['DSTERM'], None)


In [22]:

# Screen failure: disposition events in the screen-failure subcategory, all
# given the status 'Screen Failure'.
screen_failure_rows = (
    ds['DSCAT'].str.upper().eq('DISPOSITION EVENT')
    & ds['DSSCAT'].eq('SCREEN FAILURE')
)
ds_sf = ds.loc[screen_failure_rows].assign(EOSSTT='Screen Failure')


In [23]:

# Combine disposition data: stack the end-of-study and screen-failure records.
disposition_frames = [ds_eos, ds_sf]
ds_combined = pd.concat(disposition_frames, ignore_index=True)

# End of study date, taken from the date part of DSSTDTC.
ds_combined['EOSDT'] = ds_combined['DSSTDTC'].apply(parse_date_only)

# Keep only records with a status, reduced to the columns needed for ADSL.
has_status = ds_combined['EOSSTT'].notna()
ds_final = ds_combined.loc[
    has_status,
    ['USUBJID', 'EOSSTT', 'EOSDT', 'DCSREAS', 'DCSREASP'],
]


In [24]:

# 6) Process randomization data: RANDOMIZED records with a non-empty start date.
is_randomized = ds['DSDECOD'].eq('RANDOMIZED')
has_start_date = ds['DSSTDTC'].notna() & ds['DSSTDTC'].ne('')
rand = ds.loc[is_randomized & has_start_date].copy()


In [25]:

# Derive the randomization date and set both flags to 'Y', then keep only the
# rows whose date could be parsed, reduced to the columns merged into ADSL.
rand = rand.assign(
    RANDDT=rand['DSSTDTC'].apply(parse_date_only),
    RANDFL='Y',
    ITTFL='Y',
)
rand = rand.loc[rand['RANDDT'].notna(), ['USUBJID', 'RANDDT', 'RANDFL', 'ITTFL']]


In [26]:

# 7) Process Subject Visits
#sv = pd.read_csv(f"{sdtm_path}/SV.csv")
# One row per subject that has a visit with VISITNUM 15.
sv_visit15 = sv.loc[sv['VISITNUM'] == 15, ['USUBJID']].drop_duplicates()

# 8) Process FACE data for COVID flags
#face = pd.read_csv(f"{sdtm_path}/FACE.csv")
# Rows whose FAOBJ (compared in upper case) is one of the two symptoms.
covid_symptoms = ['COUGH', 'SHORTNESS OF BREATH (DIFFICULTY BREATHING)']
has_covid_symptom = face['FAOBJ'].str.upper().isin(covid_symptoms)

# COVID Day 14 flag: subjects with such a row at VISITNUM <= 15.
fa1 = (
    face.loc[has_covid_symptom & (face['VISITNUM'] <= 15), ['USUBJID']]
    .assign(COVD14FL='Y')
    .drop_duplicates()
)

# COVID Day 28 flag: subjects with such a row at VISITNUM <= 28.
fa2 = (
    face.loc[has_covid_symptom & (face['VISITNUM'] <= 28), ['USUBJID']]
    .assign(COVD28FL='Y')
    .drop_duplicates()
)


In [27]:

# 9) Merge all datasets
# ADSL needs exactly one row per subject, so every lookup table must have at
# most one row per USUBJID. validate='many_to_one' makes pandas raise a
# MergeError when that does not hold, instead of silently duplicating
# subject rows.
dm4 = dm3.merge(vs_trans, on='USUBJID', how='left', validate='many_to_one')
dm4 = dm4.merge(ds_final, on='USUBJID', how='left', validate='many_to_one')
dm4 = dm4.merge(rand, on='USUBJID', how='left', validate='many_to_one')
# indicator adds a '_visit15' column that is 'both' for subjects found in
# sv_visit15 and 'left_only' for the rest.
dm4 = dm4.merge(sv_visit15, on='USUBJID', how='left', indicator='_visit15',
                validate='many_to_one')
dm4 = dm4.merge(fa1, on='USUBJID', how='left', validate='many_to_one')
dm4 = dm4.merge(fa2, on='USUBJID', how='left', validate='many_to_one')

# Per-protocol flag logic: subject has visit 15 and is in the ITT population.
# This must run before ITTFL is filled with 'N' below.
dm4['PPROTFL'] = np.where(
    (dm4['_visit15'] == 'both') & (dm4['ITTFL'] == 'Y'), 'Y', None
)

# Handle ongoing subjects: no disposition status but a treatment start date.
dm4['EOSSTT'] = np.where(
    (dm4['EOSSTT'].isna() | (dm4['EOSSTT'] == '')) & dm4['TRTSDT'].notna(),
    'Ongoing',
    dm4['EOSSTT']
)

# Fill missing ITTFL: subjects without a randomization record get 'N'.
dm4['ITTFL'] = dm4['ITTFL'].fillna('N')


In [28]:

# 10) Select final columns, listed in output order and grouped by topic.
final_columns = (
    # Identifiers and SDTM reference dates
    ['STUDYID', 'USUBJID', 'SUBJID', 'RFSTDTC', 'RFENDTC', 'RFXSTDTC', 'RFXENDTC',
     'RFICDTC', 'RFPENDTC', 'DTHDTC', 'DTHFL', 'SITEID', 'BRTHDTC']
    # Demographics
    + ['AGE', 'AGEU', 'AGEGR1', 'SEX', 'SEXN', 'RACE', 'RACEN', 'ETHNIC', 'ETHNICN']
    # Arms and country
    + ['ARMCD', 'ARM', 'ACTARMCD', 'ACTARM', 'COUNTRY']
    # Randomization and population flags
    + ['RANDFL', 'RANDDT', 'SCRNFL', 'SAFFL', 'ITTFL', 'PPROTFL']
    # Treatment variables
    + ['TRT01P', 'TRT01PN', 'TRT01A', 'TRT01AN', 'TRTSDTM', 'TRTSDT',
       'TRTEDTM', 'TRTEDT', 'TRTDURD']
    # Disposition
    + ['EOSSTT', 'EOSDT', 'DCSREAS', 'DCSREASP']
    # Baseline measurements
    + ['BBMISI', 'BHGHTSI', 'BWGHTSI']
    # Supplemental and COVID-related variables
    + ['MFUV', 'VCYN', 'VCNUM', 'COVD14FL', 'COVD28FL', 'HOSPCOFL']
)


In [29]:

# Keep, in their listed order, only the requested columns that dm4 actually
# has. Requested columns missing from dm4 are skipped without any message.
existing_columns = list(filter(lambda name: name in dm4.columns, final_columns))


In [30]:
# Final ADSL frame: the selected columns as an independent copy of dm4.
dm5 = dm4.loc[:, existing_columns].copy()


In [31]:

# 11) Add column labels (as attributes or comments)
# Maps each ADSL variable name to its descriptive label.
column_labels = {
    'STUDYID': "Study Identifier",
    'USUBJID': "Unique Subject Identifier",
    'SUBJID': "Subject Identifier for the Study",
    'RFSTDTC': "Subject Reference Start Date/Time",
    'RFENDTC': "Subject Reference End Date/Time",
    'RFXSTDTC': "Date/Time of First Study Treatment",
    'RFXENDTC': "Date/Time of Last Study Treatment",
    'RFICDTC': "Date/Time of Informed Consent",
    'RFPENDTC': "Date/Time of End of Participation",
    'DTHDTC': "Date/Time of Death",
    'DTHFL': "Subject Death Flag",
    'SITEID': "Study Site Identifier",
    'BRTHDTC': "Date/Time of Birth",
    'AGE': "Age",
    'AGEU': "Age Units",
    'AGEGR1': "Pooled Age Group 1",
    'SEX': "Sex",
    'SEXN': "Sex (N)",
    'RACE': "Race",
    'RACEN': "Race (N)",
    'ETHNIC': "Ethnicity",
    'ETHNICN': "Ethnicity (N)",
    'ARMCD': "Planned Arm Code",
    'ARM': "Description of Planned Arm",
    'ACTARMCD': "Actual Arm Code",
    'ACTARM': "Description of Actual Arm",
    'COUNTRY': "Country",
    'RANDFL': "Randomization Flag",
    'RANDDT': "Date of Randomization",
    'SCRNFL': "Screened Population Flag",
    'SAFFL': "Safety Population Flag",
    'ITTFL': "Intent-To-Treat Population Flag",
    'PPROTFL': "Per-protocol Population Flag",
    'TRT01P': "Planned Treatment for Period 01",
    'TRT01PN': "Planned Treatment for Period 01 (N)",
    'TRT01A': "Actual Treatment for Period 01",
    'TRT01AN': "Actual Treatment for Period 01 (N)",
    'TRTSDTM': "Datetime of First Exposure to Treatment",
    'TRTSDT': "Date of First Exposure to Treatment",
    'TRTEDTM': "Datetime of Last Exposure to Treatment",
    'TRTEDT': "Date of Last Exposure to Treatment",
    # TRTDURD is derived as (TRTEDT - TRTSDT) in days + 1, so the unit is days.
    'TRTDURD': "Total Treatment Duration (days)",
    'EOSSTT': "End of Study Status",
    'EOSDT': "End of Study Date",
    'DCSREAS': "Reason for Discontinuation from Study",
    'DCSREASP': "Reason Spec for Discont from Study",
    'BBMISI': "Baseline BMI (kg/m2)",
    'BHGHTSI': "Baseline Height (cm)",
    'BWGHTSI': "Baseline Weight (kg)",
    'MFUV': "Medical Follow-up Visit",
    'VCYN': "Did the subject get a COVID-19 vaccine?",
    'VCNUM': "How many doses?",
    'COVD14FL': "Clinical recovery on Day 14 Flag",
    'COVD28FL': "Clinical recovery on Day 28 Flag",
    'HOSPCOFL': "Hospitalized due to COVID-19"
}


In [32]:

# Attach the label dictionary to the frame's metadata under 'column_labels'
# (optional).
dm5.attrs.update({'column_labels': column_labels})


In [33]:

# 12) Save the final dataset
# Reuse the ADaM folder defined with the other paths at the top of the
# notebook instead of repeating the literal, so the two cannot drift apart.
output = adam_path
output_path = f"{output}/ADSL.csv"
# index=False keeps the DataFrame's row index out of the CSV file.
dm5.to_csv(output_path, index=False)

print(f"ADSL dataset created successfully with {len(dm5)} subjects and {len(dm5.columns)} variables.")
print(f"Dataset saved to: {output_path}")

# Display basic info about the dataset
print("\nDataset Info:")
print(f"Shape: {dm5.shape}")
print(f"Columns: {list(dm5.columns)}")

ADSL dataset created successfully with 116 subjects and 54 variables.
Dataset saved to: E:/Python Clinical Course/ADAM datasets/ADSL.csv

Dataset Info:
Shape: (116, 54)
Columns: ['STUDYID', 'USUBJID', 'SUBJID', 'RFSTDTC', 'RFENDTC', 'RFXSTDTC', 'RFXENDTC', 'RFICDTC', 'RFPENDTC', 'DTHDTC', 'DTHFL', 'SITEID', 'BRTHDTC', 'AGE', 'AGEU', 'AGEGR1', 'SEX', 'SEXN', 'RACE', 'RACEN', 'ETHNIC', 'ETHNICN', 'ARMCD', 'ARM', 'ACTARMCD', 'ACTARM', 'COUNTRY', 'RANDFL', 'RANDDT', 'SCRNFL', 'SAFFL', 'ITTFL', 'PPROTFL', 'TRT01P', 'TRT01PN', 'TRT01A', 'TRT01AN', 'TRTSDTM', 'TRTSDT', 'TRTEDTM', 'TRTEDT', 'TRTDURD', 'EOSSTT', 'EOSDT', 'DCSREAS', 'DCSREASP', 'BBMISI', 'BHGHTSI', 'BWGHTSI', 'MFUV', 'VCYN', 'VCNUM', 'COVD14FL', 'COVD28FL']
