In [1]:
#To remove all variables from the namespace
%reset -f

#Creating a log file to record the commands and outputs
%logstop
%logstart -t -o "E:/Python Clinical Course/ADTTE log.txt"

Logging hadn't been started.
Activating auto-logging. Current session state plus future input saved.
Filename       : E:/Python Clinical Course/ADTTE log.txt
Mode           : backup
Output logging : True
Raw input log  : False
Timestamping   : True
State          : active


In [2]:
import pandas as pd
import numpy as np
import os
import pyreadstat
from datetime import datetime
from pathlib import Path

# Define paths (adjust these to your actual paths)
adam_path = r"E:\Python Clinical Course\ADAM datasets\ADaM Datasets"
sdtm_path = r"E:\Python Clinical Course\SDTM"
raw_path = r"E:\Python Clinical Course\RAW"

In [3]:
sdtm_datasets = {}

for file in os.listdir(sdtm_path):
    if file.endswith(".sas7bdat"):
        dataset_name = file.replace(".sas7bdat", "")
        file_path = os.path.join(sdtm_path, file)
        df, meta = pyreadstat.read_sas7bdat(file_path)
        sdtm_datasets[dataset_name] = df

In [4]:
adam_datasets = {}

for file in os.listdir(adam_path):
    if file.endswith(".sas7bdat"):
        dataset_name = file.replace(".sas7bdat", "")
        file_path = os.path.join(adam_path, file)
        df, meta = pyreadstat.read_sas7bdat(file_path)
        adam_datasets[dataset_name] = df

In [5]:
# Read datasets
adsl = adam_datasets.get("adsl")
advs = adam_datasets.get("advs")

In [6]:

# Sort ADSL by USUBJID (equivalent to PROC SORT)
adsl = adsl.sort_values('USUBJID').reset_index(drop=True)

# Fever Symptom Processing
# Filter ADVS for temperature data (equivalent to WHERE clause)
advs_temp = advs[
    (advs['PARAMCD'] == 'TEMP') & 
    (advs['ADY'] >= 1)
][['USUBJID', 'ADT', 'ADY', 'AVISITN', 'AVISIT', 'PARAMCD', 'PARAM', 'TRTSDT', 'AVAL']].copy()

# Sort by USUBJID, AVAL, AVISITN
advs_temp = advs_temp.sort_values(['USUBJID', 'AVAL', 'AVISITN']).reset_index(drop=True)

# Get maximum severity record (last record per subject)
maxsev_temp = advs_temp.groupby('USUBJID').last().reset_index()

# Merge with ADSL
comb_temp = pd.merge(adsl, maxsev_temp, on='USUBJID', how='left', indicator=True)


In [15]:

# Create event indicators and values
def process_fever_events(row):
    if row['_merge'] == 'both':  # Both A and B (subject has temperature data)
        row['CNSR'] = 0
        row['AVAL'] = row['ADY']
        row['EVNTDESC'] = 'Maximum Temperature'
    else:  # Only A (subject has no temperature data)
        row['ADT'] = np.nan
        row['AVAL'] = np.nan
        row['CNSR'] = np.nan
        row['EVNTDESC'] = ''
    return row

comb_temp = comb_temp.apply(process_fever_events, axis=1)

# Add fever-specific parameters
comb1 = comb_temp.copy()
comb1['PARAMCD'] = 'FEVERSYM'
comb1['PARAM'] = 'Fever Symptom'
comb1['PARAMN'] = 1
comb1['PARCAT1'] = 'Time to Event'
comb1['PARAMTYP'] = 'DERIVED'
comb1['AVALC'] = ''
comb1['ADTF'] = ''
comb1['ANL01FL'] = 'Y'

In [16]:
# Elevated Respiratory Rate Symptom Processing
# Filter ADVS for respiratory rate data
advs_resp = advs[
    (advs['PARAMCD'] == 'RESP') & 
    (advs['ADY'] >= 1)
][['USUBJID', 'ADT', 'ADY', 'AVISITN', 'AVISIT', 'PARAMCD', 'PARAM', 'TRTSDT', 'AVAL']].copy()

# Sort by USUBJID, AVAL, AVISITN
advs_resp = advs_resp.sort_values(['USUBJID', 'AVAL', 'AVISITN']).reset_index(drop=True)

# Get maximum severity record
maxsev_resp = advs_resp.groupby('USUBJID').last().reset_index()

# Merge with ADSL
comb_resp = pd.merge(adsl, maxsev_resp, on='USUBJID', how='left', indicator=True)


In [17]:

# Create event indicators and values
def process_resp_events(row):
    if row['_merge'] == 'both':  # Subject has respiratory data
        row['CNSR'] = 0
        row['AVAL'] = row['ADY']
        row['EVNTDESC'] = 'Maximum Respiratory Rate'
    else:  # Subject has no respiratory data
        row['ADT'] = np.nan
        row['AVAL'] = np.nan
        row['CNSR'] = np.nan
        row['EVNTDESC'] = ''
    return row

comb_resp = comb_resp.apply(process_resp_events, axis=1)

# Add respiratory-specific parameters
comb2 = comb_resp.copy()
comb2['PARAMCD'] = 'ERESPSYM'
comb2['PARAM'] = 'Elevated Respiratory Rate Symptom'
comb2['PARAMN'] = 2
comb2['PARCAT1'] = 'Time to Event'
comb2['PARAMTYP'] = 'DERIVED'
comb2['AVALC'] = ''
comb2['ADTF'] = ''
comb2['ANL01FL'] = 'Y'


In [18]:

# Combine both datasets
all_data = pd.concat([comb1, comb2], ignore_index=True)

# Add additional variables
all_data['AVALU'] = 'DAYS'
all_data['STARTDTF'] = ''
all_data['CNSDTDSC'] = ''

# Sort by USUBJID
all_data = all_data.sort_values('USUBJID').reset_index(drop=True)


In [19]:

# Select final variables (equivalent to KEEP statement)
final_vars = [
    'STUDYID', 'USUBJID', 'SUBJID', 'SITEID', 'AGE', 'AGEU', 'SEX', 'RACE', 
    'ETHNIC', 'COUNTRY', 'SAFFL', 'ITTFL', 'PPROTFL', 'RANDFL', 'TRT01P', 
    'TRT01PN', 'TRT01A', 'TRT01AN', 'TRTSDT', 'TRTEDT', 'PARAM', 'PARAMN', 
    'PARAMCD', 'PARCAT1', 'PARAMTYP', 'AVAL', 'AVALU', 'STARTDT', 'STARTDTF', 
    'ADT', 'ADTF', 'ADY', 'CNSR', 'EVNTDESC', 'CNSDTDSC', 'ANL01FL'
]

In [20]:

# Keep only the required variables that exist in the dataset
available_vars = [var for var in final_vars if var in all_data.columns]
missing_vars = [var for var in final_vars if var not in all_data.columns]

In [21]:
adtte = all_data[available_vars].copy()

In [22]:
column_labels = {
'STUDYID': "Study Identifier",
'USUBJID': "Unique Subject Identifier",
'SUBJID': "Subject Identifier for the Study",
'SITEID': "Study Site Identifier",
'AGE': "Age",
'AGEU': "Age Units",
'SEX': "Sex",
'RACE': "Race",
'ETHNIC': "Ethnicity",
'COUNTRY': "Country",
'SAFFL': "Safety Population Flag",
'ITTFL': "Intent-To-Treat Population Flag",
'PPROTFL': "Per-Protocol Population Flag",
'RANDFL': "Randomized Population Flag",
'TRT01P': "Planned Treatment for Period 01",
'TRT01PN': "Planned Treatment for Period 01 (N)",
'TRT01A': "Actual Treatment for Period 01",
'TRT01AN': "Actual Treatment for Period 01 (N)",
'TRTSDT': "Date of First Exposure to Treatment",
}

In [23]:
adtte.attrs['column_labels'] = column_labels

In [16]:
for col in adtte.columns:
    label = adtte.attrs.get('column_labels', {}).get(col, '')
    print(f"{col}: {label}")

STUDYID: Study Identifier
USUBJID: Unique Subject Identifier
SUBJID: Subject Identifier for the Study
SITEID: Study Site Identifier
AGE: Age
AGEU: Age Units
SEX: Sex
RACE: Race
ETHNIC: Ethnicity
COUNTRY: Country
SAFFL: Safety Population Flag
ITTFL: Intent-To-Treat Population Flag
PPROTFL: Per-Protocol Population Flag
RANDFL: Randomized Population Flag
TRT01P: Planned Treatment for Period 01
TRT01PN: Planned Treatment for Period 01 (N)
TRT01A: Actual Treatment for Period 01
TRT01AN: Actual Treatment for Period 01 (N)
TRTEDT: 
PARAM: 
PARAMN: 
PARAMCD: 
PARCAT1: 
PARAMTYP: 
AVAL: 
AVALU: 
STARTDTF: 
ADT: 
ADTF: 
ADY: 
CNSR: 
EVNTDESC: 
CNSDTDSC: 
ANL01FL: 


In [24]:
#  Save the final dataset
output = "E:/Python Clinical Course/ADAM datasets"
output_path = f"{output}/ADTTE.csv"
adtte.to_csv(output_path, index=False)

print(f"ADAE dataset created successfully with {len(adtte)} subjects and {len(adtte.columns)} variables.")
print(f"Dataset saved to: {output_path}")

# Display basic info about the dataset
print("\nDataset Info:")
print(f"Shape: {adtte.shape}")
print(f"Columns: {list(adtte.columns)}")

ADAE dataset created successfully with 232 subjects and 34 variables.
Dataset saved to: E:/Python Clinical Course/ADAM datasets/ADTTE.csv

Dataset Info:
Shape: (232, 34)
Columns: ['STUDYID', 'USUBJID', 'SUBJID', 'SITEID', 'AGE', 'AGEU', 'SEX', 'RACE', 'ETHNIC', 'COUNTRY', 'SAFFL', 'ITTFL', 'PPROTFL', 'RANDFL', 'TRT01P', 'TRT01PN', 'TRT01A', 'TRT01AN', 'TRTEDT', 'PARAM', 'PARAMN', 'PARAMCD', 'PARCAT1', 'PARAMTYP', 'AVAL', 'AVALU', 'STARTDTF', 'ADT', 'ADTF', 'ADY', 'CNSR', 'EVNTDESC', 'CNSDTDSC', 'ANL01FL']
