In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("unique_trial_data.csv")
df.head()

Unnamed: 0,nct_id,phase,condition,intervention_type,study_design,sponsor_type,enrollment,enrollment_type,status,gender,location,start_date,completion_date,masking
0,NCT00000102,PHASE1/PHASE2,Congenital Adrenal Hyperplasia,DRUG,,NIH,,,COMPLETED,ALL,South Carolina,,,DOUBLE
1,NCT00000104,,Lead Poisoning,PROCEDURE,,FED,,,COMPLETED,FEMALE,Minnesota,,,
2,NCT00000105,,Cancer,BIOLOGICAL,,OTHER,112.0,ACTUAL,TERMINATED,ALL,Minnesota,2002-07-31,2012-03-31,
3,NCT00000106,,Rheumatic Diseases,DEVICE,RANDOMIZED,NIH,,,UNKNOWN,ALL,Wisconsin,,,
4,NCT00000107,,"Heart Defects, Congenital",,,NIH,,,COMPLETED,ALL,Vermont,,,


In [3]:
df = df.drop('enrollment_type', axis=1)
df = df.drop('nct_id', axis=1)

In [4]:
df['study_design'].unique()

array([nan, 'RANDOMIZED', 'NON_RANDOMIZED'], dtype=object)

In [5]:
df['study_design'].value_counts()

study_design
RANDOMIZED        154593
NON_RANDOMIZED     27401
Name: count, dtype: int64

In [6]:
df.dropna(subset=['phase'], inplace=True)


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91084 entries, 9 to 299998
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   phase              91084 non-null  object 
 1   condition          91084 non-null  object 
 2   intervention_type  91084 non-null  object 
 3   study_design       91084 non-null  object 
 4   sponsor_type       91084 non-null  object 
 5   enrollment         91084 non-null  float64
 6   status             91084 non-null  object 
 7   gender             91084 non-null  object 
 8   location           91084 non-null  object 
 9   masking            91084 non-null  object 
 10  duration           91084 non-null  float64
dtypes: float64(2), object(9)
memory usage: 8.3+ MB


In [8]:
df = df[df['status'].isin(['COMPLETED', 'TERMINATED', 'WITHDRAWN'])]

In [19]:
df['status'].value_counts()

status
COMPLETED     76797
TERMINATED    10672
WITHDRAWN      3615
Name: count, dtype: int64

In [None]:
df.dropna(subset=['study_design'], inplace=True)

In [None]:
df.dropna(subset=['masking'], inplace=True)

In [18]:
df.dropna(subset=['gender'], inplace=True)

In [21]:
df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
df['completion_date'] = pd.to_datetime(df['completion_date'], errors='coerce')
df['duration'] = (df['completion_date'] - df['start_date']).dt.days

In [23]:
mean_enrollment = df['enrollment'].mean()
mean_duration = df['duration'].mean()
df['enrollment'] = df['enrollment'].fillna(mean_enrollment)
df['duration'] = df['duration'].fillna(mean_duration)

In [26]:
df['location'] = df['location'].fillna("other")

In [28]:
df = df.drop('start_date', axis=1)
df = df.drop('completion_date', axis=1)

In [30]:
completed_sample = df[df['status'] == 'COMPLETED'].sample(n=20000, random_state=42)

terminated_all = df[df['status'] == 'TERMINATED']
withdrawn_all = df[df['status'] == 'WITHDRAWN']

df_balanced = pd.concat([completed_sample, terminated_all, withdrawn_all], ignore_index=True)

In [33]:
df_balanced['final_status'] = np.where(df_balanced['status'] == 'COMPLETED', 1, 0)

In [34]:
df_balanced = df_balanced.drop('status', axis=1)

In [35]:
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34287 entries, 0 to 34286
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   phase              34287 non-null  object 
 1   condition          34287 non-null  object 
 2   intervention_type  34287 non-null  object 
 3   study_design       34287 non-null  object 
 4   sponsor_type       34287 non-null  object 
 5   enrollment         34287 non-null  float64
 6   gender             34287 non-null  object 
 7   location           34287 non-null  object 
 8   masking            34287 non-null  object 
 9   duration           34287 non-null  float64
 10  final_status       34287 non-null  int64  
dtypes: float64(2), int64(1), object(8)
memory usage: 2.9+ MB


In [36]:
categorical_cols = [
    'phase', 'condition', 'intervention_type', 'study_design',
    'sponsor_type', 'gender', 'location', 'masking'
]

# Dictionaries to store the mapping for each column
label_mappings = {}

for col in categorical_cols:
    # Get unique values and sort them for consistent mapping
    unique_vals = sorted(df_balanced[col].unique())
    mapping = {val: idx for idx, val in enumerate(unique_vals)}
    # Save mapping
    label_mappings[col] = mapping
    # Apply mapping to the column
    df_balanced[col + '_encoded'] = df_balanced[col].map(mapping)

# Display the mapping for each column
for col, mapping in label_mappings.items():
    print(f"Mapping for column '{col}':")
    for k, v in mapping.items():
        print(f"  {k}: {v}")
    print()

# Now df has new columns like 'phase_encoded', 'condition_encoded', etc.


Mapping for column 'phase':
  EARLY_PHASE1: 0
  PHASE1: 1
  PHASE1/PHASE2: 2
  PHASE2: 3
  PHASE2/PHASE3: 4
  PHASE3: 5
  PHASE4: 6

Mapping for column 'condition':
  "Wet" Age-Related Macular Degeneration: 0
  - HIV: 1
  - Medico-Economic Aspects (Evaluation of Medical Costs Related to the Three Strategies and Evaluation of Cost/Efficacy): 2
  2009 H1N1 Influenza: 3
  3rd Line GIST: 4
  50 % Reduction of Delayed Gastric Emptying: 5
  A Total of 234 Patients With Acute Coronary Syndrome Who Will Undergo OPCAB.: 6
  ACE Inhibitor Induced Angioedema: 7
  ACE Inhibitor-associated Angioedema: 8
  ACL - Anterior Cruciate Ligament Rupture: 9
  ACL Repair: 10
  ACL Surgery: 11
  ACOS (Fixed Airflow Obstruction and Elevated Eosinophils): 12
  ADHD: 13
  ADHD - Inattentive Type: 14
  ADHD With Sleep Onset Insomnia: 15
  ADPKD: 16
  AIDS: 17
  AIDS Related Lymphoma: 18
  AIDS Vaccines: 19
  AIDS-Related Plasmablastic Lymphoma: 20
  AIDS-related Kaposi Sarcoma: 21
  AIDS-related Kaposi's Sarcoma:

In [38]:
df_balanced.head()

Unnamed: 0,phase,condition,intervention_type,study_design,sponsor_type,enrollment,gender,location,masking,duration,final_status,phase_encoded,condition_encoded,intervention_type_encoded,study_design_encoded,sponsor_type_encoded,gender_encoded,location_encoded,masking_encoded
0,PHASE2,Acute Myeloid Leukemia (AML),DRUG,RANDOMIZED,INDUSTRY,276.0,ALL,other,QUADRUPLE,2009.0,1,3,238,6,1,3,0,1179,2
1,PHASE1,Neoplasms,DRUG,RANDOMIZED,INDUSTRY,53.0,ALL,other,NONE,992.442074,1,1,5511,6,1,3,0,1179,1
2,PHASE3,"Carcinoma, Hepatocellular",DRUG,RANDOMIZED,INDUSTRY,371.0,ALL,other,NONE,1096.0,1,5,1405,6,1,3,0,1179,1
3,PHASE2,HIV,DRUG,RANDOMIZED,OTHER,20.0,ALL,Illinois,NONE,974.0,1,3,3424,6,1,6,0,533,1
4,PHASE1/PHASE2,Fallopian Tube Cancer,DRUG,NON_RANDOMIZED,NIH,58.0,FEMALE,Texas,NONE,2130.0,1,2,3001,6,0,5,1,1077,1


In [40]:
from sklearn.preprocessing import StandardScaler

# Select the columns to standardize
scaler = StandardScaler()
df_balanced[['enrollment_standardized', 'duration_standardized']] = scaler.fit_transform(df_balanced[['enrollment', 'duration']])


In [41]:
df_balanced.head()

Unnamed: 0,phase,condition,intervention_type,study_design,sponsor_type,enrollment,gender,location,masking,duration,...,phase_encoded,condition_encoded,intervention_type_encoded,study_design_encoded,sponsor_type_encoded,gender_encoded,location_encoded,masking_encoded,enrollment_standardized,duration_standardized
0,PHASE2,Acute Myeloid Leukemia (AML),DRUG,RANDOMIZED,INDUSTRY,276.0,ALL,other,QUADRUPLE,2009.0,...,3,238,6,1,3,0,1179,2,0.014642,1.175545
1,PHASE1,Neoplasms,DRUG,RANDOMIZED,INDUSTRY,53.0,ALL,other,NONE,992.442074,...,1,5511,6,1,3,0,1179,1,-0.096348,-0.002716
2,PHASE3,"Carcinoma, Hepatocellular",DRUG,RANDOMIZED,INDUSTRY,371.0,ALL,other,NONE,1096.0,...,5,1405,6,1,3,0,1179,1,0.061925,0.117315
3,PHASE2,HIV,DRUG,RANDOMIZED,OTHER,20.0,ALL,Illinois,NONE,974.0,...,3,3424,6,1,6,0,533,1,-0.112772,-0.024091
4,PHASE1/PHASE2,Fallopian Tube Cancer,DRUG,NON_RANDOMIZED,NIH,58.0,FEMALE,Texas,NONE,2130.0,...,2,3001,6,0,5,1,1077,1,-0.093859,1.315792


In [42]:
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34287 entries, 0 to 34286
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   phase                      34287 non-null  object 
 1   condition                  34287 non-null  object 
 2   intervention_type          34287 non-null  object 
 3   study_design               34287 non-null  object 
 4   sponsor_type               34287 non-null  object 
 5   enrollment                 34287 non-null  float64
 6   gender                     34287 non-null  object 
 7   location                   34287 non-null  object 
 8   masking                    34287 non-null  object 
 9   duration                   34287 non-null  float64
 10  final_status               34287 non-null  int64  
 11  phase_encoded              34287 non-null  int64  
 12  condition_encoded          34287 non-null  int64  
 13  intervention_type_encoded  34287 non-null  int

In [43]:
columns_to_drop = [
    'phase', 'condition', 'intervention_type', 'study_design',
    'sponsor_type', 'enrollment_standardized', 'gender', 'location', 'masking', 'duration_standardized'
]

df_balanced = df_balanced.drop(columns=columns_to_drop)


In [44]:
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34287 entries, 0 to 34286
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   enrollment                 34287 non-null  float64
 1   duration                   34287 non-null  float64
 2   final_status               34287 non-null  int64  
 3   phase_encoded              34287 non-null  int64  
 4   condition_encoded          34287 non-null  int64  
 5   intervention_type_encoded  34287 non-null  int64  
 6   study_design_encoded       34287 non-null  int64  
 7   sponsor_type_encoded       34287 non-null  int64  
 8   gender_encoded             34287 non-null  int64  
 9   location_encoded           34287 non-null  int64  
 10  masking_encoded            34287 non-null  int64  
dtypes: float64(2), int64(9)
memory usage: 2.9 MB


In [45]:
# Get a list of columns, excluding 'final_status'
cols = [col for col in df_balanced.columns if col != 'final_status']
# Add 'final_status' at the end
cols.append('final_status')
# Reorder the DataFrame
df_balanced = df_balanced[cols]

In [46]:
df_balanced.to_csv('trainable.csv', index=False)

In [47]:
df_balanced['final_status'].value_counts()

final_status
1    20000
0    14287
Name: count, dtype: int64