In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import ClusterCentroids

### Import CSV and Perform Data Cleaning

In [69]:
columns = ["status", "actual_duration", "arm_group_type", "intervention_type",
          "interventional_type_model", "phase", "fda_regulated_drug", "fda_regulated_device", "enrollment_count"]

In [70]:
# Load the interventional_study_o|nly
df = pd.read_csv('Tables/updated_ivs_studies.csv')
df

Unnamed: 0,ID,title,actual_duration,status,study_type,arm_group_type,intervention_type,interventional_type_model,phase,fda_regulated_drug,fda_regulated_device,enrollment_count
0,IRST174.22,What is the Best Interval to Screen Women 45-4...,2187 days,Recruiting,Interventional,"No Intervention, Experimental, Experimental",Diagnostic Test,Parallel Assignment,Not Applicable,No,No,60000
1,GCO 17-2188,Increasing African Immigrant Womens Participat...,1212 days,Recruiting,Interventional,Other,Behavioral,Single Group Assignment,Not Applicable,No,No,168
2,233756,The HIFUB Study (HIFU in Breast Cancer),184 days,Not yet recruiting,Interventional,Experimental,Device,Single Group Assignment,Not Applicable,No,No,15
3,D16196,A Pilot Multi-Institutional Study to Evaluate ...,1047 days,Completed,Interventional,Experimental,Device,Single Group Assignment,Not Applicable,No,Yes,18
4,RO1912-30902,RCT Comparing 2 Radiotherapy HypoFractionation...,762 days,Unknown status,Interventional,"Other, Experimental",Radiation,Parallel Assignment,Not Applicable,No,No,166
...,...,...,...,...,...,...,...,...,...,...,...,...
3913,20201491,Culturally Tailored Nurse Coaching Study for C...,615 days,Recruiting,Interventional,"Experimental, Active Comparator","Behavioral, Behavioral",Parallel Assignment,Not Applicable,No,No,98
3914,32900654326,"TPVB, PECSB, ESPB for Postmastectmy Pain",853 days,Completed,Interventional,"Placebo Comparator, Active Comparator, Active ...","Procedure, Drug",Parallel Assignment,Not Applicable,No,No,80
3915,ReDA 13176,A Randomized Phase III Trial of Stereotactic A...,2192 days,Not yet recruiting,Interventional,"Active Comparator, Experimental","Radiation, Drug, Drug, Drug, Drug, Other, Radi...",Parallel Assignment,Phase 3,No,No,180
3916,64121317.4.1001.5330,Nivolumab in Prostate Cancer With DNA Repair D...,1369 days,"Active, not recruiting",Interventional,Experimental,Drug,Single Group Assignment,Phase 2,Yes,No,38


In [71]:
# Drop null rows
df = df.dropna()

In [72]:
# keep only the first values for 'arm group type' and 'intervention type'
df['arm_group_type'] = df['arm_group_type'].str.split(',').str[0]
df['intervention_type'] = df['intervention_type'].str.split(',').str[0]

In [73]:
df.head()

Unnamed: 0,ID,title,actual_duration,status,study_type,arm_group_type,intervention_type,interventional_type_model,phase,fda_regulated_drug,fda_regulated_device,enrollment_count
0,IRST174.22,What is the Best Interval to Screen Women 45-4...,2187 days,Recruiting,Interventional,No Intervention,Diagnostic Test,Parallel Assignment,Not Applicable,No,No,60000
1,GCO 17-2188,Increasing African Immigrant Womens Participat...,1212 days,Recruiting,Interventional,Other,Behavioral,Single Group Assignment,Not Applicable,No,No,168
2,233756,The HIFUB Study (HIFU in Breast Cancer),184 days,Not yet recruiting,Interventional,Experimental,Device,Single Group Assignment,Not Applicable,No,No,15
3,D16196,A Pilot Multi-Institutional Study to Evaluate ...,1047 days,Completed,Interventional,Experimental,Device,Single Group Assignment,Not Applicable,No,Yes,18
4,RO1912-30902,RCT Comparing 2 Radiotherapy HypoFractionation...,762 days,Unknown status,Interventional,Other,Radiation,Parallel Assignment,Not Applicable,No,No,166


In [74]:
# Drop rows with 'unknown status'
df = df.loc[df['status'] != 'Unknown status']

# categorize the status
replace_value = {
    'Completed':'high_chance',
    'Active, not recruiting': 'low_chance',
    'Enrolling by invitation': 'high_chance',
    'Not yet recruiting': 'low_chance',
    'Recruiting': 'high_chance',
    'Suspended': 'low_chance',
    'Terminated': 'low_chance',
    'Withdrawn': 'low_chance'
} 

df = df.replace({"status": replace_value})
df.head()

Unnamed: 0,ID,title,actual_duration,status,study_type,arm_group_type,intervention_type,interventional_type_model,phase,fda_regulated_drug,fda_regulated_device,enrollment_count
0,IRST174.22,What is the Best Interval to Screen Women 45-4...,2187 days,high_chance,Interventional,No Intervention,Diagnostic Test,Parallel Assignment,Not Applicable,No,No,60000
1,GCO 17-2188,Increasing African Immigrant Womens Participat...,1212 days,high_chance,Interventional,Other,Behavioral,Single Group Assignment,Not Applicable,No,No,168
2,233756,The HIFUB Study (HIFU in Breast Cancer),184 days,low_chance,Interventional,Experimental,Device,Single Group Assignment,Not Applicable,No,No,15
3,D16196,A Pilot Multi-Institutional Study to Evaluate ...,1047 days,high_chance,Interventional,Experimental,Device,Single Group Assignment,Not Applicable,No,Yes,18
5,UMCC 2021.087,Avoiding Low-value Treatments in Older Women W...,354 days,high_chance,Interventional,Experimental,Behavioral,Single Group Assignment,Not Applicable,No,No,40


In [91]:
###### TO BE REMOVED ONCE DATA IS UPDATED

df['actual_duration'] = pd.to_numeric(df['actual_duration'].astype(str).str.replace('days', ''), errors='coerce')

In [92]:
ml_df = df[columns]
ml_df

Unnamed: 0,status,actual_duration,arm_group_type,intervention_type,interventional_type_model,phase,fda_regulated_drug,fda_regulated_device,enrollment_count
0,high_chance,2187,No Intervention,Diagnostic Test,Parallel Assignment,Not Applicable,No,No,60000
1,high_chance,1212,Other,Behavioral,Single Group Assignment,Not Applicable,No,No,168
2,low_chance,184,Experimental,Device,Single Group Assignment,Not Applicable,No,No,15
3,high_chance,1047,Experimental,Device,Single Group Assignment,Not Applicable,No,Yes,18
5,high_chance,354,Experimental,Behavioral,Single Group Assignment,Not Applicable,No,No,40
...,...,...,...,...,...,...,...,...,...
3913,high_chance,615,Experimental,Behavioral,Parallel Assignment,Not Applicable,No,No,98
3914,high_chance,853,Placebo Comparator,Procedure,Parallel Assignment,Not Applicable,No,No,80
3915,low_chance,2192,Active Comparator,Radiation,Parallel Assignment,Phase 3,No,No,180
3916,low_chance,1369,Experimental,Drug,Single Group Assignment,Phase 2,Yes,No,38


### Split the Data into Training and Testing

In [101]:
# Encode all columns into numbers to run through machine learning
ml_df['status'] = LabelEncoder().fit_transform(ml_df['status'])
y = ml_df.status
y


0       0
1       0
2       1
3       0
5       0
       ..
3913    0
3914    0
3915    1
3916    1
3917    0
Name: status, Length: 3578, dtype: int64

In [None]:
# prepare columns for machine learning through get_dummies
X = pd.get_dummies(ml_df,columns=['arm_group_type', 'intervention_type', 'interventional_type_model', 'phase', 'fda_regulated_drug', 'fda_regulated_device'])
X = X.drop('status', axis=1)
X.head()

In [94]:
# Check count, mean, max for features
X.describe()

Unnamed: 0,actual_duration,enrollment_count,arm_group_type_Active Comparator,arm_group_type_Experimental,arm_group_type_No Intervention,arm_group_type_Other,arm_group_type_Placebo Comparator,arm_group_type_Sham Comparator,intervention_type_Behavioral,intervention_type_Biological,...,phase_Phase 1,"phase_Phase 1, Phase 2",phase_Phase 2,"phase_Phase 2, Phase 3",phase_Phase 3,phase_Phase 4,fda_regulated_drug_No,fda_regulated_drug_Yes,fda_regulated_device_No,fda_regulated_device_Yes
count,3578.0,3578.0,3578.0,3578.0,3578.0,3578.0,3578.0,3578.0,3578.0,3578.0,...,3578.0,3578.0,3578.0,3578.0,3578.0,3578.0,3578.0,3578.0,3578.0,3578.0
mean,1276.402739,497.11431,0.108161,0.810509,0.023477,0.046115,0.01062,0.001118,0.107323,0.050028,...,0.131917,0.079374,0.222191,0.011738,0.080492,0.024595,0.702907,0.297093,0.942705,0.057295
std,934.022675,4377.375042,0.310627,0.391953,0.151433,0.209764,0.102521,0.033422,0.309566,0.218033,...,0.338448,0.270359,0.415777,0.107721,0.272091,0.154908,0.457042,0.457042,0.232437,0.232437
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,634.0,31.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,1096.0,72.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
75%,1686.75,194.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
max,7329.0,100000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [95]:
# Check the balance of our target values
y.value_counts()

0    2418
1    1160
Name: status, dtype: int64

In [96]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
print(Counter(y_train))
print(Counter(y_test))

Counter({0: 1813, 1: 870})
Counter({0: 605, 1: 290})


### Balanced Random Forest Classifier

In [97]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Instantiate model
classifier = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fit model
classifier.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [98]:
# Calculated the balanced accuracy score
y_pred = classifier.predict(X_test)

0.5858506697064692

In [102]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,373,232
Actual Low chance,129,161


In [103]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,373,232
Actual Low chance,129,161


Accuracy Score: 0.5966480446927375
Balaned Accuracy Score: 0.5858506697064692
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.74      0.62      0.56      0.67      0.59      0.34       605
          1       0.41      0.56      0.62      0.47      0.59      0.34       290

avg / total       0.64      0.60      0.58      0.61      0.59      0.34       895



In [104]:
# List the features sorted in descending order by feature importance
features_rank = sorted(zip(classifier.feature_importances_, X.columns), reverse=True)
for feature in features_rank:
    print(f"{feature[1]}: ({feature[0]})")

enrollment_count: (0.3800819849304627)
actual_duration: (0.35583775234713466)
interventional_type_model_Parallel Assignment: (0.01594048548602891)
arm_group_type_Experimental: (0.014382396755400043)
intervention_type_Drug: (0.014326618788204062)
phase_Phase 2: (0.013900150884593586)
interventional_type_model_Single Group Assignment: (0.013734239433313283)
phase_Not Applicable: (0.011042349586364691)
arm_group_type_Active Comparator: (0.010346862546360388)
phase_Phase 1: (0.01026994312731236)
fda_regulated_drug_No: (0.010170828352112388)
interventional_type_model_Sequential Assignment: (0.01014914391711945)
fda_regulated_drug_Yes: (0.010001061258873324)
intervention_type_Procedure: (0.00918976563139805)
intervention_type_Other: (0.008994692966171246)
phase_Phase 1, Phase 2: (0.008955846003487655)
intervention_type_Device: (0.008115561475161402)
arm_group_type_Other: (0.00782425324871133)
intervention_type_Radiation: (0.007786619040799956)
intervention_type_Biological: (0.007605271414263

### Optimize model, Use resampling techniques to determine the best performing algorithm

#### Oversampling

In [105]:
# Resample with RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [106]:
# Train through logsitc regression
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# fit model
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [107]:
# calculate predicions
y_pred = classifier.predict(X_test)

In [109]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,0,605
Actual Low chance,1,289


In [110]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,0,605
Actual Low chance,1,289


Accuracy Score: 0.3229050279329609
Balaned Accuracy Score: 0.4982758620689655
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00       605
          1       0.32      1.00      0.00      0.49      0.00      0.00       290

avg / total       0.10      0.32      0.67      0.16      0.00      0.00       895



#### Undersampling

In [112]:
# Resample the data using the ClusterCentroids resampler
Cluster_resample = ClusterCentroids(random_state=1)
X_resampled, y_resampled = Cluster_resample.fit_resample(X_train, y_train)

In [113]:
# Train & fit the model through logsitc regression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [114]:
# Calculate predictions and the balanced accuracy score
y_pred = classifier.predict(X_test)

In [115]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,136,469
Actual Low chance,64,226


In [116]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,136,469
Actual Low chance,64,226


Accuracy Score: 0.4044692737430168
Balaned Accuracy Score: 0.5020518666286692
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.68      0.22      0.78      0.34      0.42      0.17       605
          1       0.33      0.78      0.22      0.46      0.42      0.18       290

avg / total       0.57      0.40      0.60      0.38      0.42      0.17       895



#### Combination (Over and Under) Sampling

In [117]:
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)

In [118]:
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [119]:
# calculate prediction & balanced accuracy score
y_pred = classifier.predict(X_test)

In [120]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,391,214
Actual Low chance,153,137


In [121]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,391,214
Actual Low chance,153,137


Accuracy Score: 0.5899441340782123
Balaned Accuracy Score: 0.5593473924194927
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.72      0.65      0.47      0.68      0.55      0.31       605
          1       0.39      0.47      0.65      0.43      0.55      0.30       290

avg / total       0.61      0.59      0.53      0.60      0.55      0.31       895

