In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [4]:
from collections import Counter

from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import ClusterCentroids
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

### Import CSV and Perform Data Cleaning

In [5]:
# Columns kept for modelling: the target ('status') followed by the candidate features
columns = [
    "status",
    "actual_duration",
    "arm_group_type",
    "intervention_type",
    "interventional_type_model",
    "phase",
    "fda_regulated_drug",
    "fda_regulated_device",
    "enrollment_count",
]

In [6]:
# Load the interventional-studies table from the project's Tables directory
studies_csv = Path('../Tables/updated_ivs_studies.csv')
df = pd.read_csv(studies_csv)
df

Unnamed: 0,ID,title,actual_duration,status,study_type,arm_group_type,intervention_type,interventional_type_model,phase,fda_regulated_drug,fda_regulated_device,enrollment_count
0,IRST174.22,What is the Best Interval to Screen Women 45-4...,5.991781,Recruiting,Interventional,No Intervention,Diagnostic Test,Parallel Assignment,Not Applicable,No,No,60000
1,GCO 17-2188,Increasing African Immigrant Womens Participat...,3.320548,Recruiting,Interventional,Other,Behavioral,Single Group Assignment,Not Applicable,No,No,168
2,233756,The HIFUB Study (HIFU in Breast Cancer),0.504110,Not yet recruiting,Interventional,Experimental,Device,Single Group Assignment,Not Applicable,No,No,15
3,D16196,A Pilot Multi-Institutional Study to Evaluate ...,2.868493,Completed,Interventional,Experimental,Device,Single Group Assignment,Not Applicable,No,Yes,18
4,RO1912-30902,RCT Comparing 2 Radiotherapy HypoFractionation...,2.087671,Unknown status,Interventional,Other,Radiation,Parallel Assignment,Not Applicable,No,No,166
...,...,...,...,...,...,...,...,...,...,...,...,...
3931,20201491,Culturally Tailored Nurse Coaching Study for C...,1.684932,Recruiting,Interventional,Experimental,Behavioral,Parallel Assignment,Not Applicable,No,No,98
3932,32900654326,"TPVB, PECSB, ESPB for Postmastectmy Pain",2.336986,Completed,Interventional,Placebo Comparator,Procedure,Parallel Assignment,Not Applicable,No,No,80
3933,ReDA 13176,A Randomized Phase III Trial of Stereotactic A...,6.005479,Not yet recruiting,Interventional,Active Comparator,Radiation,Parallel Assignment,Phase 3,No,No,180
3934,64121317.4.1001.5330,Nivolumab in Prostate Cancer With DNA Repair D...,3.750685,"Active, not recruiting",Interventional,Experimental,Drug,Single Group Assignment,Phase 2,Yes,No,38


In [7]:
# Discard every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

In [8]:
# Keep only the first value of the comma-separated 'arm_group_type' and
# 'intervention_type' entries.
# assign() returns a new frame instead of writing columns into the result of
# dropna(), a pattern pandas may flag with SettingWithCopyWarning.
df = df.assign(
    arm_group_type=df['arm_group_type'].str.split(',').str[0],
    intervention_type=df['intervention_type'].str.split(',').str[0],
)

In [9]:
# Preview the first rows after trimming 'arm_group_type' and 'intervention_type'
df.head()

Unnamed: 0,ID,title,actual_duration,status,study_type,arm_group_type,intervention_type,interventional_type_model,phase,fda_regulated_drug,fda_regulated_device,enrollment_count
0,IRST174.22,What is the Best Interval to Screen Women 45-4...,5.991781,Recruiting,Interventional,No Intervention,Diagnostic Test,Parallel Assignment,Not Applicable,No,No,60000
1,GCO 17-2188,Increasing African Immigrant Womens Participat...,3.320548,Recruiting,Interventional,Other,Behavioral,Single Group Assignment,Not Applicable,No,No,168
2,233756,The HIFUB Study (HIFU in Breast Cancer),0.50411,Not yet recruiting,Interventional,Experimental,Device,Single Group Assignment,Not Applicable,No,No,15
3,D16196,A Pilot Multi-Institutional Study to Evaluate ...,2.868493,Completed,Interventional,Experimental,Device,Single Group Assignment,Not Applicable,No,Yes,18
4,RO1912-30902,RCT Comparing 2 Radiotherapy HypoFractionation...,2.087671,Unknown status,Interventional,Other,Radiation,Parallel Assignment,Not Applicable,No,No,166


In [10]:
# Exclude rows whose status is 'Unknown status'
df = df[df['status'].ne('Unknown status')]

# Collapse the remaining statuses into two outcome labels
replace_value = {
    **dict.fromkeys(
        ['Completed', 'Enrolling by invitation', 'Recruiting'],
        'high_chance',
    ),
    **dict.fromkeys(
        ['Active, not recruiting', 'Not yet recruiting', 'Suspended', 'Terminated', 'Withdrawn'],
        'low_chance',
    ),
}

# Apply the mapping to the 'status' column only; values without an entry stay as they are
df = df.assign(status=df["status"].replace(replace_value))
df.head()

Unnamed: 0,ID,title,actual_duration,status,study_type,arm_group_type,intervention_type,interventional_type_model,phase,fda_regulated_drug,fda_regulated_device,enrollment_count
0,IRST174.22,What is the Best Interval to Screen Women 45-4...,5.991781,high_chance,Interventional,No Intervention,Diagnostic Test,Parallel Assignment,Not Applicable,No,No,60000
1,GCO 17-2188,Increasing African Immigrant Womens Participat...,3.320548,high_chance,Interventional,Other,Behavioral,Single Group Assignment,Not Applicable,No,No,168
2,233756,The HIFUB Study (HIFU in Breast Cancer),0.50411,low_chance,Interventional,Experimental,Device,Single Group Assignment,Not Applicable,No,No,15
3,D16196,A Pilot Multi-Institutional Study to Evaluate ...,2.868493,high_chance,Interventional,Experimental,Device,Single Group Assignment,Not Applicable,No,Yes,18
5,UMCC 2021.087,Avoiding Low-value Treatments in Older Women W...,0.969863,high_chance,Interventional,Experimental,Behavioral,Single Group Assignment,Not Applicable,No,No,40


In [11]:
# Select the modelling columns. copy() makes ml_df an independent frame, so
# column assignments on ml_df do not write into a slice of df
# (which pandas reports as SettingWithCopyWarning).
ml_df = df[columns].copy()
ml_df

Unnamed: 0,status,actual_duration,arm_group_type,intervention_type,interventional_type_model,phase,fda_regulated_drug,fda_regulated_device,enrollment_count
0,high_chance,5.991781,No Intervention,Diagnostic Test,Parallel Assignment,Not Applicable,No,No,60000
1,high_chance,3.320548,Other,Behavioral,Single Group Assignment,Not Applicable,No,No,168
2,low_chance,0.504110,Experimental,Device,Single Group Assignment,Not Applicable,No,No,15
3,high_chance,2.868493,Experimental,Device,Single Group Assignment,Not Applicable,No,Yes,18
5,high_chance,0.969863,Experimental,Behavioral,Single Group Assignment,Not Applicable,No,No,40
...,...,...,...,...,...,...,...,...,...
3931,high_chance,1.684932,Experimental,Behavioral,Parallel Assignment,Not Applicable,No,No,98
3932,high_chance,2.336986,Placebo Comparator,Procedure,Parallel Assignment,Not Applicable,No,No,80
3933,low_chance,6.005479,Active Comparator,Radiation,Parallel Assignment,Phase 3,No,No,180
3934,low_chance,3.750685,Experimental,Drug,Single Group Assignment,Phase 2,Yes,No,38


### Split the Data into Training and Testing

In [12]:
# Encode the target labels as integers. LabelEncoder numbers classes in sorted
# label order, so 'high_chance' -> 0 and 'low_chance' -> 1.
# The fitted encoder is kept so codes can be mapped back to labels with
# status_encoder.inverse_transform(...), and assign() builds a new frame rather
# than writing into a column of a sliced DataFrame.
status_encoder = LabelEncoder()
ml_df = ml_df.assign(status=status_encoder.fit_transform(ml_df['status']))
y = ml_df.status
y


0       0
1       0
2       1
3       0
5       0
       ..
3931    0
3932    0
3933    1
3934    1
3935    0
Name: status, Length: 3599, dtype: int32

In [13]:
# One-hot encode the categorical features; the target column is removed first
# so that X holds predictors only.
categorical_features = [
    'arm_group_type',
    'intervention_type',
    'interventional_type_model',
    'phase',
    'fda_regulated_drug',
    'fda_regulated_device',
]
X = pd.get_dummies(ml_df.drop(columns='status'), columns=categorical_features)
X.head()

Unnamed: 0,actual_duration,enrollment_count,arm_group_type_Active Comparator,arm_group_type_Experimental,arm_group_type_No Intervention,arm_group_type_Other,arm_group_type_Placebo Comparator,arm_group_type_Sham Comparator,intervention_type_Behavioral,intervention_type_Biological,...,phase_Phase 1,"phase_Phase 1, Phase 2",phase_Phase 2,"phase_Phase 2, Phase 3",phase_Phase 3,phase_Phase 4,fda_regulated_drug_No,fda_regulated_drug_Yes,fda_regulated_device_No,fda_regulated_device_Yes
0,5.991781,60000,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,3.320548,168,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
2,0.50411,15,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,2.868493,18,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
5,0.969863,40,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column.
# The scales differ widely: enrollment_count reaches 100000 while the dummy columns are 0/1.
X.describe()

Unnamed: 0,actual_duration,enrollment_count,arm_group_type_Active Comparator,arm_group_type_Experimental,arm_group_type_No Intervention,arm_group_type_Other,arm_group_type_Placebo Comparator,arm_group_type_Sham Comparator,intervention_type_Behavioral,intervention_type_Biological,...,phase_Phase 1,"phase_Phase 1, Phase 2",phase_Phase 2,"phase_Phase 2, Phase 3",phase_Phase 3,phase_Phase 4,fda_regulated_drug_No,fda_regulated_drug_Yes,fda_regulated_device_No,fda_regulated_device_Yes
count,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,...,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0,3599.0
mean,3.488654,496.203668,0.108363,0.810503,0.023618,0.045846,0.010558,0.001111,0.107252,0.049458,...,0.131148,0.079744,0.221728,0.01167,0.0803,0.024451,0.704084,0.295916,0.94304,0.05696
std,2.545801,4365.023155,0.310882,0.391957,0.151876,0.20918,0.102225,0.033324,0.309476,0.216853,...,0.337609,0.270935,0.415467,0.10741,0.271795,0.154467,0.456517,0.456517,0.231799,0.231799
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.732877,30.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,3.00274,72.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
75%,4.627397,194.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
max,20.079452,100000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
# Check the balance of the encoded target classes (0 = high_chance, 1 = low_chance)
y.value_counts()

0    2428
1    1171
Name: status, dtype: int64

In [16]:
# Split into train and test sets (train_test_split's default test share),
# stratified on y so both splits keep the same class ratio.
# train_test_split is already imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Class counts per split, to confirm the stratification
print(Counter(y_train))
print(Counter(y_test))

Counter({0: 1821, 1: 878})
Counter({0: 607, 1: 293})


### Balanced Random Forest Classifier

In [17]:
# Fit a balanced random forest on the imbalanced training split.
# BalancedRandomForestClassifier comes from imblearn.ensemble and is imported
# with the other dependencies at the top of the notebook.

# Instantiate model
classifier = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fit model
classifier.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [18]:
# Predict labels for the held-out test set; the scores are computed in the cells below
y_pred = classifier.predict(X_test)

In [19]:
# Cross-tabulate true vs. predicted labels and wrap the counts in a labelled frame
# (rows are actual classes, columns are predicted classes, in sorted label order)
actual_labels = ['Actual High Chance', 'Actual Low chance']
predicted_labels = ['Predicted High Chance', 'Predicted Low Chance']
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,384,223
Actual Low chance,133,160


In [20]:
# Plain and class-balanced accuracy on the test split
bal_acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

# Report: title, confusion matrix, the two scores, then the per-class imbalanced report
print("Balanced Random Forest Results\n")
print("Confusion Matrix")
display(cm_df)
print(
    f'Accuracy Score: {acc_score}',
    f'Balaned Accuracy Score: {bal_acc_score}',
    "---------------------------",
    "Classfication Report",
    sep="\n",
)
print(classification_report_imbalanced(y_test, y_pred))

Balanced Random Forest Results

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,384,223
Actual Low chance,133,160


Accuracy Score: 0.6044444444444445
Balaned Accuracy Score: 0.5893472625962182
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.74      0.63      0.55      0.68      0.59      0.35       607
          1       0.42      0.55      0.63      0.47      0.59      0.34       293

avg / total       0.64      0.60      0.57      0.61      0.59      0.35       900



In [21]:
# Pair each importance with its column name and list them from most to least important
features_rank = sorted(zip(classifier.feature_importances_, X.columns), reverse=True)
for importance, name in features_rank:
    print(f"{name}: ({importance})")

enrollment_count: (0.3749587751550118)
actual_duration: (0.3578212050145276)
intervention_type_Drug: (0.015813312390828633)
interventional_type_model_Parallel Assignment: (0.015575240661273687)
interventional_type_model_Single Group Assignment: (0.015516627002306492)
arm_group_type_Experimental: (0.01383525663002379)
phase_Phase 2: (0.013800382544429694)
arm_group_type_Active Comparator: (0.010830936385688819)
phase_Not Applicable: (0.010655991065697654)
fda_regulated_drug_Yes: (0.0101985455887157)
intervention_type_Other: (0.010168869956130026)
phase_Phase 1: (0.00989572820239383)
interventional_type_model_Sequential Assignment: (0.00968442233584655)
fda_regulated_drug_No: (0.009425127540226315)
phase_Phase 1, Phase 2: (0.00893410560046069)
intervention_type_Procedure: (0.008901273880136151)
intervention_type_Device: (0.008357931417346186)
intervention_type_Behavioral: (0.008195394124195034)
intervention_type_Biological: (0.007555944687210072)
arm_group_type_Other: (0.0074651057645970

### Optimize the model: use resampling techniques to determine the best-performing algorithm

#### Oversampling

In [22]:
# Resample the training split with RandomOverSampler; the test split is left untouched
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [23]:
# Train a logistic regression on the randomly over-sampled training data.
# NOTE(review): the features are not standardized before fitting (StandardScaler is
# imported but not used in the visible cells) and warnings are silenced at the top of
# the notebook, so a possible lbfgs convergence warning would not be shown — confirm.
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [24]:
# Predict labels for the held-out test set with the model trained on the over-sampled data
y_pred = classifier.predict(X_test)

In [25]:
# Cross-tabulate true vs. predicted labels and wrap the counts in a labelled frame
# (rows are actual classes, columns are predicted classes, in sorted label order)
actual_labels = ['Actual High Chance', 'Actual Low chance']
predicted_labels = ['Predicted High Chance', 'Predicted Low Chance']
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,355,252
Actual Low chance,138,155


In [26]:
# Plain and class-balanced accuracy on the test split
bal_acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

# Report: title, confusion matrix, the two scores, then the per-class imbalanced report
print("Random Over Sampling Results\n")
print("Confusion Matrix")
display(cm_df)
print(
    f'Accuracy Score: {acc_score}',
    f'Balaned Accuracy Score: {bal_acc_score}',
    "---------------------------",
    "Classfication Report",
    sep="\n",
)
print(classification_report_imbalanced(y_test, y_pred))

Random Over Sampling Results

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,355,252
Actual Low chance,138,155


Accuracy Score: 0.5666666666666667
Balaned Accuracy Score: 0.5569268657471704
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.72      0.58      0.53      0.65      0.56      0.31       607
          1       0.38      0.53      0.58      0.44      0.56      0.31       293

avg / total       0.61      0.57      0.55      0.58      0.56      0.31       900



#### Undersampling

In [27]:
# Resample the training split with ClusterCentroids (from imblearn.under_sampling);
# the test split is left untouched
Cluster_resample = ClusterCentroids(random_state=1)
X_resampled, y_resampled = Cluster_resample.fit_resample(X_train, y_train)

In [28]:
# Train and fit a logistic regression on the ClusterCentroids-resampled training data.
# NOTE(review): the features are not standardized before fitting (StandardScaler is
# imported but not used in the visible cells) and warnings are silenced at the top of
# the notebook, so a possible lbfgs convergence warning would not be shown — confirm.
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [29]:
# Predict labels for the held-out test set; the accuracy scores are computed in a later cell
y_pred = classifier.predict(X_test)

In [30]:
# Cross-tabulate true vs. predicted labels and wrap the counts in a labelled frame
# (rows are actual classes, columns are predicted classes, in sorted label order)
actual_labels = ['Actual High Chance', 'Actual Low chance']
predicted_labels = ['Predicted High Chance', 'Predicted Low Chance']
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,165,442
Actual Low chance,87,206


In [32]:
# Plain and class-balanced accuracy on the test split
bal_acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

# Report: title, confusion matrix, the two scores, then the per-class imbalanced report
print("Cluster Centroids Results\n")
print("Confusion Matrix")
display(cm_df)
print(
    f'Accuracy Score: {acc_score}',
    f'Balaned Accuracy Score: {bal_acc_score}',
    "---------------------------",
    "Classfication Report",
    sep="\n",
)
print(classification_report_imbalanced(y_test, y_pred))

Cluster Centroids Results

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,165,442
Actual Low chance,87,206


Accuracy Score: 0.4122222222222222
Balaned Accuracy Score: 0.48745016896165894
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.65      0.27      0.70      0.38      0.44      0.18       607
          1       0.32      0.70      0.27      0.44      0.44      0.20       293

avg / total       0.55      0.41      0.56      0.40      0.44      0.19       900



#### Combination (Over and Under) Sampling

In [33]:
# Resample the training split with SMOTEENN (from imblearn.combine, already imported
# in the notebook's import cell); the test split is left untouched.
# NOTE(review): random_state=42 here, while the other samplers and models in this
# notebook use random_state=1 — confirm this is intentional.
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)

In [34]:
# Train a fresh logistic regression on the SMOTEENN-resampled training data.
# The model is instantiated here (same settings as the other resampling sections)
# instead of re-fitting whatever estimator object `classifier` happened to hold,
# so this section states its own model explicitly.
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [35]:
# Predict labels for the held-out test set; the accuracy scores are computed in a later cell
y_pred = classifier.predict(X_test)

In [36]:
# Cross-tabulate true vs. predicted labels and wrap the counts in a labelled frame
# (rows are actual classes, columns are predicted classes, in sorted label order)
actual_labels = ['Actual High Chance', 'Actual Low chance']
predicted_labels = ['Predicted High Chance', 'Predicted Low Chance']
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,310,297
Actual Low chance,127,166


In [37]:
# Plain and class-balanced accuracy on the test split
bal_acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

# Report: title, confusion matrix, the two scores, then the per-class imbalanced report
print("SMOTEENN Results\n")
print("Confusion Matrix")
display(cm_df)
print(
    f'Accuracy Score: {acc_score}',
    f'Balaned Accuracy Score: {bal_acc_score}',
    "---------------------------",
    "Classfication Report",
    sep="\n",
)
print(classification_report_imbalanced(y_test, y_pred))

SMOTEENN Results

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,310,297
Actual Low chance,127,166


Accuracy Score: 0.5288888888888889
Balaned Accuracy Score: 0.5386306515004133
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.71      0.51      0.57      0.59      0.54      0.29       607
          1       0.36      0.57      0.51      0.44      0.54      0.29       293

avg / total       0.60      0.53      0.55      0.54      0.54      0.29       900

