In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

### Import CSV and Perform Data Cleaning

In [4]:
columns = ["actual_duration", "study_type", "fda_regulated_drug",
           "fda_regulated_device", "responsible_party", "enrollemnt_count"]

target = ["status"]

In [5]:
# Load the interventional_study_only
file_path = Path("ivs_vs_obs.csv")
df = pd.read_csv(file_path)

In [6]:
# Examine DataFrame
df.dtypes

ID                      object
title                   object
actual_duration         object
status                  object
study_type              object
fda_regulated_drug      object
fda_regulated_device    object
responsible_party       object
enrollment_count         int64
dtype: object

In [7]:
# Drop null row .dropna()
df = df.dropna()

# Drop ID and title columns
df = df.drop(['ID', 'title'], axis=1)
df

Unnamed: 0,actual_duration,status,study_type,fda_regulated_drug,fda_regulated_device,responsible_party,enrollment_count
0,2788 days,Recruiting,Observational,No,No,Sponsor,3000
1,366 days,Not yet recruiting,Observational,No,No,Principal Investigator,2300
2,253 days,Unknown status,Observational,No,No,Principal Investigator,80
3,1765 days,Recruiting,Observational,No,No,Sponsor,30
4,1437 days,Unknown status,Observational,No,No,Sponsor,600
...,...,...,...,...,...,...,...
4993,853 days,Completed,Interventional,No,No,Principal Investigator,80
4994,2192 days,Not yet recruiting,Interventional,No,No,Sponsor-Investigator,180
4995,1369 days,"Active, not recruiting",Interventional,Yes,No,Sponsor,38
4996,2190 days,Recruiting,Interventional,No,No,Sponsor-Investigator,80


In [8]:
# Take "days" out of actual_duration
df['actual_duration'] = pd.to_numeric(df['actual_duration'].astype(str).str.replace('days', ''), errors='coerce')

# Change actual_duration datatype to int
df['actual_duration'] = df['actual_duration'].astype(int)
df

Unnamed: 0,actual_duration,status,study_type,fda_regulated_drug,fda_regulated_device,responsible_party,enrollment_count
0,2788,Recruiting,Observational,No,No,Sponsor,3000
1,366,Not yet recruiting,Observational,No,No,Principal Investigator,2300
2,253,Unknown status,Observational,No,No,Principal Investigator,80
3,1765,Recruiting,Observational,No,No,Sponsor,30
4,1437,Unknown status,Observational,No,No,Sponsor,600
...,...,...,...,...,...,...,...
4993,853,Completed,Interventional,No,No,Principal Investigator,80
4994,2192,Not yet recruiting,Interventional,No,No,Sponsor-Investigator,180
4995,1369,"Active, not recruiting",Interventional,Yes,No,Sponsor,38
4996,2190,Recruiting,Interventional,No,No,Sponsor-Investigator,80


In [9]:
duration_avr = df['actual_duration'].mean()
print(duration_avr)

1222.892017828201


In [10]:
# Remove "Unknown" ("recruiting", "not yet recruiting", "Active, not recruiting", "Enrolling by invitation"?) status
unknown_stat = df['status'] != 'Unknown status'
df = df.loc[unknown_stat]

# Take out all other extras status
# not_yet_recruiting_stat = df['status'] != 'Not yet recruiting'
# df = df.loc[not_yet_recruiting_stat]

# active_not_recruiting_stat = df['status'] != 'Active, not recruiting'
# df = df.loc[active_not_recruiting_stat]

# enrolling_by_invitation_stat = df['status'] != 'Enrolling by invitation' 
# df = df.loc[enrolling_by_invitation_stat]

# recruiting_stat = df['status'] != 'Recruiting'
# df = df.loc[recruiting_stat]

df


Unnamed: 0,actual_duration,status,study_type,fda_regulated_drug,fda_regulated_device,responsible_party,enrollment_count
0,2788,Recruiting,Observational,No,No,Sponsor,3000
1,366,Not yet recruiting,Observational,No,No,Principal Investigator,2300
3,1765,Recruiting,Observational,No,No,Sponsor,30
6,975,Recruiting,Observational,No,No,Principal Investigator,271
7,990,Recruiting,Observational,No,No,Sponsor,316
...,...,...,...,...,...,...,...
4993,853,Completed,Interventional,No,No,Principal Investigator,80
4994,2192,Not yet recruiting,Interventional,No,No,Sponsor-Investigator,180
4995,1369,"Active, not recruiting",Interventional,Yes,No,Sponsor,38
4996,2190,Recruiting,Interventional,No,No,Sponsor-Investigator,80


In [11]:
# Convert the target Column values to low_chance and high_chance based in their values
# x = {'Completed' : 'high_chance'}
# df = df.replace(x)

x = dict.fromkeys(['Completed', 'Recruiting', 'Enrolling by invitation', 'Active, not recruiting'], 'high_chance')
df = df.replace(x)

x = dict.fromkeys(['Withdrawn', 'Terminated', 'Suspended', 'Not yet recruiting'], 'low_chance')
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,actual_duration,status,study_type,fda_regulated_drug,fda_regulated_device,responsible_party,enrollment_count
0,2788,high_chance,Observational,No,No,Sponsor,3000
1,366,low_chance,Observational,No,No,Principal Investigator,2300
2,1765,high_chance,Observational,No,No,Sponsor,30
3,975,high_chance,Observational,No,No,Principal Investigator,271
4,990,high_chance,Observational,No,No,Sponsor,316


### Split the Data into Training and Testing

In [12]:
# Create our features
X = pd.get_dummies(df, columns=['study_type', 'fda_regulated_drug', 'fda_regulated_device', 
                                'responsible_party'], dtype=int).drop('status', axis=1)

# Create our target
y = df['status']
X.head()

Unnamed: 0,actual_duration,enrollment_count,study_type_Interventional,study_type_Observational,fda_regulated_drug_No,fda_regulated_drug_Yes,fda_regulated_device_No,fda_regulated_device_Yes,responsible_party_Principal Investigator,responsible_party_Sponsor,responsible_party_Sponsor-Investigator
0,2788,3000,0,1,1,0,1,0,0,1,0
1,366,2300,0,1,1,0,1,0,1,0,0
2,1765,30,0,1,1,0,1,0,0,1,0
3,975,271,0,1,1,0,1,0,1,0,0
4,990,316,0,1,1,0,1,0,0,1,0


In [13]:
# Check count, mean, max for features
X.describe()

Unnamed: 0,actual_duration,enrollment_count,study_type_Interventional,study_type_Observational,fda_regulated_drug_No,fda_regulated_drug_Yes,fda_regulated_device_No,fda_regulated_device_Yes,responsible_party_Principal Investigator,responsible_party_Sponsor,responsible_party_Sponsor-Investigator
count,4481.0,4481.0,4481.0,4481.0,4481.0,4481.0,4481.0,4481.0,4481.0,4481.0,4481.0
mean,1274.214907,5296.923,0.80116,0.19884,0.752287,0.247713,0.943763,0.056237,0.344343,0.609685,0.045972
std,1025.855744,211441.0,0.399172,0.399172,0.431732,0.431732,0.230405,0.230405,0.475207,0.487875,0.209448
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,610.0,36.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,1094.0,84.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,1666.0,225.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
max,18537.0,10000000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# Check the balance of our target values
y.value_counts()

status
high_chance    3586
low_chance      895
Name: count, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
print(Counter(y_train))
print(Counter(y_test))

Counter({'high_chance': 2689, 'low_chance': 671})
Counter({'high_chance': 897, 'low_chance': 224})


### Balanced Random Forest Classifier

In [16]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Instantiate model
classifier = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fit model
classifier.fit(X_train, y_train)

In [17]:
# Calculated the balanced accuracy score
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6118012422360248

In [18]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[585, 312],
       [ 96, 128]], dtype=int64)

In [19]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual low_chance", "Actual high_chance"], columns=["Predicted low_chance", "Predicted high_chance"])
cm_df

Unnamed: 0,Predicted low_chance,Predicted high_chance
Actual low_chance,585,312
Actual high_chance,96,128


In [20]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

high_chance       0.86      0.65      0.57      0.74      0.61      0.38       897
 low_chance       0.29      0.57      0.65      0.39      0.61      0.37       224

avg / total       0.75      0.64      0.59      0.67      0.61      0.37      1121



In [21]:
# List the features sorted in descending order by feature importance
features_rank = sorted(zip(classifier.feature_importances_, X.columns), reverse=True)
for feature in features_rank:
    print(f"{feature[1]}: ({feature[0]})")

enrollment_count: (0.4688413879507043)
actual_duration: (0.46397876682446443)
responsible_party_Principal Investigator: (0.010502222909438039)
responsible_party_Sponsor: (0.009944030791559307)
fda_regulated_drug_Yes: (0.008408128473399279)
fda_regulated_drug_No: (0.00839357349153762)
study_type_Observational: (0.007089234845942614)
study_type_Interventional: (0.0065251463283048285)
fda_regulated_device_Yes: (0.005983615873617721)
fda_regulated_device_No: (0.005649652785452187)
responsible_party_Sponsor-Investigator: (0.004684239725579856)


### Optimize model, Use resampling techniques to determine the best performing algorithm

#### Oversampling

#### Undersampling

#### Combination (Over and Under) Sampling