In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score
from imblearn.metrics import classification_report_imbalanced

### Import CSV and Perform Data Cleaning

In [4]:
columns = ["actual_duration", "study_type", "fda_regulated_drug",
           "fda_regulated_device", "responsible_party", "enrollemnt_count"]

target = ["status"]

In [5]:
# Load the interventional_study_only
file_path = Path('../Tables/ivs_vs_obs.csv')
df = pd.read_csv(file_path)

In [6]:
# Examine DataFrame
df.dtypes

ID                       object
title                    object
actual_duration         float64
status                   object
study_type               object
fda_regulated_drug       object
fda_regulated_device     object
responsible_party        object
enrollment_count          int64
dtype: object

In [7]:
# Drop null row .dropna()
df = df.dropna()

# Drop ID and title columns
df = df.drop(['ID', 'title'], axis=1)
df

Unnamed: 0,actual_duration,status,study_type,fda_regulated_drug,fda_regulated_device,responsible_party,enrollment_count
0,7.638356,Recruiting,Observational,No,No,Sponsor,3000
1,1.002740,Not yet recruiting,Observational,No,No,Principal Investigator,2300
2,0.693151,Unknown status,Observational,No,No,Principal Investigator,80
3,4.835616,Recruiting,Observational,No,No,Sponsor,30
4,3.936986,Unknown status,Observational,No,No,Sponsor,600
...,...,...,...,...,...,...,...
5014,2.336986,Completed,Interventional,No,No,Principal Investigator,80
5015,6.005479,Not yet recruiting,Interventional,No,No,Sponsor-Investigator,180
5016,3.750685,"Active, not recruiting",Interventional,Yes,No,Sponsor,38
5017,6.000000,Recruiting,Interventional,No,No,Sponsor-Investigator,80


In [8]:
duration_avr = df['actual_duration'].mean()
print(duration_avr)

3.3464529197675352


In [9]:
# Remove "Unknown" ("recruiting", "not yet recruiting", "Active, not recruiting", "Enrolling by invitation"?) status
unknown_stat = df['status'] != 'Unknown status'
df = df.loc[unknown_stat]

# Take out all other extras status
# not_yet_recruiting_stat = df['status'] != 'Not yet recruiting'
# df = df.loc[not_yet_recruiting_stat]

# active_not_recruiting_stat = df['status'] != 'Active, not recruiting'
# df = df.loc[active_not_recruiting_stat]

# enrolling_by_invitation_stat = df['status'] != 'Enrolling by invitation' 
# df = df.loc[enrolling_by_invitation_stat]

# recruiting_stat = df['status'] != 'Recruiting'
# df = df.loc[recruiting_stat]

df


Unnamed: 0,actual_duration,status,study_type,fda_regulated_drug,fda_regulated_device,responsible_party,enrollment_count
0,7.638356,Recruiting,Observational,No,No,Sponsor,3000
1,1.002740,Not yet recruiting,Observational,No,No,Principal Investigator,2300
3,4.835616,Recruiting,Observational,No,No,Sponsor,30
6,2.671233,Recruiting,Observational,No,No,Principal Investigator,271
7,2.712329,Recruiting,Observational,No,No,Sponsor,316
...,...,...,...,...,...,...,...
5014,2.336986,Completed,Interventional,No,No,Principal Investigator,80
5015,6.005479,Not yet recruiting,Interventional,No,No,Sponsor-Investigator,180
5016,3.750685,"Active, not recruiting",Interventional,Yes,No,Sponsor,38
5017,6.000000,Recruiting,Interventional,No,No,Sponsor-Investigator,80


In [10]:
# Convert the target Column values to low_chance and high_chance based in their values
# x = {'Completed' : 'high_chance'}
# df = df.replace(x)

x = dict.fromkeys(['Completed','Recruiting', 'Enrolling by invitation'], 'high_chance')
df = df.replace(x)

x = dict.fromkeys(['Withdrawn', 'Terminated', 'Suspended', 'Not yet recruiting', 'Active, not recruiting'], 'low_chance')
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,actual_duration,status,study_type,fda_regulated_drug,fda_regulated_device,responsible_party,enrollment_count
0,7.638356,high_chance,Observational,No,No,Sponsor,3000
1,1.00274,low_chance,Observational,No,No,Principal Investigator,2300
2,4.835616,high_chance,Observational,No,No,Sponsor,30
3,2.671233,high_chance,Observational,No,No,Principal Investigator,271
4,2.712329,high_chance,Observational,No,No,Sponsor,316


### Split the Data into Training and Testing

In [11]:
# Create our features
X = pd.get_dummies(df, columns=['study_type', 'fda_regulated_drug', 'fda_regulated_device', 
                                'responsible_party'], dtype=int).drop('status', axis=1)

# Create our target
y = df['status']
X.head()

Unnamed: 0,actual_duration,enrollment_count,study_type_Interventional,study_type_Observational,fda_regulated_drug_No,fda_regulated_drug_Yes,fda_regulated_device_No,fda_regulated_device_Yes,responsible_party_Principal Investigator,responsible_party_Sponsor,responsible_party_Sponsor-Investigator
0,7.638356,3000,0,1,1,0,1,0,0,1,0
1,1.00274,2300,0,1,1,0,1,0,1,0,0
2,4.835616,30,0,1,1,0,1,0,0,1,0
3,2.671233,271,0,1,1,0,1,0,1,0,0
4,2.712329,316,0,1,1,0,1,0,0,1,0


In [12]:
# Check count, mean, max for features
X.describe()

Unnamed: 0,actual_duration,enrollment_count,study_type_Interventional,study_type_Observational,fda_regulated_drug_No,fda_regulated_drug_Yes,fda_regulated_device_No,fda_regulated_device_Yes,responsible_party_Principal Investigator,responsible_party_Sponsor,responsible_party_Sponsor-Investigator
count,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0
mean,3.484832,5270.109,0.801554,0.198446,0.753163,0.246837,0.944062,0.055938,0.345172,0.608657,0.046171
std,2.799562,210877.2,0.398874,0.398874,0.431219,0.431219,0.229827,0.229827,0.475477,0.488105,0.209878
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.668493,36.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,2.99726,84.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,4.580822,225.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
max,50.786301,10000000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# Check the balance of our target values
y.value_counts()

status
high_chance    3091
low_chance     1414
Name: count, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
print(Counter(y_train))
print(Counter(y_test))

Counter({'high_chance': 2318, 'low_chance': 1060})
Counter({'high_chance': 773, 'low_chance': 354})


### Random Forest Classifier

In [15]:
# Create random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)

# Fit the model
rf_model = rf_model.fit(X_train, y_train)

In [16]:
# Create prediction
y_pred = rf_model.predict(X_test)

In [17]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,642,131
Actual Low chance,243,111


In [18]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Random Forest Classifier Results\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balanced Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Classifier Results

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,642,131
Actual Low chance,243,111


Accuracy Score: 0.6681455190771961
Balanced Accuracy Score: 0.5720448615344136
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

high_chance       0.73      0.83      0.31      0.77      0.51      0.27       773
 low_chance       0.46      0.31      0.83      0.37      0.51      0.25       354

avg / total       0.64      0.67      0.48      0.65      0.51      0.27      1127



### Balanced Random Forest Classifier

In [19]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Instantiate model
classifier = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fit model
classifier.fit(X_train, y_train)

In [20]:
# Calculated the balanced accuracy score
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5873805921605602

In [21]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], 
                     columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,467,306
Actual Low chance,152,202


In [22]:
# Calculate accuracy score
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Balanced Random Forest Classifier\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balanced Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Balanced Random Forest Classifier

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,467,306
Actual Low chance,152,202


Accuracy Score: 0.6681455190771961
Balanced Accuracy Score: 0.5873805921605602
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

high_chance       0.75      0.60      0.57      0.67      0.59      0.35       773
 low_chance       0.40      0.57      0.60      0.47      0.59      0.34       354

avg / total       0.64      0.59      0.58      0.61      0.59      0.35      1127



In [23]:
# List the features sorted in descending order by feature importance
features_rank = sorted(zip(classifier.feature_importances_, X.columns), reverse=True)
for feature in features_rank:
    print(f"{feature[1]}: ({feature[0]})")

actual_duration: (0.49547351744757284)
enrollment_count: (0.46086406609730457)
responsible_party_Sponsor: (0.00626956878089979)
responsible_party_Principal Investigator: (0.005654048778727081)
study_type_Observational: (0.005217625329602904)
study_type_Interventional: (0.005131899559677409)
fda_regulated_drug_Yes: (0.004992522467148797)
responsible_party_Sponsor-Investigator: (0.004216768337141882)
fda_regulated_drug_No: (0.004177228170171322)
fda_regulated_device_No: (0.004123806672999064)
fda_regulated_device_Yes: (0.0038789483587542387)


### Logistic Regression

In [24]:
# Declare model
lr_model = LogisticRegression(solver='lbfgs', random_state=42)

# fit the model
lr_model = lr_model.fit(X_train, y_train)

In [25]:
# create prediction
y_pred = lr_model.predict(X_test)

In [26]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], 
                     columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,773,0
Actual Low chance,354,0


In [27]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Logistic Regression Model Results\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balanced Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Logistic Regression Model Results

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,773,0
Actual Low chance,354,0


Accuracy Score: 0.6858917480035492
Balanced Accuracy Score: 0.5
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

high_chance       0.69      1.00      0.00      0.81      0.00      0.00       773
 low_chance       0.00      0.00      1.00      0.00      0.00      0.00       354

avg / total       0.47      0.69      0.31      0.56      0.00      0.00      1127



### Optimize model, Use resampling techniques to determine the best performing algorithm

### Oversampling

##### Naive Random Oversampling

In [28]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
# Instantiate the model
ros = RandomOverSampler(random_state=1)
# Resample the targets
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'low_chance': 2318, 'high_chance': 2318})

In [29]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit model
classifier.fit(X_resampled, y_resampled)

In [30]:
# Calculated the predictions
y_pred = classifier.predict(X_test)

In [31]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.5127118644067796

In [32]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[773,   0],
       [345,   9]], dtype=int64)

In [33]:
# Create dataframe for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,773,0
Actual low_risk,345,9


In [34]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Naive Random OverSampling Results\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balanced Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Naive Random OverSampling Results

Confusion Matrix


Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,773,0
Actual low_risk,345,9


Accuracy Score: 0.6938775510204082
Balanced Accuracy Score: 0.5127118644067796
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

high_chance       0.69      1.00      0.03      0.82      0.16      0.03       773
 low_chance       1.00      0.03      1.00      0.05      0.16      0.02       354

avg / total       0.79      0.69      0.33      0.58      0.16      0.03      1127



##### SMOTE Oversampling

In [35]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

# Instantiate the model
smote = SMOTE(random_state=1)

# Resample the targets
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'low_chance': 2318, 'high_chance': 2318})

In [36]:
# Train the Logistic Regression model using the resampled data
classifier.fit(X_resampled, y_resampled)

In [37]:
# Calculated the balanced accuracy score
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.501412429378531

In [38]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create dataframe confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,773,0
Actual low_risk,353,1


In [39]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("SMOTE Results\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balanced Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

SMOTE Results

Confusion Matrix


Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,773,0
Actual low_risk,353,1


Accuracy Score: 0.686779059449867
Balanced Accuracy Score: 0.501412429378531
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

high_chance       0.69      1.00      0.00      0.81      0.05      0.00       773
 low_chance       1.00      0.00      1.00      0.01      0.05      0.00       354

avg / total       0.78      0.69      0.32      0.56      0.05      0.00      1127



### Undersampling

In [40]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids

# Instantiate the model
Cluster_resample = ClusterCentroids(random_state=1)
X_resampled, y_resampled = Cluster_resample.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'high_chance': 1060, 'low_chance': 1060})

In [41]:
# Train the Logistic Regression model using the resampled data
classifier.fit(X_resampled, y_resampled)

In [42]:
# Calculated the balanced accuracy score
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5051253097112285

In [43]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create Dataframe from the confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,211,562
Actual low_risk,93,261


In [44]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Cluster Centroids Results\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balanced Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Cluster Centroids Results

Confusion Matrix


Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,211,562
Actual low_risk,93,261


Accuracy Score: 0.41881100266193433
Balanced Accuracy Score: 0.5051253097112285
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

high_chance       0.69      0.27      0.74      0.39      0.45      0.19       773
 low_chance       0.32      0.74      0.27      0.44      0.45      0.21       354

avg / total       0.58      0.42      0.59      0.41      0.45      0.20      1127



### Combination (Over and Under) Sampling

In [45]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'high_chance': 742, 'low_chance': 1010})

In [46]:
# Train the Logistic Regression model using the resampled data
classifier.fit(X_resampled, y_resampled)

In [47]:
# Calculated the balanced accuracy score
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5522598870056498

In [48]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,773,0
Actual low_risk,317,37


In [49]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("SMOTEENN Results\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balanced Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

SMOTEENN Results

Confusion Matrix


Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,773,0
Actual low_risk,317,37


Accuracy Score: 0.7187222715173026
Balanced Accuracy Score: 0.5522598870056498
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

high_chance       0.71      1.00      0.10      0.83      0.32      0.11       773
 low_chance       1.00      0.10      1.00      0.19      0.32      0.10       354

avg / total       0.80      0.72      0.39      0.63      0.32      0.11      1127

