In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

In [3]:
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import ClusterCentroids


### Import CSV and Perform Data Cleaning

In [4]:
# imprt CSV as DataFrame
obs_df = pd.read_csv('../Tables/updated_obs_studies.csv')

In [5]:
# Drop 'target_duration' column due to too many null values
# Drop null rows

obs_df = obs_df.drop(['target_duration'], axis=1)
obs_df = obs_df.dropna()
obs_df.head()

Unnamed: 0,ID,title,actual_duration,status,study_type,observational_study_model,sampling_method,fda_regulated_drug,fda_regulated_device,enrollment_count
0,BTX-BCI-016-PRT,Breast Cancer Index (BCI) Registry,7.638356,Recruiting,Observational,Case-Only,Non-Probability Sample,No,No,3000
1,2018-TJ-BCD,Diagnosis Value of SEMA4C in Breast Cancer,1.00274,Not yet recruiting,Observational,Case-Control,Non-Probability Sample,No,No,2300
2,Breast cancer,Role of Sorcin and Annexin A3 in Breast Cancer...,0.693151,Unknown status,Observational,Case-Control,Probability Sample,No,No,80
3,BC-BOMET,Evaluation of Prognostic Factors: From Breast ...,4.835616,Recruiting,Observational,Case-Control,Non-Probability Sample,No,No,30
4,241391,A Study to Identify Breast Cancer (IDBC),3.936986,Unknown status,Observational,Cohort,Non-Probability Sample,No,No,600


In [6]:
# Check distribution of each status
obs_df.groupby('status').size()

status
Active, not recruiting      75
Completed                  186
Enrolling by invitation     22
Not yet recruiting         126
Recruiting                 446
Suspended                    7
Terminated                  16
Unknown status             127
Withdrawn                   16
dtype: int64

In [7]:
# Drop rows with 'unknown status'
obs_df = obs_df.loc[obs_df['status'] != 'Unknown status']

# categorize the status
replace_value = {
    'Completed':'high_chance',
    'Active, not recruiting': 'low_chance',
    'Enrolling by invitation': 'high_chance',
    'Not yet recruiting': 'low_chance',
    'Recruiting': 'high_chance',
    'Suspended': 'low_chance',
    'Terminated': 'low_chance',
    'Withdrawn': 'low_chance'
} 

obs_df = obs_df.replace({"status": replace_value})
obs_df.head()


Unnamed: 0,ID,title,actual_duration,status,study_type,observational_study_model,sampling_method,fda_regulated_drug,fda_regulated_device,enrollment_count
0,BTX-BCI-016-PRT,Breast Cancer Index (BCI) Registry,7.638356,high_chance,Observational,Case-Only,Non-Probability Sample,No,No,3000
1,2018-TJ-BCD,Diagnosis Value of SEMA4C in Breast Cancer,1.00274,low_chance,Observational,Case-Control,Non-Probability Sample,No,No,2300
3,BC-BOMET,Evaluation of Prognostic Factors: From Breast ...,4.835616,high_chance,Observational,Case-Control,Non-Probability Sample,No,No,30
6,FH-Risk 2.0 Research Protocol,FH-Risk 2.0: Updating Breast Cancer Risk Estim...,2.671233,high_chance,Observational,Other,Non-Probability Sample,No,No,271
7,ID-RPSBC-01-20201012,Genetic and Non-Genetic Breast Cancer Risk Pre...,2.712329,high_chance,Observational,Case-Control,Probability Sample,No,No,316


In [8]:
# Create DF for machine learning - removes unnecessary columns
obs_ml_df = obs_df[['status', 'enrollment_count', 'actual_duration', 'observational_study_model', 'sampling_method', 'fda_regulated_drug', 'fda_regulated_device', ]]

In [9]:
# Encode all columns into numbers to run through machine learning
obs_ml_df['status'] = LabelEncoder().fit_transform(obs_ml_df['status'])
obs_ml_df = pd.get_dummies(obs_ml_df,columns=['observational_study_model', 'sampling_method', 'fda_regulated_drug', 'fda_regulated_device'])
obs_ml_df.head()

Unnamed: 0,status,enrollment_count,actual_duration,observational_study_model_Case-Control,observational_study_model_Case-Crossover,observational_study_model_Case-Only,observational_study_model_Cohort,observational_study_model_Ecologic or Community,observational_study_model_Family-Based,observational_study_model_Other,sampling_method_Non-Probability Sample,sampling_method_Probability Sample,fda_regulated_drug_No,fda_regulated_drug_Yes,fda_regulated_device_No,fda_regulated_device_Yes
0,0,3000,7.638356,0,0,1,0,0,0,0,1,0,1,0,1,0
1,1,2300,1.00274,1,0,0,0,0,0,0,1,0,1,0,1,0
3,0,30,4.835616,1,0,0,0,0,0,0,1,0,1,0,1,0
6,0,271,2.671233,0,0,0,0,0,0,1,1,0,1,0,1,0
7,0,316,2.712329,1,0,0,0,0,0,0,0,1,1,0,1,0


In [10]:
###### TO BE REMOVED ONCE DATA IS UPDATED

obs_ml_df['actual_duration'] = pd.to_numeric(obs_ml_df['actual_duration'].astype(str).str.replace('days', ''), errors='coerce')

In [11]:
X = obs_ml_df.drop("status", axis=1)
y = obs_ml_df.status

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [13]:
# Create StandardScaler instance
scaler = StandardScaler()

# Fit Standard Scaler with training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


### Random Forest Classifier

In [14]:
# Create random forest classifier
rf_model = RandomForestClassifier(n_estimators=256, random_state=1)

# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [15]:
# Create prediction
y_pred = rf_model.predict(X_test_scaled)

In [16]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,148,22
Actual Low chance,38,16


In [18]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Random Forest Classifier Results\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Classifier Results

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,148,22
Actual Low chance,38,16


Accuracy Score: 0.7321428571428571
Balaned Accuracy Score: 0.583442265795207
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.80      0.87      0.30      0.83      0.51      0.27       170
          1       0.42      0.30      0.87      0.35      0.51      0.24        54

avg / total       0.71      0.73      0.43      0.71      0.51      0.27       224



In [19]:
# Calculate feature importance
importances = rf_model.feature_importances_

# sort features by importance
sorted(zip(importances, X.columns), reverse=True)

[(0.4681344902908242, 'actual_duration'),
 (0.4400525697525509, 'enrollment_count'),
 (0.011902048343715348, 'observational_study_model_Cohort'),
 (0.011274738340037753, 'sampling_method_Non-Probability Sample'),
 (0.010232426946997044, 'observational_study_model_Case-Control'),
 (0.009894184254706196, 'observational_study_model_Other'),
 (0.008942864520079407, 'sampling_method_Probability Sample'),
 (0.008363739362938055, 'observational_study_model_Case-Only'),
 (0.005958669580013565, 'fda_regulated_drug_No'),
 (0.005718676244156647, 'observational_study_model_Case-Crossover'),
 (0.005631067142487108, 'fda_regulated_drug_Yes'),
 (0.005481794395934248, 'fda_regulated_device_Yes'),
 (0.005013603106586491, 'fda_regulated_device_No'),
 (0.0024176150111368476, 'observational_study_model_Ecologic or Community'),
 (0.000981512707836105, 'observational_study_model_Family-Based')]

### Balanced Random Forest Classifier

In [20]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Instantiate model
classifier = BalancedRandomForestClassifier(n_estimators=256, random_state=1)

# Fit model
classifier.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=256, random_state=1)

In [21]:
# Calculated the balanced accuracy score
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5668845315904139

In [22]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,92,78
Actual Low chance,22,32


In [23]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Balanced Random Forest Classifier\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Balanced Random Forest Classifier

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,92,78
Actual Low chance,22,32


Accuracy Score: 0.5535714285714286
Balaned Accuracy Score: 0.5668845315904139
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.81      0.54      0.59      0.65      0.57      0.32       170
          1       0.29      0.59      0.54      0.39      0.57      0.32        54

avg / total       0.68      0.55      0.58      0.59      0.57      0.32       224



In [24]:
# Calculate feature importance
importances = rf_model.feature_importances_


# List the features sorted in descending order by feature importance
features_rank = sorted(zip(importances, X.columns), reverse=True)
for feature in features_rank:
    print(f"{feature[1]}: ({feature[0]})")

actual_duration: (0.4681344902908242)
enrollment_count: (0.4400525697525509)
observational_study_model_Cohort: (0.011902048343715348)
sampling_method_Non-Probability Sample: (0.011274738340037753)
observational_study_model_Case-Control: (0.010232426946997044)
observational_study_model_Other: (0.009894184254706196)
sampling_method_Probability Sample: (0.008942864520079407)
observational_study_model_Case-Only: (0.008363739362938055)
fda_regulated_drug_No: (0.005958669580013565)
observational_study_model_Case-Crossover: (0.005718676244156647)
fda_regulated_drug_Yes: (0.005631067142487108)
fda_regulated_device_Yes: (0.005481794395934248)
fda_regulated_device_No: (0.005013603106586491)
observational_study_model_Ecologic or Community: (0.0024176150111368476)
observational_study_model_Family-Based: (0.000981512707836105)


### Logistic Regression

In [25]:
# Declare model
lr_model = LogisticRegression(solver='lbfgs', random_state=42)

# fit the model
lr_model = lr_model.fit(X_train_scaled, y_train)

In [26]:
# create prediction
y_pred = lr_model.predict(X_test_scaled)

In [27]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,170,0
Actual Low chance,53,1


In [28]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Logistic Regression Model Results\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Logistic Regression Model Results

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,170,0
Actual Low chance,53,1


Accuracy Score: 0.7633928571428571
Balaned Accuracy Score: 0.5092592592592593
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.76      1.00      0.02      0.87      0.14      0.02       170
          1       1.00      0.02      1.00      0.04      0.14      0.02        54

avg / total       0.82      0.76      0.26      0.67      0.14      0.02       224



### Optimize model, Use resampling techniques to determine the best performing algorithm

#### Oversampling

- Random Over Sampler

In [29]:
# Resample with RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [30]:
# Train through logsitc regression
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# fit model
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [31]:
# calculate predicions
y_pred = classifier.predict(X_test)

In [32]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,170,0
Actual Low chance,47,7


In [33]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Random Over Sampler Results\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Over Sampler Results

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,170,0
Actual Low chance,47,7


Accuracy Score: 0.7901785714285714
Balaned Accuracy Score: 0.5648148148148148
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.78      1.00      0.13      0.88      0.36      0.14       170
          1       1.00      0.13      1.00      0.23      0.36      0.12        54

avg / total       0.84      0.79      0.34      0.72      0.36      0.14       224



- SMOTE

In [34]:
# initate SMOTE model
smote = SMOTE(random_state=1)

# Resample targets
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [35]:
# train through resampled data
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [36]:
# calculate predicions
y_pred = classifier.predict(X_test)

In [37]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,170,0
Actual Low chance,50,4


In [38]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("SMOTE Results\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

SMOTE Results

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,170,0
Actual Low chance,50,4


Accuracy Score: 0.7767857142857143
Balaned Accuracy Score: 0.537037037037037
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.77      1.00      0.07      0.87      0.27      0.08       170
          1       1.00      0.07      1.00      0.14      0.27      0.07        54

avg / total       0.83      0.78      0.30      0.69      0.27      0.08       224



#### Undersampling

In [39]:
# Resample the data using the ClusterCentroids resampler
Cluster_resample = ClusterCentroids(random_state=1)
X_resampled, y_resampled = Cluster_resample.fit_resample(X_train, y_train)

In [40]:
# Train & fit the model through logsitc regression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [41]:
# Calculate predictions and the balanced accuracy score
y_pred = classifier.predict(X_test)

In [42]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,18,152
Actual Low chance,3,51


In [43]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Cluster Centroids Results\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Cluster Centroids Results

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,18,152
Actual Low chance,3,51


Accuracy Score: 0.3080357142857143
Balaned Accuracy Score: 0.5251633986928105
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.86      0.11      0.94      0.19      0.32      0.09       170
          1       0.25      0.94      0.11      0.40      0.32      0.11        54

avg / total       0.71      0.31      0.74      0.24      0.32      0.10       224



#### Combination (Over and Under) Sampling

In [44]:
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)

In [45]:
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [46]:
# calculate prediction & balanced accuracy score
y_pred = classifier.predict(X_test)

In [47]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,170,0
Actual Low chance,47,7


In [49]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("SMOTEENN Results\n")
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

SMOTEENN Results

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,170,0
Actual Low chance,47,7


Accuracy Score: 0.7901785714285714
Balaned Accuracy Score: 0.5648148148148148
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.78      1.00      0.13      0.88      0.36      0.14       170
          1       1.00      0.13      1.00      0.23      0.36      0.12        54

avg / total       0.84      0.79      0.34      0.72      0.36      0.14       224

