In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

In [3]:
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import ClusterCentroids


### Import CSV and Perform Data Cleaning

In [4]:
# imprt CSV as DataFrame
obs_df = pd.read_csv('../Tables/updated_obs_studies.csv')

In [5]:
# Drop 'target_duration' column due to too many null values
# Drop null rows

obs_df = obs_df.drop(['target_duration'], axis=1)
obs_df = obs_df.dropna()
obs_df.head()

Unnamed: 0,ID,title,actual_duration,status,study_type,observational_study_model,sampling_method,fda_regulated_drug,fda_regulated_device,enrollment_count
0,BTX-BCI-016-PRT,Breast Cancer Index (BCI) Registry,2788 days,Recruiting,Observational,Case-Only,Non-Probability Sample,No,No,3000
1,2018-TJ-BCD,Diagnosis Value of SEMA4C in Breast Cancer,366 days,Not yet recruiting,Observational,Case-Control,Non-Probability Sample,No,No,2300
2,Breast cancer,Role of Sorcin and Annexin A3 in Breast Cancer...,253 days,Unknown status,Observational,Case-Control,Probability Sample,No,No,80
3,BC-BOMET,Evaluation of Prognostic Factors: From Breast ...,1765 days,Recruiting,Observational,Case-Control,Non-Probability Sample,No,No,30
4,241391,A Study to Identify Breast Cancer (IDBC),1437 days,Unknown status,Observational,Cohort,Non-Probability Sample,No,No,600


In [6]:
# Check distribution of each status
obs_df.groupby('status').size()

status
Active, not recruiting      73
Completed                  186
Enrolling by invitation     22
Not yet recruiting         127
Recruiting                 444
Suspended                    7
Terminated                  16
Unknown status             127
Withdrawn                   16
dtype: int64

In [7]:
# Drop rows with 'unknown status'
obs_df = obs_df.loc[obs_df['status'] != 'Unknown status']

# categorize the status
replace_value = {
    'Completed':'high_chance',
    'Active, not recruiting': 'low_chance',
    'Enrolling by invitation': 'high_chance',
    'Not yet recruiting': 'low_chance',
    'Recruiting': 'high_chance',
    'Suspended': 'low_chance',
    'Terminated': 'low_chance',
    'Withdrawn': 'low_chance'
} 

obs_df = obs_df.replace({"status": replace_value})
obs_df.head()


Unnamed: 0,ID,title,actual_duration,status,study_type,observational_study_model,sampling_method,fda_regulated_drug,fda_regulated_device,enrollment_count
0,BTX-BCI-016-PRT,Breast Cancer Index (BCI) Registry,2788 days,high_chance,Observational,Case-Only,Non-Probability Sample,No,No,3000
1,2018-TJ-BCD,Diagnosis Value of SEMA4C in Breast Cancer,366 days,low_chance,Observational,Case-Control,Non-Probability Sample,No,No,2300
3,BC-BOMET,Evaluation of Prognostic Factors: From Breast ...,1765 days,high_chance,Observational,Case-Control,Non-Probability Sample,No,No,30
6,FH-Risk 2.0 Research Protocol,FH-Risk 2.0: Updating Breast Cancer Risk Estim...,975 days,high_chance,Observational,Other,Non-Probability Sample,No,No,271
7,ID-RPSBC-01-20201012,Genetic and Non-Genetic Breast Cancer Risk Pre...,990 days,high_chance,Observational,Case-Control,Probability Sample,No,No,316


In [8]:
# Create DF for machine learning - removes unnecessary columns
obs_ml_df = obs_df[['status', 'enrollment_count', 'actual_duration', 'observational_study_model', 'sampling_method', 'fda_regulated_drug', 'fda_regulated_device', ]]

In [9]:
# Encode all columns into numbers to run through machine learning
obs_ml_df['status'] = LabelEncoder().fit_transform(obs_ml_df['status'])
obs_ml_df = pd.get_dummies(obs_ml_df,columns=['observational_study_model', 'sampling_method', 'fda_regulated_drug', 'fda_regulated_device'])
obs_ml_df.head()

Unnamed: 0,status,enrollment_count,actual_duration,observational_study_model_Case-Control,observational_study_model_Case-Crossover,observational_study_model_Case-Only,observational_study_model_Cohort,observational_study_model_Ecologic or Community,observational_study_model_Family-Based,observational_study_model_Other,sampling_method_Non-Probability Sample,sampling_method_Probability Sample,fda_regulated_drug_No,fda_regulated_drug_Yes,fda_regulated_device_No,fda_regulated_device_Yes
0,0,3000,2788 days,0,0,1,0,0,0,0,1,0,1,0,1,0
1,1,2300,366 days,1,0,0,0,0,0,0,1,0,1,0,1,0
3,0,30,1765 days,1,0,0,0,0,0,0,1,0,1,0,1,0
6,0,271,975 days,0,0,0,0,0,0,1,1,0,1,0,1,0
7,0,316,990 days,1,0,0,0,0,0,0,0,1,1,0,1,0


In [10]:
###### TO BE REMOVED ONCE DATA IS UPDATED

obs_ml_df['actual_duration'] = pd.to_numeric(obs_ml_df['actual_duration'].astype(str).str.replace('days', ''), errors='coerce')

In [11]:
X = obs_ml_df.drop("status", axis=1)
y = obs_ml_df.status

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [13]:
# Create StandardScaler instance
scaler = StandardScaler()

# Fit Standard Scaler with training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


### Random Forest Classifier

In [14]:
# Create random forest classifier
rf_model = RandomForestClassifier(n_estimators=256, random_state=1)

# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [15]:
# Create prediction
y_pred = rf_model.predict(X_test_scaled)

In [16]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,152,24
Actual Low chance,40,7


In [17]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,152,24
Actual Low chance,40,7


Accuracy Score: 0.7130044843049327
Balaned Accuracy Score: 0.5062862669245648
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.79      0.86      0.15      0.83      0.36      0.14       176
          1       0.23      0.15      0.86      0.18      0.36      0.12        47

avg / total       0.67      0.71      0.30      0.69      0.36      0.13       223



In [18]:
# Calculate feature importance
importances = rf_model.feature_importances_

# sort features by importance
sorted(zip(importances, X.columns), reverse=True)

[(0.4680974298004842, 'actual_duration'),
 (0.44667151135489436, 'enrollment_count'),
 (0.012773816587621679, 'observational_study_model_Cohort'),
 (0.009906160855485025, 'observational_study_model_Case-Control'),
 (0.0094393235152842, 'sampling_method_Non-Probability Sample'),
 (0.00907781263334816, 'sampling_method_Probability Sample'),
 (0.008838474262911277, 'observational_study_model_Case-Only'),
 (0.007932712781807016, 'observational_study_model_Other'),
 (0.006121346468076523, 'fda_regulated_device_Yes'),
 (0.005374103700612015, 'fda_regulated_device_No'),
 (0.0050356943625501065, 'fda_regulated_drug_No'),
 (0.00461309267040879, 'fda_regulated_drug_Yes'),
 (0.004365275464086825, 'observational_study_model_Case-Crossover'),
 (0.0017532455424299543, 'observational_study_model_Ecologic or Community'),
 (0.0, 'observational_study_model_Family-Based')]

### Balanced Random Forest Classifier

In [19]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Instantiate model
classifier = BalancedRandomForestClassifier(n_estimators=256, random_state=1)

# Fit model
classifier.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=256, random_state=1)

In [20]:
# Calculated the balanced accuracy score
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.49329061895551257

In [21]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,95,81
Actual Low chance,26,21


In [22]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,95,81
Actual Low chance,26,21


Accuracy Score: 0.5201793721973094
Balaned Accuracy Score: 0.49329061895551257
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.79      0.54      0.45      0.64      0.49      0.24       176
          1       0.21      0.45      0.54      0.28      0.49      0.24        47

avg / total       0.66      0.52      0.47      0.56      0.49      0.24       223



In [23]:
# Calculate feature importance
importances = rf_model.feature_importances_


# List the features sorted in descending order by feature importance
features_rank = sorted(zip(importances, X.columns), reverse=True)
for feature in features_rank:
    print(f"{feature[1]}: ({feature[0]})")

actual_duration: (0.4680974298004842)
enrollment_count: (0.44667151135489436)
observational_study_model_Cohort: (0.012773816587621679)
observational_study_model_Case-Control: (0.009906160855485025)
sampling_method_Non-Probability Sample: (0.0094393235152842)
sampling_method_Probability Sample: (0.00907781263334816)
observational_study_model_Case-Only: (0.008838474262911277)
observational_study_model_Other: (0.007932712781807016)
fda_regulated_device_Yes: (0.006121346468076523)
fda_regulated_device_No: (0.005374103700612015)
fda_regulated_drug_No: (0.0050356943625501065)
fda_regulated_drug_Yes: (0.00461309267040879)
observational_study_model_Case-Crossover: (0.004365275464086825)
observational_study_model_Ecologic or Community: (0.0017532455424299543)
observational_study_model_Family-Based: (0.0)


### Logistic Regression

In [24]:
# Declare model
lr_model = LogisticRegression(solver='lbfgs', random_state=42)

# fit the model
lr_model = lr_model.fit(X_train_scaled, y_train)

In [25]:
# create prediction
y_pred = lr_model.predict(X_test_scaled)

In [26]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,176,0
Actual Low chance,47,0


In [27]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,176,0
Actual Low chance,47,0


Accuracy Score: 0.7892376681614349
Balaned Accuracy Score: 0.5
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.79      1.00      0.00      0.88      0.00      0.00       176
          1       0.00      0.00      1.00      0.00      0.00      0.00        47

avg / total       0.62      0.79      0.21      0.70      0.00      0.00       223



### Optimize model, Use resampling techniques to determine the best performing algorithm

#### Oversampling

- Random Over Sampler

In [28]:
# Resample with RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [29]:
# Train through logsitc regression
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# fit model
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [30]:
# calculate predicions
y_pred = classifier.predict(X_test)

In [31]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,176,0
Actual Low chance,47,0


In [32]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,176,0
Actual Low chance,47,0


Accuracy Score: 0.7892376681614349
Balaned Accuracy Score: 0.5
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.79      1.00      0.00      0.88      0.00      0.00       176
          1       0.00      0.00      1.00      0.00      0.00      0.00        47

avg / total       0.62      0.79      0.21      0.70      0.00      0.00       223



- SMOTE

In [33]:
# initate SMOTE model
smote = SMOTE(random_state=1)

# Resample targets
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [34]:
# train through resampled data
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [35]:
# calculate predicions
y_pred = classifier.predict(X_test)

In [36]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,176,0
Actual Low chance,47,0


In [37]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,176,0
Actual Low chance,47,0


Accuracy Score: 0.7892376681614349
Balaned Accuracy Score: 0.5
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.79      1.00      0.00      0.88      0.00      0.00       176
          1       0.00      0.00      1.00      0.00      0.00      0.00        47

avg / total       0.62      0.79      0.21      0.70      0.00      0.00       223



#### Undersampling

In [38]:
# Resample the data using the ClusterCentroids resampler
Cluster_resample = ClusterCentroids(random_state=1)
X_resampled, y_resampled = Cluster_resample.fit_resample(X_train, y_train)

In [39]:
# Train & fit the model through logsitc regression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [40]:
# Calculate predictions and the balanced accuracy score
y_pred = classifier.predict(X_test)

In [41]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,19,157
Actual Low chance,6,41


In [42]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,19,157
Actual Low chance,6,41


Accuracy Score: 0.26905829596412556
Balaned Accuracy Score: 0.4901474854932302
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.76      0.11      0.87      0.19      0.31      0.09       176
          1       0.21      0.87      0.11      0.33      0.31      0.10        47

avg / total       0.64      0.27      0.71      0.22      0.31      0.09       223



#### Combination (Over and Under) Sampling

In [43]:
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)

In [44]:
classifier.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [45]:
# calculate prediction & balanced accuracy score
y_pred = classifier.predict(X_test)

In [46]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual High Chance', 'Actual Low chance'], columns = ['Predicted High Chance', 'Predicted Low Chance'])
cm_df

Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,160,16
Actual Low chance,42,5


In [47]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)
bal_acc_score = balanced_accuracy_score(y_test, y_pred)

# Print results
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(f'Balaned Accuracy Score: {bal_acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted High Chance,Predicted Low Chance
Actual High Chance,160,16
Actual Low chance,42,5


Accuracy Score: 0.7399103139013453
Balaned Accuracy Score: 0.5077369439071566
---------------------------
Classfication Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.79      0.91      0.11      0.85      0.31      0.10       176
          1       0.24      0.11      0.91      0.15      0.31      0.09        47

avg / total       0.68      0.74      0.28      0.70      0.31      0.10       223

