In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd

In [4]:
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score
from imblearn.metrics import classification_report_imbalanced

### Import CSV and Perform Data Cleaning

In [5]:
# imprt CSV as DataFrame
obs_df = pd.read_csv('Tables/observational_studies_fda.csv')

In [6]:
# Drop 'target_duration' column due to too many null values
# Drop null rows

obs_df = obs_df.drop(['target_duration'], axis=1)
obs_df = obs_df.dropna()
obs_df.head()

Unnamed: 0,ID,title,status,study_type,observational_study_model,sampling_method,fda_regulated_drug,fda_regulated_device
0,BTX-BCI-016-PRT,Breast Cancer Index (BCI) Registry,Recruiting,Observational,Case-Only,Non-Probability Sample,No,No
1,2018-TJ-BCD,Diagnosis Value of SEMA4C in Breast Cancer,Not yet recruiting,Observational,Case-Control,Non-Probability Sample,No,No
2,Breast cancer,Role of Sorcin and Annexin A3 in Breast Cancer...,Unknown status,Observational,Case-Control,Probability Sample,No,No
3,BC-BOMET,Evaluation of Prognostic Factors: From Breast ...,Recruiting,Observational,Case-Control,Non-Probability Sample,No,No
4,241391,A Study to Identify Breast Cancer (IDBC),Unknown status,Observational,Cohort,Non-Probability Sample,No,No


In [7]:
# Check distribution of each status
obs_df.groupby('status').size()

status
Active, not recruiting      73
Completed                  186
Enrolling by invitation     22
Not yet recruiting         127
Recruiting                 444
Suspended                    7
Terminated                  16
Unknown status             127
Withdrawn                   16
dtype: int64

In [8]:
# Drop rows with 'unknown status'
obs_df = obs_df.loc[obs_df['status'] != 'Unknown status']

# categorize the status
replace_value = {
    'Completed':'Completed',
    'Active, not recruiting': 'Not Completed',
    'Enrolling by invitation': 'Not Completed',
    'Not yet recruiting': 'Not Completed',
    'Recruiting': 'Not Completed',
    'Suspended': 'Not Completed',
    'Terminated': 'Not Completed',
    'Withdrawn': 'Not Completed'
} 

obs_df = obs_df.replace({"status": replace_value})
obs_df.head()


Unnamed: 0,ID,title,status,study_type,observational_study_model,sampling_method,fda_regulated_drug,fda_regulated_device
0,BTX-BCI-016-PRT,Breast Cancer Index (BCI) Registry,Not Completed,Observational,Case-Only,Non-Probability Sample,No,No
1,2018-TJ-BCD,Diagnosis Value of SEMA4C in Breast Cancer,Not Completed,Observational,Case-Control,Non-Probability Sample,No,No
3,BC-BOMET,Evaluation of Prognostic Factors: From Breast ...,Not Completed,Observational,Case-Control,Non-Probability Sample,No,No
6,FH-Risk 2.0 Research Protocol,FH-Risk 2.0: Updating Breast Cancer Risk Estim...,Not Completed,Observational,Other,Non-Probability Sample,No,No
7,ID-RPSBC-01-20201012,Genetic and Non-Genetic Breast Cancer Risk Pre...,Not Completed,Observational,Case-Control,Probability Sample,No,No


In [9]:
# Create DF for machine learning - removes unnecessary columns
obs_ml_df = obs_df[['status', 'observational_study_model', 'sampling_method', 'fda_regulated_drug', 'fda_regulated_device']]

In [10]:
# Encode all columns into numbers to run through machine learning
obs_ml_df['status_num'] = LabelEncoder().fit_transform(obs_ml_df['status'])
obs_ml_df = pd.get_dummies(obs_ml_df,columns=['observational_study_model', 'sampling_method', 'fda_regulated_drug', 'fda_regulated_device'])
obs_ml_df = obs_ml_df.drop(columns=['status'])
obs_ml_df.head()

Unnamed: 0,status_num,observational_study_model_Case-Control,observational_study_model_Case-Crossover,observational_study_model_Case-Only,observational_study_model_Cohort,observational_study_model_Ecologic or Community,observational_study_model_Family-Based,observational_study_model_Other,sampling_method_Non-Probability Sample,sampling_method_Probability Sample,fda_regulated_drug_No,fda_regulated_drug_Yes,fda_regulated_device_No,fda_regulated_device_Yes
0,1,0,0,1,0,0,0,0,1,0,1,0,1,0
1,1,1,0,0,0,0,0,0,1,0,1,0,1,0
3,1,1,0,0,0,0,0,0,1,0,1,0,1,0
6,1,0,0,0,0,0,0,1,1,0,1,0,1,0
7,1,1,0,0,0,0,0,0,0,1,1,0,1,0


In [11]:
X = obs_ml_df.drop("status_num", axis=1)
y = obs_ml_df.status_num

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [13]:
# Create StandardScaler instance
scaler = StandardScaler()

# Fit Standard Scaler with training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


### Random Forest Classifier

In [14]:
# Create random forest classifier
rf_model = RandomForestClassifier(n_estimators=256, random_state=1)

# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [15]:
# Create prediction
predictions = rf_model.predict(X_test_scaled)

In [16]:
# confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create DatafRame from confusion matrix
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns = ['Predicted 0', 'Predicted 1'])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2,54
Actual 1,3,164


In [17]:
# Calculate accruacy score
acc_score = accuracy_score(y_test, predictions)

# Print results
print("Confusion Matrix")
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print("---------------------------")
print("Classfication Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2,54
Actual 1,3,164


Accuracy Score: 0.7443946188340808
---------------------------
Classfication Report
              precision    recall  f1-score   support

           0       0.40      0.04      0.07        56
           1       0.75      0.98      0.85       167

    accuracy                           0.74       223
   macro avg       0.58      0.51      0.46       223
weighted avg       0.66      0.74      0.65       223



In [18]:
# Calculate feature importance
importances = rf_model.feature_importances_

# sort features by importance
sorted(zip(importances, X.columns), reverse=True)

[(0.13173759112144764, 'observational_study_model_Other'),
 (0.11723673625906497, 'observational_study_model_Case-Control'),
 (0.11270866149093796, 'observational_study_model_Cohort'),
 (0.09822871992765794, 'observational_study_model_Case-Only'),
 (0.09444103729011138, 'sampling_method_Non-Probability Sample'),
 (0.08952203499746288, 'fda_regulated_device_No'),
 (0.08820631341313158, 'sampling_method_Probability Sample'),
 (0.08353403231241407, 'fda_regulated_device_Yes'),
 (0.06613867804241386, 'observational_study_model_Ecologic or Community'),
 (0.05332721402150441, 'observational_study_model_Case-Crossover'),
 (0.029524407821011827, 'fda_regulated_drug_No'),
 (0.028275129604475976, 'fda_regulated_drug_Yes'),
 (0.007119443698365486, 'observational_study_model_Family-Based')]

### Balanced Random Forest Classifier

In [19]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Instantiate model
classifier = BalancedRandomForestClassifier(n_estimators=256, random_state=1)

# Fit model
classifier.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=256, random_state=1)

In [20]:
# Calculated the balanced accuracy score
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.538227117194183

In [21]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual Completed', 'Actual Not Complted'], columns = ['Predicted Completed', 'Predicted Not Completed'])
cm_df

Unnamed: 0,Predicted Not Completed,Predicted Completed
Actual Not Completed,13,43
Actual Complted,26,141


In [22]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.33      0.23      0.84      0.27      0.44      0.18        56
          1       0.77      0.84      0.23      0.80      0.44      0.21       167

avg / total       0.66      0.69      0.39      0.67      0.44      0.20       223



In [23]:
# Calculate feature importance
importances = rf_model.feature_importances_


# List the features sorted in descending order by feature importance
features_rank = sorted(zip(importances, X.columns), reverse=True)
for feature in features_rank:
    print(f"{feature[1]}: ({feature[0]})")

observational_study_model_Other: (0.13173759112144764)
observational_study_model_Case-Control: (0.11723673625906497)
observational_study_model_Cohort: (0.11270866149093796)
observational_study_model_Case-Only: (0.09822871992765794)
sampling_method_Non-Probability Sample: (0.09444103729011138)
fda_regulated_device_No: (0.08952203499746288)
sampling_method_Probability Sample: (0.08820631341313158)
fda_regulated_device_Yes: (0.08353403231241407)
observational_study_model_Ecologic or Community: (0.06613867804241386)
observational_study_model_Case-Crossover: (0.05332721402150441)
fda_regulated_drug_No: (0.029524407821011827)
fda_regulated_drug_Yes: (0.028275129604475976)
observational_study_model_Family-Based: (0.007119443698365486)


### Optimize model, Use resampling techniques to determine the best performing algorithm

#### Oversampling

#### Undersampling

#### Combination (Over and Under) Sampling