In [None]:
import warnings
# Install a blanket 'ignore' filter: every warning raised after this point is hidden.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

### Import CSV and Perform Data Cleaning

In [None]:
# Column names expected in the input table (everything except the target).
# NOTE(review): the original entry "interventional_type, model" (comma + space)
# looked like a typo for a single column name; confirm the exact spelling
# against df.columns once the CSV is loaded.
columns = ["ID", "title", "study_type", "arm_group_type", "intervention_type",
           "interventional_type_model", "phase", "fda_regulated_drug", "fda_regulated_device"]

# Name of the column the model predicts
target = ["status"]

In [None]:
# Load the interventional_study_only table.
# The directory name must be a string: the original `Path(Tables/'...')`
# referenced an undefined name `Tables` and raised NameError.
file_path = Path('Tables') / 'interventional_study_only.csv'
df = pd.read_csv(file_path)

# Preview only the first rows instead of rendering the whole frame
df.head()

In [None]:
# Examine DataFrame

# TODO: Extract and keep first values for arm_group_type and intervention_type

# TODO: Drop null rows (e.g. with .dropna()) -- not applied yet

# Remove rows whose status is "Unknown status".
# TODO: decide whether the still-open statuses ("recruiting", "not yet recruiting",
# "active, not recruiting", "enrolling by invitation") should be removed as well;
# they are currently kept and left unmapped.
unknown_stat = df['status'] != 'Unknown status'
df = df.loc[unknown_stat]

# Convert the target column values to low_chance / high_chance.
# A single mapping is built and applied to the 'status' column only, so an
# identical string in any other column is left untouched.
status_map = {'Completed': 'high_chance'}
status_map.update(dict.fromkeys(['Withdrawn', 'Terminated', 'Suspended'], 'low_chance'))
df = df.assign(status=df['status'].replace(status_map))

# Re-number the rows after filtering (the old index is discarded)
df = df.reset_index(drop=True)

df.head()

### Split the Data into Training and Testing

In [None]:
# Create our features.
# Only the categorical columns listed here are used and one-hot encoded.
# Selecting them explicitly keeps the target ('status') and any un-encoded
# pass-through columns out of X.
# NOTE(review): 'interventional_type_model' replaces the misspelled
# 'intervenal_type_model'; confirm the exact column name against df.columns.
# NOTE(review): presumably ID / title / study_type are identifier or free-text
# columns that should not be model inputs -- verify before relying on this.
feature_columns = ['arm_group_type', 'intervention_type', 'interventional_type_model',
                   'phase', 'fda_regulated_drug', 'fda_regulated_device']
X = pd.get_dummies(df[feature_columns], columns=feature_columns)

# Create our target
y = df['status']
X.head()

In [None]:
# Summarize the feature matrix with DataFrame.describe() as a quick sanity check
X.describe()

In [None]:
# Count the rows per target value to see how balanced (or imbalanced) the classes are
y.value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Stratified split with a fixed seed, so both partitions keep the class
# proportions of y and the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

# Show the class counts of the training split, then of the test split
for split_target in (y_train, y_test):
    print(Counter(split_target))

### Balanced Random Forest Classifier

In [None]:
# Train a BalancedRandomForestClassifier on the training split
from imblearn.ensemble import BalancedRandomForestClassifier

# Build the estimator: 100 trees, fixed seed for repeatable results
classifier = BalancedRandomForestClassifier(random_state=1, n_estimators=100)

# Fit on the training data; the value returned by fit() is the cell's displayed output
classifier.fit(X_train, y_train)

In [None]:
# Predict on the held-out test set, then compute the balanced accuracy score
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Compute the confusion matrix of test labels against predictions and display it
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Create a labeled DataFrame from the confusion matrix.
# The class order is derived from the data and passed explicitly to
# confusion_matrix, so every row/column label is guaranteed to match its
# counts. The original hard-coded "low_chance" first, while the sorted class
# order puts "high_chance" first, which mislabeled the table.
class_labels = sorted(set(y_test) | set(y_pred))
cm_df = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

In [None]:
# Build the imbalanced classification report for the test predictions, then print it
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

In [None]:
# Rank the features from most to least important.
# Each entry is an (importance, column name) pair, sorted in descending order.
features_rank = sorted(
    zip(classifier.feature_importances_, X.columns),
    reverse=True,
)
for importance, feature_name in features_rank:
    print(f"{feature_name}: ({importance})")

### Optimize the Model: Use Resampling Techniques to Determine the Best-Performing Algorithm

#### Oversampling

#### Undersampling

#### Combination (Over and Under) Sampling