In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

### 1.   Prepare input data
-----

In [None]:
# Function: determine PHA-L read cut-offs for binary classification 
def categorize_lectin(data_all, quantile_high, quantile_low, ref_col_loc):
    """Derive high/low cut-offs from one column and flag the rows beyond them.

    Parameters
    ----------
    data_all : pandas.DataFrame
        Table that contains the reference column.
    quantile_high, quantile_low : float
        Quantiles whose values become the "high" and the "low" cut-off.
    ref_col_loc : int
        Positional index of the reference column (negative counts from the end).

    Returns
    -------
    cutoff : list
        ``[high_cutoff, low_cutoff]``.
    indices : list of numpy.ndarray
        Boolean row masks ``[high, low, high_or_low]``.
    counts : list
        ``[number_of_high_rows, number_of_low_rows]``.

    The two cut-offs are also printed.
    """
    ref_values = data_all.iloc[:, ref_col_loc]

    # `method=` replaces the deprecated `interpolation=` keyword (NumPy >= 1.22).
    # "nearest" picks an observed value instead of interpolating between two.
    cutoff = np.quantile(ref_values, [quantile_high, quantile_low], method="nearest").tolist()
    print(f"Cut-off for PHA-L high: {cutoff[0]}; Cut-off for PHA-L low: {cutoff[1]}")

    # The high class includes its cut-off value; the low class is strictly below its cut-off.
    high_indices = (ref_values >= cutoff[0]).to_numpy()
    low_indices = (ref_values < cutoff[1]).to_numpy()
    high_low_indices = np.logical_or(high_indices, low_indices)

    high_count = high_indices.sum()
    low_count = low_indices.sum()

    return cutoff, [high_indices, low_indices, high_low_indices], [high_count, low_count]

In [None]:
# Load input file into a DataFrame (path is relative to the notebook's working directory).
# NOTE(review): later cells skip the first column and treat the last column as the
# PHA-L reference, so the CSV is presumably laid out as id, features..., PHA-L — confirm.
input_df = pd.read_csv('TIL_transformed_data.csv')

In [None]:
# Process data: binary classification
# Rows at or above the 75th-percentile cut-off become class 1, rows below the
# 25th-percentile cut-off become class 0; everything in between is discarded.
quantile_high, quantile_low = 0.75, 0.25
# -1 selects the last column as the reference column for the cut-offs.
cutoff, indices, count = categorize_lectin(input_df, quantile_high, quantile_low, -1)

# indices[0] / indices[1] are the high / low row masks; overwrite the reading with the class label.
# NOTE(review): assumes the last column is the one named "PHA-L" — otherwise a new column is created; confirm.
input_df.loc[indices[0], "PHA-L"] = 1
input_df.loc[indices[1], "PHA-L"] = 0

# indices[2] is the union of both masks: keep only the labelled rows.
# NOTE(review): this cell mutates and rebinds input_df, so re-running it without
# reloading the CSV recomputes the cut-offs on already-labelled, filtered data.
input_df = input_df.loc[indices[2], :]

In [None]:
# y: class labels read from the 'PHA-L' column
y = input_df.loc[:, 'PHA-L'].to_numpy()
# X: transcript data array — every column except the first and the last one
X = input_df.iloc[:, 1:-1].to_numpy()

In [None]:
# Split training, validation and test set.
# Step 1: hold out 10% of all samples as the test set, stratified on the class label.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=342,
)

# Step 2: take 20% of the remaining samples as the validation set, again stratified.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    stratify=y_train_val,
    random_state=2,
)

### 2.   Model training
-----

In [None]:
# Candidate hyperparameter values for the randomized search

# Number of boosting rounds (weak learners) in the AdaBoost ensemble: 100, 200, ..., 500
n_estimators = list(range(100, 600, 100))
# Weight applied to each weak learner's contribution
learning_rate = [0.1, 1]
# Search space keyed by AdaBoostClassifier parameter name
random_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

In [None]:
# Use RandomizedSearchCV to optimize hyperparameters (5-fold cross-validation on the training split).
# The booster is seeded as well: the data splits and the search are already seeded, and an
# unseeded estimator would make the tuned model differ between otherwise identical runs.
model = AdaBoostClassifier(random_state=42)

model_random = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    n_iter=10,        # number of parameter settings sampled from random_grid
    cv=5,
    verbose=5,
    random_state=42,  # seeds the sampling of parameter settings
    n_jobs=-1,        # use all available cores
)

model_random.fit(X_train, y_train)

In [None]:
# Return the best estimator
# Rebinds `model` from the untuned AdaBoostClassifier to the estimator selected by
# the search; the evaluation cells below use this tuned model.
model = model_random.best_estimator_
# Last expression of the cell: displays the selected hyperparameters.
model.get_params()

In [None]:
def model_evaluation(model, X, y):
    """Print accuracy, log loss, ROC AUC and F1 of a fitted binary classifier.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``score``, ``predict`` and ``predict_proba``.
    X : array
        Feature rows; indexed with boolean masks derived from ``y``.
    y : array
        Class labels, 1 for 'PHA-L high' and 0 for 'PHA-L low'.

    Returns
    -------
    None
        All metrics are printed.
    """
    # Accuracy restricted to one true class, i.e. how many samples of that class are recovered.
    print(f"Accuracy for 'PHA-L high' class: {100*(model.score(X[y==1], y[y==1])):>4f}%")
    print(f"Accuracy for 'PHA-L low' class: {100*(model.score(X[y==0], y[y==0])):>4f}%")
    print(f"Overall accuracy: {100*(model.score(X, y)):>4f}%")

    model_predict = model.predict(X)
    model_predict_prob = model.predict_proba(X)

    # ROC AUC is a ranking metric: it needs a continuous score, not thresholded labels.
    # Column 1 of predict_proba is the probability of the greater label (class 1).
    positive_class_prob = model_predict_prob[:, 1]

    print(f"Average loss: {log_loss(y, model_predict_prob):>4f}")
    print(f"ROC Curve AUC: {roc_auc_score(y, positive_class_prob):>4f}")
    print(f"F1 score: {f1_score(y, model_predict):>4f}")

In [None]:
# Metrics on the training split (the data the search was fitted on).
model_evaluation(model, X_train, y_train)

In [None]:
# Metrics on the validation split, which was not passed to the search.
model_evaluation(model, X_val, y_val)