In [1]:
## Reference: scikit-learn RandomForestClassifier

import pandas as pd, numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Slide-level inputs: the nucleus-morphology feature table and the AS label table.
feature_df = pd.read_csv('./slide_level_features/CRC_nucleus_morphology.csv')
label_df = pd.read_csv('./slide_level_features/CRC_AS_labels.csv')

# Put labels and features side by side (a column-wise concat pairs rows by index),
# discard rows whose 'AS' value is missing, then renumber the rows from 0.
# NOTE(review): this assumes both CSVs list the slides in the same row order -- confirm.
concat_df = (
    pd.concat([label_df, feature_df], axis=1)
    .dropna(axis=0, subset=['AS'])
    .reset_index(drop=True)
)

In [3]:
# Binary target: 1 when the 'AS' value is greater than 10, otherwise 0.
concat_df['AS_label'] = np.where(concat_df["AS"] > 10, 1, 0)
y = concat_df['AS_label']

# Morphologic features: the last 30 columns of the table, not counting 'AS_label'.
# The label column is excluded explicitly so the cell is idempotent: once
# 'AS_label' has been appended to concat_df, a plain `iloc[:, -30:]` on a re-run
# would pull the label into X (target leakage) and push out one real feature.
X = concat_df.drop(columns='AS_label').iloc[:, -30:]

In [4]:
# Hyper-parameter grid for the random-forest search:
# 5 depths x 6 leaf sizes x 6 forest sizes = 180 candidate settings.
params = dict(
    max_depth=[2, 3, 5, 10, 20],
    min_samples_leaf=[5, 10, 20, 50, 100, 200],
    n_estimators=[10, 25, 30, 50, 100, 200],
)

In [7]:
# Evaluate a grid-searched random forest on each of the 5 pre-computed folds and
# accumulate the per-fold feature importances in `feat_imp` (a running sum, one
# entry per column of X).
# The accumulator starts at zero here so the loop body needs no first-iteration
# special case and a re-run of the cell always starts from a clean sum.
feat_imp = np.zeros(X.shape[1])

for n_iter in range(1, 6):
    # Row positions of this fold's train / test samples, stored as .npy files.
    train_idx = np.load(f"./MLP/5_fold_split/AS_train_idx_fold_{n_iter}.npy")
    test_idx = np.load(f"./MLP/5_fold_split/AS_test_idx_fold_{n_iter}.npy")

    print(f'--------------------{n_iter} KFold-------------------')
    print(f'train_idx_len : {len(train_idx)} / test_idx_len : {len(test_idx)}')

    X_train, X_test = X.iloc[train_idx, :], X.iloc[test_idx, :]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Inner 4-fold search over `params`, selecting by ROC AUC on the training part only.
    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
    grid_search = GridSearchCV(estimator=rf,
                               param_grid=params,
                               cv=4,
                               n_jobs=-1, verbose=1, scoring="roc_auc")
    grid_search.fit(X_train, y_train)

    # GridSearchCV refits the best configuration on all of X_train by default
    # (refit=True), so best_estimator_ is already trained; fitting it again with
    # the same fixed random_state would only rebuild the same forest.
    rf_best = grid_search.best_estimator_

    # Held-out evaluation: hard predictions for accuracy, class-1 probability for AUC.
    predicted = rf_best.predict(X_test)
    pred_proba = rf_best.predict_proba(X_test)
    accuracy = accuracy_score(y_test, predicted)
    print(f'Mean accuracy score: {accuracy:.3}')
    print(f'AUC:{roc_auc_score(y_test, pred_proba[:,1])}')

    feat_imp += rf_best.feature_importances_

--------------------1 KFold-------------------
train_idx_len : 252 / test_idx_len : 63
Fitting 4 folds for each of 180 candidates, totalling 720 fits
Mean accuracy score: 0.698
AUC:0.7297297297297297
--------------------2 KFold-------------------
train_idx_len : 252 / test_idx_len : 63
Fitting 4 folds for each of 180 candidates, totalling 720 fits
Mean accuracy score: 0.619
AUC:0.6881496881496881
--------------------3 KFold-------------------
train_idx_len : 252 / test_idx_len : 63
Fitting 4 folds for each of 180 candidates, totalling 720 fits
Mean accuracy score: 0.683
AUC:0.7453222453222453
--------------------4 KFold-------------------
train_idx_len : 252 / test_idx_len : 63
Fitting 4 folds for each of 180 candidates, totalling 720 fits
Mean accuracy score: 0.571
AUC:0.6728395061728395
--------------------5 KFold-------------------
train_idx_len : 252 / test_idx_len : 63
Fitting 4 folds for each of 180 candidates, totalling 720 fits
Mean accuracy score: 0.651
AUC:0.7011316872427983


In [8]:
# `feat_imp` holds the sum of the importances over the 5 folds; divide to get the mean.
mean_importance = feat_imp / 5

# One row per feature column, then show the ten features with the highest mean importance.
imp_df = pd.DataFrame({"Varname": X_train.columns, "Imp": mean_importance})
imp_df.sort_values(by="Imp", ascending=False).head(10)

Unnamed: 0,Varname,Imp
15,area_sd,0.145959
17,minor_axis_length_sd,0.107691
16,major_axis_length_sd,0.076712
18,perimeter_sd,0.069113
21,solidity_sd,0.050306
24,int_s_mean_sd,0.045316
4,circularity_mean,0.041925
1,major_axis_length_mean,0.041678
23,int_gray_sd_sd,0.036138
10,int_s_sd_mean,0.033942
