In [1]:
import os
import warnings

from skrebate import ReliefF
#from ReliefF import ReliefF

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import StratifiedKFold

from sklearn.pipeline import Pipeline

In [None]:
def relieff(X_std_train, X_std_test, y_train, n_features, colNames, features):
    '''
    Pass-through placeholder for ReliefF feature selection.

    The in-function selection is disabled: no ReliefF model is fitted here.
    Both data sets are handed back untouched and the returned feature list is
    always a new, empty list. The earlier implementation (fit ReliefF with 20
    neighbours, rank the columns by importance and keep the ``n_features``
    best ones) is no longer executed.

    :param X_std_train: Training data (returned unchanged)
    :param X_std_test: Validation data (returned unchanged)
    :param y_train: Response to the training data (currently unused)
    :param int n_features: Number of features to be selected (currently unused)
    :param colNames: List with the names of the columns/features (currently unused)
    :param features: List of previously selected features (currently unused)
    :return: Tuple ``(X_std_train, X_std_test, [])``
    '''
    selected_features = []
    return (X_std_train, X_std_test, selected_features)

In [None]:
def plsr(X_std_train, y_train):
    '''
    Fit a ReliefF + PLS regression pipeline with a cross-validated grid search.

    The pipeline first selects features with ``ReliefF`` and then fits a
    ``PLSRegression`` (``scale=False``) on the selected features. The grid
    holds a single value per hyper-parameter, and candidates are scored with
    ``'roc_auc'`` using 4-fold cross-validation.

    Side effects: prints the best cross-validated score and the best
    parameters. Warnings raised while fitting are suppressed; the caller's
    warning filters are restored afterwards, also when fitting fails.

    :param X_std_train: Training data (presumably already standardised by the
                        caller; the pipeline itself does no scaling)
    :param y_train: Response to the training data
    :return: The object returned by ``GridSearchCV.fit`` (the fitted search)
    '''
    pipe = Pipeline(
        [
            ('relieff', ReliefF()),
            ('plsr', PLSRegression(scale=False))
        ]
    )
    param_grid = {
        'relieff__n_features_to_select': [2],
        'relieff__n_neighbors': [20],
        'plsr__n_components': [2],
        'plsr__tol': [1e-7]
    }
    gs = GridSearchCV(estimator=pipe,
                      param_grid=param_grid,
                      scoring='roc_auc',
                      cv=4)

    # Silence warnings only for the duration of the fit. The context manager
    # restores the previous filter state on exit instead of overwriting the
    # global filters with 'default'.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        model = gs.fit(X_std_train, y_train)
        print('Training score PLSR:', gs.best_score_)
        print(gs.best_params_)
    return model

In [None]:
def klassifisering(input_excel, ark, y_navn, n_features):
    '''
    Repeated stratified 4-fold cross-validation of the PLSR model.

    The data set is read from an Excel sheet and split with
    ``StratifiedKFold`` (4 shuffled folds) once for each of ten fixed random
    seeds. In every fold the features are standardised with statistics from
    the training part only, passed through ``relieff`` and used to fit
    ``plsr``; the score of the fitted model on the held-out part is stored in
    the result matrix.

    The result matrices have one row per classifier (thirteen rows), but only
    row 11 (PLSR) is filled in by this function; all other rows stay zero.

    :param str input_excel: The name of the excel-file with the dataset
    :param str ark: The name of the sheet with the dataset
    :param str y_navn: The name of the column with the response
    :param int n_features: Number of features, forwarded to ``relieff``
    :return: Tuple ``(results, features)``. ``results`` is a list with one
             (13, 4) array per repetition (rows: classifiers, columns: folds);
             ``features`` is the feature list returned by the last call to
             ``relieff``.
    '''
    n_classifiers = 13   # rows of each result matrix
    plsr_row = 11        # row of the result matrix that holds the PLSR scores
    n_folds = 4
    # One random seed per repetition of the 4-fold split.
    states = [108, 355, 44, 129, 111, 362, 988, 266, 82, 581]

    # Reads the excel-file (and closes it again once the sheet is loaded)
    with pd.ExcelFile(input_excel) as xls:
        data_raw_df = pd.read_excel(xls, sheet_name=ark, index_col=0)

    # Creates the result-matrix: one zero-initialised array per repetition
    results = [np.zeros((n_classifiers, n_folds)) for _ in states]

    y = data_raw_df[y_navn].values
    X = data_raw_df.drop(columns=y_navn)
    colNames = list(X.columns)
    features = []
    stdsc = StandardScaler()

    # NOTE (from the original author): this is 10 repetitions of 4-fold CV,
    # not 40 repetitions as stated in the thesis.
    for k, state in enumerate(states):
        cv = StratifiedKFold(n_splits=n_folds, random_state=state, shuffle=True)
        # `i` is the fold index; it selects the column of the result matrix,
        # so every fold gets its own entry instead of overwriting column 0.
        for i, (train, test) in enumerate(cv.split(X, y)):
            print(k, i)
            X_train, X_test = X.iloc[train], X.iloc[test]
            y_train, y_test = y[train], y[test]
            # The scaler is re-fitted on the training part of every fold.
            X_std_train = stdsc.fit_transform(X_train)
            X_std_test = stdsc.transform(X_test)
            # NOTE (from the original author): feature selection should be
            # performed together with PLSR inside CV for unbiased experiments.
            X_std_train, X_std_test, features = relieff(
                X_std_train, X_std_test, y_train, n_features, colNames, features
            )
            model = plsr(X_std_train, y_train)
            test_score = model.score(X_std_test, y_test)
            print('Test score PLSR:', test_score)
            results[k][plsr_row, i] = test_score

    return (results, features)

In [None]:
# Inputs for `klassifisering`; insert the right names for the data set at hand.
# Excel workbook that holds the data set.
input_excel = 'X_endelig_squareroot.xlsx'
# Name of the sheet to read from the workbook.
ark = 'tilbakefall'
# Name of the column that holds the response.
y_navn = 'Toklasser'

# NOTE(review): the run below is commented out, so `results` and `features`
# are not defined on a fresh kernel; any cell that reads them fails until
# these two lines are executed.
#n_features = 2
#results, features = klassifisering(input_excel, ark, y_navn, n_features)

In [None]:
# Stack the per-repetition result matrices and keep only the non-zero
# entries, flattened to one dimension (entries never written remain 0).
outcome = np.array(results)
outcome = outcome[outcome != 0]
outcome

In [12]:
# Mean and median of the collected scores.
outcome.mean(), np.median(outcome)

NameError: name 'outcome' is not defined

In [3]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler

In [5]:
# Load the benchmark data: the 'Toklasser' column is the response and every
# other column is used as a feature.
xls = pd.ExcelFile('./benchdata.xlsx')
data_raw_df = pd.read_excel(xls, sheet_name='tilbakefall', index_col=0)
y = data_raw_df['Toklasser'].values
# `columns=` replaces the positional axis argument (`drop('Toklasser', 1)`),
# which recent pandas versions no longer accept.
X = data_raw_df.drop(columns='Toklasser')

# Scale -> ReliefF feature selection -> scale again -> PLS regression.
# Every step lives inside the pipeline, so it is re-fitted on the training
# part of each cross-validation split.
pipe = Pipeline(
    [
        ('scaler1', StandardScaler()),
        ('relieff', ReliefF()),
        ('scaler2', StandardScaler()),
        ('plsr', PLSRegression(scale=False))
    ]
)
# Single-point grid: one value per hyper-parameter.
param_grid = {
    'relieff__n_features_to_select': [2],
    'relieff__n_neighbors': [20],
    'plsr__n_components': [1],
    'plsr__tol': [1e-7]
}

In [13]:
# Columns whose names match 'CT', 'PET' and 'shape', side by side in that order.
pd.concat([X.filter(regex=pattern) for pattern in ('CT', 'PET', 'shape')], axis=1)

Unnamed: 0_level_0,CT_squareroot_firstorder_10Percentile,CT_squareroot_firstorder_90Percentile,CT_squareroot_firstorder_Energy,CT_squareroot_firstorder_Entropy,CT_squareroot_firstorder_InterquartileRange,CT_squareroot_firstorder_Kurtosis,CT_squareroot_firstorder_Maximum,CT_squareroot_firstorder_MeanAbsoluteDeviation,CT_squareroot_firstorder_Mean,CT_squareroot_firstorder_Median,...,original_shape_MajorAxis,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MinorAxis,original_shape_Sphericity,original_shape_SurfaceArea,original_shape_SurfaceVolumeRatio,original_shape_Volume
pasientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,38.535698,39.686270,26342955,0.952511,0.571055,72.729177,45,0.386454,39.192759,39.115214,...,37.380273,41.976184,44.598206,42.720019,45.617979,27.619612,0.661532,4860.055715,0.283517,17142
4,36.932371,40.348482,14515732,2.532300,1.694380,11.846025,44,1.200863,38.718653,39.242834,...,31.504408,38.587563,35.468296,29.410882,38.704005,25.105855,0.701721,3126.087371,0.323578,9661
5,41.952354,45.110974,30954330,2.217014,1.253694,23.055654,55,0.991986,43.660835,43.874822,...,42.073251,46.065171,43.011626,32.015621,46.454279,25.282894,0.762365,4063.633046,0.250625,16214
8,39.191836,40.398020,110377536,1.441888,0.598522,82.930611,48,0.667313,39.858131,40.398020,...,73.057649,74.148500,80.956779,65.764732,83.438600,57.318945,0.520001,15698.615155,0.226355,69354
10,37.669616,38.794329,15574987,0.936440,0.000000,56.085876,43,0.469348,38.048623,38.236109,...,36.035312,33.286634,38.013156,33.015148,43.150898,24.875896,0.643822,3658.448414,0.340352,10749
11,37.349699,39.686270,4291650,1.963147,0.579517,27.531411,45,0.850379,38.688114,39.115214,...,24.613845,25.238859,23.259407,23.345235,27.331301,20.714498,0.700650,1391.652644,0.486082,2863
13,38.105118,39.242834,61028000,1.402066,0.564675,56.324335,44,0.544174,38.549253,38.678159,...,57.888802,63.639610,53.907328,52.201533,66.136223,41.443948,0.514826,11173.332259,0.272361,41024
14,32.310989,35.496479,1851840,2.103693,1.029103,13.993486,36,1.339021,34.408588,34.985711,...,24.719926,24.186773,22.472205,13.152946,24.515301,11.283847,0.682894,951.719950,0.610860,1558
15,36.083237,38.340579,4648560,2.116539,1.679973,32.011939,42,0.978846,37.285225,37.788887,...,25.589900,23.600847,21.954498,28.653098,31.843367,21.693241,0.687683,1570.363368,0.470591,3337
16,34.322005,36.986484,6380694,2.041455,1.057303,22.640766,38,1.056661,35.613147,35.944402,...,34.684750,31.384710,37.013511,31.064449,39.217343,28.839789,0.506232,2799.979891,0.557987,5018


In [9]:
# 4-fold cross-validated grid search over the pipeline, scored with ROC AUC.
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=4
)
grid.fit(X.astype(float), y)



GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler1', StandardScaler(copy=True, with_mean=True, with_std=True)), ('relieff', ReliefF(discrete_threshold=10, n_features_to_select=10, n_jobs=1,
    n_neighbors=100, verbose=False)), ('scaler2', StandardScaler(copy=True, with_mean=True, with_std=True)), ('plsr', PLSRegression(copy=True, max_iter=500, n_components=2, scale=False, tol=1e-06))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'relieff__n_features_to_select': [2], 'relieff__n_neighbors': [20], 'plsr__n_components': [1], 'plsr__tol': [1e-07]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [10]:
# Outer 4-fold cross-validation around the grid search: the whole search is
# re-run on the training part of every outer split and scored with ROC AUC
# on the held-out part.
scores = cross_val_score(
    estimator=grid,
    X=X.astype(float),
    y=y,
    scoring='roc_auc',
    cv=4,
)
scores



array([0.7       , 0.69852941, 0.58823529, 0.72635135])

In [13]:
# Mean and median ROC AUC over the outer folds.
scores.mean(), np.median(scores)

(0.6782790143084261, 0.6992647058823529)