<a href="https://colab.research.google.com/github/Chansikan/Python-tutorial/blob/master/Meningioma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## preparation

In [139]:
# Import modules
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LassoCV
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score

# Data directory. Defaults to the original Google Drive location used on Colab;
# set the MENINGIOMA_SRC_DIR environment variable to run the notebook elsewhere
# without editing this cell.
srcDir = os.environ.get(
    'MENINGIOMA_SRC_DIR',
    '/content/drive/My Drive/ResearchProjects/[2020Sep-]Meningioma')

In [134]:
# Load the radiomics table and split it into the label and per-prefix feature sets.
radiomics_csv = os.path.join(srcDir, 'radiomics_bin32.csv')
df = pd.read_csv(radiomics_csv)

# Boolean column masks: columns prefixed 'T1C_', columns prefixed 'T2_', and their union.
T1_idx = df.columns.str.startswith('T1C_')
T2_idx = df.columns.str.startswith('T2_')
Total_idx = T1_idx | T2_idx

y = df['label']
X_T1, X_T2, X = (df.loc[:, mask] for mask in (T1_idx, T2_idx, Total_idx))

# Report the size of each feature set and the share of rows labelled 1.
for caption, frame in (('T1 shape (row, col):', X_T1),
                       ('T2 shape (row, col):', X_T2),
                       ('Total shape (row, col):', X)):
  print(caption, frame.shape)
class1_pct = round(np.mean(y)*100, 1)
print('Label: total no.,', len(y), '; Class 1,', class1_pct, '%')

T1 shape (row, col): (258, 93)
T2 shape (row, col): (258, 93)
Total shape (row, col): (258, 186)
Label: total no., 258 ; Class 1, 36.8 %


In [135]:
# Scale all feature columns with RobustScaler (fitted on the complete table)
# and wrap the result back into a DataFrame carrying the original column names.
scaler = RobustScaler()
scaled_values = scaler.fit_transform(X, y)
X_scaled = pd.DataFrame(scaled_values, columns=X.columns)

In [141]:
# Feature selection through bootstrapping: for every replicate, resample the
# rows with replacement, fit an L1-penalised LinearSVC wrapped in
# SelectFromModel, and record which features were kept (1) or dropped (0).
N_BOOTSTRAP = 1000
n_samples = len(X_scaled)
row_positions = np.arange(n_samples)  # loop-invariant, so built once

selections = []  # one single-column 0/1 frame per bootstrap replicate
for rs in tqdm(range(N_BOOTSTRAP)):
  # Seed with the replicate number so each resample is reproducible.
  np.random.seed(rs)
  idx = np.random.choice(row_positions, n_samples, replace=True)
  X_sampled = X_scaled.iloc[idx, :]
  # idx holds row positions, so use positional lookup; plain y[idx] is a
  # label lookup and only agrees with it while y has a default 0..n-1 index.
  y_sampled = y.iloc[idx]

  lsvc = LinearSVC(penalty="l1", max_iter=50000, dual=False)
  model = SelectFromModel(lsvc)
  model.fit(X_sampled, y_sampled)
  sel = model.get_support().astype('int8')
  selections.append(pd.DataFrame(sel, index=X.columns))

# Concatenate once at the end: growing result_df with pd.concat inside the
# loop re-copies every earlier column on each iteration.
result_df = pd.concat(selections, axis=1)

100%|██████████| 1000/1000 [20:52<00:00,  1.25s/it]


In [142]:
# Summarise the bootstrap results: how often each feature was selected,
# out of how many resamplings, and the resulting selection frequency.
n_rep = result_df.shape[1]
selected_counts = result_df.sum(axis=1)

ff_df = pd.DataFrame({
    'n_selected': selected_counts,
    'n_resampling': n_rep,
    'frequency': selected_counts / n_rep,
})
ff_df = ff_df.sort_values(by='frequency', ascending=False)

# Persist the table next to the source data.
freq_path = os.path.join(srcDir, 'freq.csv')
ff_df.to_csv(freq_path)

In [57]:
# Hold-out accuracy of the L1-penalised LinearSVC.
# X_test / y_test are not created by any visible earlier cell, so this cell
# raised NameError on a fresh kernel; the split is now built here.
# NOTE(review): test_size=0.2 and random_state=0 are reviewer-chosen values —
# replace them with the split the study actually intends to use.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0)

# Fit the scaler on the training rows only so the test rows stay unseen.
holdout_scaler = RobustScaler().fit(X_train)
lsvc = LinearSVC(penalty="l1", max_iter=50000, dual=False)
lsvc.fit(holdout_scaler.transform(X_train), y_train)
lsvc.score(holdout_scaler.transform(X_test), y_test)

In [19]:
# Reference example: L1-based feature selection with SelectFromModel on the
# iris dataset (this cell was pasted in doctest form, prompts included).
# Iris-specific names are used so the cell no longer overwrites the notebook's
# own X, y, lsvc and model.
from sklearn.datasets import load_iris

X_iris, y_iris = load_iris(return_X_y=True)
assert X_iris.shape == (150, 4)

iris_svc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_iris, y_iris)
iris_selector = SelectFromModel(iris_svc, prefit=True)
X_iris_selected = iris_selector.transform(X_iris)
X_iris_selected.shape  # the pasted example output showed (150, 3)

36.8