<a href="https://colab.research.google.com/github/Fjoru/TM10007_PROJECT/blob/Carlijn/assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TM10007 Assignment Prediction of tumor grade in brain cancer
By Jessica Barends, Gonnie van Erp, Erik Kemper en Carlijn Oerlemans

In [0]:
# Run install for use in colab environment
!pip install --upgrade pip
!pip install -q --upgrade git+https://github.com/Fjoru/TM10007_PROJECT
!pip install ipdb -q
!pip install seaborn
!pip install tensorflow

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/54/0c/d01aa759fdc501a58f431eb594a17495f15b88da142ce14b5845662c13f3/pip-20.0.2-py2.py3-none-any.whl (1.4MB)
[K     |▎                               | 10kB 22.8MB/s eta 0:00:01[K     |▌                               | 20kB 30.0MB/s eta 0:00:01[K     |▊                               | 30kB 34.1MB/s eta 0:00:01[K     |█                               | 40kB 35.7MB/s eta 0:00:01[K     |█▏                              | 51kB 38.5MB/s eta 0:00:01[K     |█▍                              | 61kB 40.8MB/s eta 0:00:01[K     |█▋                              | 71kB 42.1MB/s eta 0:00:01[K     |█▉                              | 81kB 42.3MB/s eta 0:00:01[K     |██                              | 92kB 43.3MB/s eta 0:00:01[K     |██▎                             | 102kB 44.4MB/s eta 0:00:01[K     |██▌                             | 112kB 44.4MB/s eta 0:00:01[K     |██▊                             | 122kB 44.4MB/

## Import section


In [0]:
import ipdb
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# import tensorflow as tf

# Preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Classifiers
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn import svm
from sklearn import model_selection
from sklearn import metrics

  import pandas.util.testing as tm


## preprocessing

In [0]:
def Replace(i):
    if isinstance(i, int):
          return i
    try:
        float(i)
        return float(i)
    except:
        return np.nan

def preprocessing_steps(X_design, Y_design, X_test, Y_test):
    # training set
    # remove strings from data
    X_design = X_design.applymap(func=Replace)

    # set 0.0 as NaN
    X_design.replace(0, np.nan, inplace=True)

    # set Inf as NaN
    X_design.replace([np.inf, -np.inf], np.nan, inplace=True)

    # remove features with less than 60% values
    X_design = X_design.dropna(thresh=round(X_design.shape[0]*0.6), axis='columns')

    # remove sample with less than 60% values
    # join features and labels
    XY_design = X_design.join(Y_design)
    # remove samples with not enough values
    XY_design = XY_design.dropna(thresh=round(XY_design.shape[1]*0.6))

    # remove samples without a label
    XY_design['label'].replace(np.nan, '', inplace=True)
    XY_design = XY_design[XY_design['label'].astype(bool)]

    # split features (X_data) and labels (Y_data)
    X_design = XY_design.drop(columns=['label'])
    Y_design = XY_design[['label']]
    
    # add missing value's 
    imputer = IterativeImputer(sample_posterior=True, n_nearest_features=20, random_state=0)
    X_design_imputed = imputer.fit_transform(X_design)

    # normalization of values
    scaler = RobustScaler()
    X_design_scaled = scaler.fit_transform(X_design_imputed)

    # getting back to Dataframe 
    X_design = pd.DataFrame(X_design_scaled, columns=X_design.columns, index=X_design.index)

    ####### test set
    # remove strings from data
    X_test = X_test.applymap(func=Replace)

    # set 0.0 as NaN
    X_test.replace(0, np.nan, inplace=True)

    # set Inf as NaN
    X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # remove the same features as the design set
    features_design = X_design.columns
    features_design = X_design.columns
    
    df_test = None
    for feature in features_design:
      df_test_single = pd.DataFrame(X_test[feature])
      if df_test is None:
        df_test = df_test_single
      else:
        df_test = df_test.join(df_test_single, how='outer')
    
    X_test = df_test
    
    # remove sample with less than 60% values
    # join features and labels
    XY_test = X_test.join(Y_test)
    # remove samples with not enough values
    XY_test = XY_test.dropna(thresh=round(XY_test.shape[1]*0.6))

    # remove samples without a label
    XY_test['label'].replace(np.nan, '', inplace=True)
    XY_test = XY_test[XY_test['label'].astype(bool)]

    # split features (X_test) and labels (Y_test)
    X_test = XY_test.drop(columns=['label'])
    Y_test = XY_test[['label']]
    
    # add missing value's 
    X_test_imputed = imputer.transform(X_test)

    # normalization of values
    X_test_scaled = scaler.transform(X_test_imputed)

    # getting back to Dataframe 
    X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

    return X_design, Y_design, X_test, Y_test

## Feature selection and extraction

In [0]:
def feature_steps(X_design, Y_design):

    impo_clf = ExtraTreesClassifier(n_estimators=50)
    impo_clf = impo_clf.fit(X_data, Y_data)
    impo_clf.feature_importances_

    importances = impo_clf.feature_importances_
    std = np.std([impo_clf.feature_importances_ for tree in impo_clf.estimators_], 
                 axis=0)
    indices = np.argsort(importances)[::-1]


    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X_data.shape[1]), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(X_data.shape[1]), indices)
    plt.xlim([-1, X_data.shape[1]])
    plt.show()

    model = SelectFromModel(impo_clf, prefit=True)
    X_data = model.transform(X_data)

    return X_data, Y_data

## Classification

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

h = .02
names = ["Decision Tree", "Random Forest", "Nearest Neighbors", "Linear SVM"]

classifiers = [
    DecisionTreeClassifier(1.0 * RBF(1.0)),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025), 
    SVC(gamma=2, C=1)]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]

figure = plt.figure(figsize=(27, 9))
i = 1

# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_design, X_test, y_design, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_design[:, 0], X_design[:, 1], c=y_design, cmap=cm_bright,
               edgecolors='k')
    # Plot the testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

        # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_design, y_design)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot the training points
        ax.scatter(X_design[:, 0], X_design[:, 1], c=y_design, cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

plt.tight_layout()
plt.show()

## Run pipeline

In [0]:
# Data loading functions.
from brats.load_data import load_data

data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')
data = pd.DataFrame(data)

# split labels and values
data_X = data.drop(columns=['label'])
data_Y = data[['label']]

# data split index forming
Test_split = model_selection.StratifiedKFold(n_splits=10)
results = list()
best_kernel = list()

i = 0
for design_index, test_index in Test_split.split(data_X, data_Y):
    if i:
      continue
    X_design = data_X.iloc[design_index]
    Y_design = data_Y.iloc[design_index]
    
    X_test = data_X.iloc[test_index]
    Y_test = data_Y.iloc[test_index]

    # run preprocessing step
    X_design, Y_design, X_test, Y_test = preprocessing_steps(X_design, Y_design, X_test, Y_test)
    print(X_design)
    
    #run feature selection and extraction
    X_design, Y_design = feature_steps(X_design, Y_design)

    i = 1
    
    ## Example for Classifier hyperparameters selecting
    clf = svm.SVC(probability=True)
    parameters = parameters = {
        "kernel": ['linear', 'poly', 'rbf', 'sigmoid']}
    random_search = model_selection.RandomizedSearchCV(clf, parameters, scoring='roc_auc') ## hierin zit al de crossvalidatie, dus opnieuw een k-fold split hoeft niet #keuze om score voor alle classifiers gelijk te houden of per classifier te definieren
    random_search.fit(X_design, Y_design)

    # Get resulting classifier
    clf_best = random_search.best_estimator_
    print(f'Best classifier: kernel={clf_best.kernel}')
    best_kernel.append(clf_best.kernel)   #per fold best classifier will be appended

    # Test the classifier on the test data
    prob = clf_best.predict_proba(X_test)
    scores = prob[:, 1]

    # Gettin accuracy, AUC and f1-score
    accuracy = metrics.accuracy_score(Y_test, scores)
    auc = metrics.roc_auc_score(Y_test, scores)
    f1 = metrics.f1_score(Y_test, scores)
    results.append({
        'accuracy': accuracy,
        'AUC': auc,
        'f1-score': f1,
        'kernel': clf_best.kernel,
        'set': test
    })

    # Test the classifier on the training data
    prob_testing = clf.predict_proba(X_design)
    scores_training = probab_testing[:, 1]
    
    # Getting the accuracy, AUC and f1-score
    accuracy = metrics.accuracy_score(Y_design, scores_training)
    auc = metrics.roc_auc_score(Y_design, scores_training)
    f1 = metrics.f1_score(Y_design, scores_training)
    results.append({
        'accuracy': accuracy,
        'AUC': auc,
        'f1-score': f1,
        'kernel': clf_best.kernel,
        'set': training
    })

    # Create results dataframe and plot it
    results = pd.DataFrame(results)
    seaborn.boxplot(y='AUC', x='set', data=results)
    seaborn.boxplot(y='accuracy', x='set', data=results)
    seaborn.boxplot(y='f1-score', x='set', data=results)

    optimal_kernel = int(np.median(best_kernel))
    print(f"The optimal kernel={optimal_kernel}")

# save data to csv for manual check
#X_design.to_csv('data_X.csv')
#Y_design.to_csv('data_Y.csv')



The number of samples: 167
The number of columns: 725
              VOLUME_ET  VOLUME_NET  ...  TGM_Cog_Z_1   TGM_T_1
ID                                   ...                       
TCGA-02-0064   0.306650   -0.149335  ...    -0.409811  0.412749
TCGA-02-0068  -0.131777   -0.098523  ...    -0.254765 -0.182429
TCGA-02-0069  -0.006484    0.912666  ...     0.284368  2.705034
TCGA-02-0070  -0.274249   -0.314443  ...     0.839935 -0.449039
TCGA-02-0075  -0.109278    0.247846  ...     0.451606 -0.017942
...                 ...         ...  ...          ...       ...
TCGA-HT-8018  -0.506682   -0.179239  ...    -0.849962 -0.833459
TCGA-HT-8111  -0.512206   -0.381408  ...     1.026131 -0.804997
TCGA-HT-8114  -0.282298    3.740634  ...     0.526592 -0.757509
TCGA-HT-8563  -0.181188   -0.367314  ...    -0.466627 -0.801878
TCGA-HT-A61A   0.757504    1.082333  ...    -0.592295  0.330073

[150 rows x 698 columns]


UnboundLocalError: ignored