# Multi-omics Enabled Sample Mislabeling Correction Challenge

This notebook is using various classifiers in an attempt to detect sample misclassifications

Details about this challenge: https://precision.fda.gov/challenges

## Solution

Import libraries

In [1]:
import os
import sys
import getopt
import re
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

Load data

In [37]:
labels = pd.read_csv("challenge_data/train_cli_corrected.tsv", sep="\t", index_col="sample")
prot = pd.read_csv("challenge_data/train_pro_corrected.tsv", sep="\t")
rna = pd.read_csv("challenge_data/train_rna_corrected.tsv", sep="\t")
# Transpose proteins matrix
prot = prot.T
rna = rna.T
# Misclassification labels are not needed anymore, the data has been manually corrected according
# to the misclassification labels
#misClassified = pd.read_csv("challenge_data/sum_tab_1.csv", sep=",")
#misClassified_2 = pd.read_csv("challenge_data/sum_tab_2.csv", sep=",")


# What if missing values are not really missing values but missing genes (Y chrom for instance)
prot = prot.fillna(0)
rna = rna.fillna(0)

Select only rows which were correctly classified (matches) for machine learning

## Explore Parameters

Now, it's time to figure out the best parameters for each model

In [68]:
gender = labels.loc[:,"gender"]
msi = labels.loc[:,"msi"]

In [69]:
def bestParams(x, y, clf, grid):
    grid_search = GridSearchCV(clf, param_grid=grid, cv=10, iid=False)
    grid_search.fit(x, y)
    print("Tuned params:", grid_search.best_params_)
    print("Tuned best acc:", grid_search.best_score_)

### Random Forest

In [71]:
grid = {
    "n_estimators": range(25, 75, 5),
    "max_depth": range(40, 80, 5)
}

print("Best params for gender")
bestParams(prot, gender, RandomForestClassifier(), grid)
print("Best params for msi")
bestParams(prot, msi, RandomForestClassifier(), grid)

print("Best params for gender")
bestParams(rna, gender, RandomForestClassifier(), grid)
print("Best params for msi")
bestParams(rna, msi, RandomForestClassifier(), grid)

Best params for gender
('Tuned params:', {'n_estimators': 60, 'max_depth': 70})
('Tuned best acc:', 0.7101190476190476)
Best params for msi
('Tuned params:', {'n_estimators': 40, 'max_depth': 45})
('Tuned best acc:', 0.8523809523809524)
Best params for gender
('Tuned params:', {'n_estimators': 70, 'max_depth': 60})
('Tuned best acc:', 0.8376984126984126)
Best params for msi
('Tuned params:', {'n_estimators': 30, 'max_depth': 75})
('Tuned best acc:', 0.9625)


### SVM

In [72]:
grid = {
    "C": [pow(10,i) for i in range(-2,4)],
    "kernel": ["linear", "rbf"],
    "gamma": ["auto", "scale"]
}
print("Best params for gender")
bestParams(prot, gender, SVC(), grid)
print("Best params for msi")
bestParams(prot, msi, SVC(), grid)

print("Best params for gender")
bestParams(rna, gender, SVC(), grid)
print("Best params for msi")
bestParams(rna, msi, SVC(), grid)

Best params for gender
('Tuned params:', {'kernel': 'rbf', 'C': 10, 'gamma': 'auto'})
('Tuned best acc:', 0.7132936507936508)
Best params for msi
('Tuned params:', {'kernel': 'rbf', 'C': 10, 'gamma': 'auto'})
('Tuned best acc:', 0.8492063492063492)
Best params for gender
('Tuned params:', {'kernel': 'linear', 'C': 0.01, 'gamma': 'auto'})
('Tuned best acc:', 0.8787698412698413)
Best params for msi
('Tuned params:', {'kernel': 'linear', 'C': 0.01, 'gamma': 'auto'})
('Tuned best acc:', 0.975)


### KNN

In [73]:
grid = {
    "n_neighbors": range(1,20)
}
print("Best params for gender")
bestParams(prot, gender, KNeighborsClassifier(), grid)
print("Best params for msi")
bestParams(prot, msi, KNeighborsClassifier(), grid)

print("Best params for gender")
bestParams(rna, gender, KNeighborsClassifier(), grid)
print("Best params for msi")
bestParams(rna, msi, KNeighborsClassifier(), grid)

Best params for gender
('Tuned params:', {'n_neighbors': 2})
('Tuned best acc:', 0.7351190476190477)
Best params for msi
('Tuned params:', {'n_neighbors': 11})
('Tuned best acc:', 0.8541666666666666)
Best params for gender
('Tuned params:', {'n_neighbors': 14})
('Tuned best acc:', 0.6944444444444444)
Best params for msi
('Tuned params:', {'n_neighbors': 3})
('Tuned best acc:', 0.95)


### ADA Boost

In [74]:
grid = {
    "n_estimators": range(40, 60, 5)
}
print("Best params for gender")
bestParams(prot, gender, AdaBoostClassifier(), grid)
print("Best params for msi")
bestParams(prot, msi, AdaBoostClassifier(), grid)

print("Best params for gender")
bestParams(rna, gender, AdaBoostClassifier(), grid)
print("Best params for msi")
bestParams(rna, msi, AdaBoostClassifier(), grid)

Best params for gender
('Tuned params:', {'n_estimators': 40})
('Tuned best acc:', 0.8376984126984126)
Best params for msi
('Tuned params:', {'n_estimators': 40})
('Tuned best acc:', 0.8492063492063492)
Best params for gender
('Tuned params:', {'n_estimators': 50})
('Tuned best acc:', 0.8777777777777779)
Best params for msi
('Tuned params:', {'n_estimators': 40})
('Tuned best acc:', 0.8875)


## Final Classification

Train all classifiers with best parameters (discovered by grid search) and run them on the test set.
Each time, use a different subset (80%) to give each model slightly different data to reduce overfitting

In [113]:
def finalPredict(x_train, y_train, x_test, y_test, clf):
    x_train_subset, a, y_train_subset, b = train_test_split(x_train, y_train, test_size=0.2, shuffle=True)
    clf.fit(x_train_subset, y_train_subset)
    y_predict = clf.predict(x_test)
    print("Test accuracy:", accuracy_score(y_test, y_predict))
    return y_predict

Predict with all for proteins

In [162]:
# Load test data in the same way train data was loaded
labels_test = pd.read_csv("challenge_data/test_cli.tsv", sep="\t", index_col="sample")
proteins_test = pd.read_csv("challenge_data/test_pro.tsv", sep="\t")
proteins_test = proteins_test.T
proteins_test = proteins_test.fillna(0)

gender_test = labels_test.loc[:,"gender"]
msi_test = labels_test.loc[:,"msi"]

predictions_prot =  [None] * 8

predictions_prot[0] = finalPredict(prot, gender, proteins_test, gender_test, RandomForestClassifier(n_estimators = 60, max_depth = 70))
predictions_prot[4] = finalPredict(prot, msi, proteins_test, msi_test, RandomForestClassifier(n_estimators = 40, max_depth = 45))

predictions_prot[1] = finalPredict(prot, gender, proteins_test, gender_test, SVC(C=10, kernel="rbf", gamma="auto"))
predictions_prot[5] = finalPredict(prot, msi, proteins_test, msi_test, SVC(C=10, kernel="linear", gamma="auto"))

predictions_prot[2] = finalPredict(prot, gender, proteins_test, gender_test, KNeighborsClassifier(n_neighbors=2))
predictions_prot[6] = finalPredict(prot, msi, proteins_test, msi_test, KNeighborsClassifier(n_neighbors=11))

predictions_prot[3] = finalPredict(prot, gender, proteins_test, gender_test, AdaBoostClassifier(n_estimators = 40))
predictions_prot[7] = finalPredict(prot, msi, proteins_test, msi_test, AdaBoostClassifier(n_estimators = 40))

('Test accuracy:', 0.3875)
('Test accuracy:', 0.8375)
('Test accuracy:', 0.35)
('Test accuracy:', 0.85)
('Test accuracy:', 0.3875)
('Test accuracy:', 0.7875)
('Test accuracy:', 0.725)
('Test accuracy:', 0.8375)


Predict with all for rna

In [155]:
# Load test data in the same way train data was loaded
labels_test = pd.read_csv("challenge_data/test_cli.tsv", sep="\t", index_col="sample")
rna_test = pd.read_csv("challenge_data/test_rna.tsv", sep="\t")
rna_test = rna_test.T
rna_test = rna_test.fillna(0)

gender_test = labels_test.loc[:,"gender"]
msi_test = labels_test.loc[:,"msi"]

predictions_rna =  [None] * 8

predictions_rna[0] = finalPredict(rna, gender, rna_test, gender_test, RandomForestClassifier(n_estimators = 70, max_depth = 60))
predictions_rna[4] = finalPredict(rna, msi, rna_test, msi_test, RandomForestClassifier(n_estimators = 30, max_depth = 75))

predictions_rna[1] = finalPredict(rna, gender, rna_test, gender_test, SVC(C=0.01, kernel="linear", gamma="auto"))
predictions_rna[5] = finalPredict(rna, msi, rna_test, msi_test, SVC(C=0.01, kernel="linear", gamma="auto"))

predictions_rna[2] = finalPredict(rna, gender, rna_test, gender_test, KNeighborsClassifier(n_neighbors=14))
predictions_rna[6] = finalPredict(rna, msi, rna_test, msi_test, KNeighborsClassifier(n_neighbors=3))

predictions_rna[3] = finalPredict(rna, gender, rna_test, gender_test, AdaBoostClassifier(n_estimators = 50))
predictions_rna[7] = finalPredict(rna, msi, rna_test, msi_test, AdaBoostClassifier(n_estimators = 40))

('Test accuracy:', 0.4625)
('Test accuracy:', 0.875)
('Test accuracy:', 0.6375)
('Test accuracy:', 0.95)
('Test accuracy:', 0.5)
('Test accuracy:', 0.925)
('Test accuracy:', 0.725)
('Test accuracy:', 0.9125)


Save separate predictions for rna and proteins, with a probability of mislabeling

In [156]:
def writeSingleOutput(predictions, extenstiveName):


    extensiveOut = open(extenstiveName,"w+")
    names = list(labels_test.index)

    extensiveOut.write("sample,Test_gender,Test_msi, RandomForest_gender,SVM_gender,KNN_gender,ADA_gender,RandomForest_msi,SVM_msi,KNN_msi,ADA_msi,gender_mislabeled_prob,msi_mislabeld_prob,mislabeled_prob\n")

    for i in range(len(predictions[0])):
        gender_mislabeled = 0
        msi_mislabeled = 0

        for j in range(0,4):
            if (predictions[j][i] != labels_test.iloc[i,0]):
                gender_mislabeled += 1

        for j in range(4,8):
            if (predictions[j][i] != labels_test.iloc[i,1]):
                msi_mislabeled += 1

        msi_mislabeled /= 4.0
        gender_mislabeled /= 4.0

        mislabeled = gender_mislabeled
        if (msi_mislabeled > mislabeled):
            mislabeled = msi_mislabeled

        extensiveOut.write(names[i] + "," + labels_test.iloc[i,0] + "," + labels_test.iloc[i,1])
        for j in range(len(predictions)):
            extensiveOut.write("," + predictions[j][i])
        extensiveOut.write(", " + str(gender_mislabeled) + "," + str(msi_mislabeled) + "," + str(mislabeled))
        extensiveOut.write("\n")

    extensiveOut.close()


In [163]:
writeSingleOutput(predictions_prot, "extensive_prot.csv")

In [161]:
writeSingleOutput(predictions_rna, "extensive_rna.csv")