# Multi-omics Enabled Sample Mislabeling Correction Challenge

This notebook trains several classifiers on proteomics and RNA data and uses their predictions to detect mislabeled samples.

Details about this challenge: https://precision.fda.gov/challenges

## Solution

Import libraries

In [38]:
import os
import sys
import getopt
import re
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

Load data

In [56]:
# Clinical labels (gender, msi, ...) indexed by sample name.
labels = pd.read_csv(
    "challenge_data/train_cli_corrected.tsv",
    sep="\t",
    index_col="sample",
)

# Each omics table is transposed after reading so that samples sit on the row
# axis, then missing values are replaced by 0. The working assumption is that a
# missing value may not be a failed measurement but a gene that is genuinely
# absent (Y-chromosome genes, for instance).
prot = pd.read_csv("challenge_data/train_pro_corrected.tsv", sep="\t").T.fillna(0)
rna = pd.read_csv("challenge_data/train_rna_corrected.tsv", sep="\t").T.fillna(0)

# The misclassification tables (challenge_data/sum_tab_1.csv and
# challenge_data/sum_tab_2.csv) are no longer loaded: the "*_corrected" files
# above were already fixed by hand according to those tables.

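Drop the missing training samples (`Training_2` from the protein data, `Training_61` from the RNA data) together with their label rows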

In [57]:
# One sample is removed from each omics table, and the same sample is removed
# from a copy of the label table so both keep the same number of rows.
# NOTE(review): features and labels are later matched by row position, not by
# sample name - confirm both tables list samples in the same order.
prot = prot.drop(labels="Training_2", axis=0)
labels_prot = labels.drop(labels="Training_2", axis=0)

rna = rna.drop(labels="Training_61", axis=0)
labels_rna = labels.drop(labels="Training_61", axis=0)

## Explore Parameters

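Use a cross-validated grid search to find the best hyperparameters for each model and each target (gender, MSI), first on the protein data and then on the RNA data.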

In [64]:
# Target vectors for both tasks, taken from the label table that matches each
# data type (the two tables differ by the one sample dropped from each).
gender_prot, msi_prot = labels_prot["gender"], labels_prot["msi"]
gender_rna, msi_rna = labels_rna["gender"], labels_rna["msi"]

In [65]:
def bestParams(x, y, clf, grid):
    """Run a 10-fold cross-validated grid search and print the best result.

    Parameters
    ----------
    x : array-like
        Feature matrix passed to ``GridSearchCV.fit``.
    y : array-like
        Target labels passed to ``GridSearchCV.fit``; matched to ``x`` by
        position.
    clf : estimator
        Unfitted scikit-learn estimator to tune.
    grid : dict
        Parameter grid forwarded as ``param_grid``.

    Returns
    -------
    None
        The best parameters and the best mean cross-validation score are
        printed. Nothing is returned, so calling this as the last statement of
        a cell does not render an extra output.
    """
    # The former `iid=False` argument is intentionally not passed: `iid` was
    # deprecated in scikit-learn 0.22 and removed in 0.24, where supplying it
    # raises TypeError. From 0.22 on, scores are always averaged across folds,
    # which is what `iid=False` asked for.
    grid_search = GridSearchCV(clf, param_grid=grid, cv=10)
    grid_search.fit(x, y)
    print("Tuned params:", grid_search.best_params_)
    print("Tuned best acc:", grid_search.best_score_)

### Random Forest

In [66]:
# Candidate forest sizes and tree depths.
grid = dict(
    n_estimators=range(25, 75, 5),
    max_depth=range(40, 80, 5),
)

# Protein features first, then RNA features; gender before MSI for each.
for features, gender_target, msi_target in (
    (prot, gender_prot, msi_prot),
    (rna, gender_rna, msi_rna),
):
    print("Best params for gender")
    bestParams(features, gender_target, RandomForestClassifier(), grid)
    print("Best params for msi")
    bestParams(features, msi_target, RandomForestClassifier(), grid)

Best params for gender
('Tuned params:', {'n_estimators': 45, 'max_depth': 50})
('Tuned best acc:', 0.7376984126984126)
Best params for msi
('Tuned params:', {'n_estimators': 40, 'max_depth': 75})
('Tuned best acc:', 0.8763888888888889)
Best params for gender
('Tuned params:', {'n_estimators': 25, 'max_depth': 55})
('Tuned best acc:', 0.869047619047619)
Best params for msi
('Tuned params:', {'n_estimators': 50, 'max_depth': 40})
('Tuned best acc:', 0.9625)


### SVM

In [67]:
# Regularisation strengths 10^-2 .. 10^3, two kernels, two gamma settings.
grid = dict(
    C=[10 ** exponent for exponent in range(-2, 4)],
    kernel=["linear", "rbf"],
    gamma=["auto", "scale"],
)

# Protein features first, then RNA features; gender before MSI for each.
for features, gender_target, msi_target in (
    (prot, gender_prot, msi_prot),
    (rna, gender_rna, msi_rna),
):
    print("Best params for gender")
    bestParams(features, gender_target, SVC(), grid)
    print("Best params for msi")
    bestParams(features, msi_target, SVC(), grid)

Best params for gender
('Tuned params:', {'kernel': 'rbf', 'C': 10, 'gamma': 'scale'})
('Tuned best acc:', 0.685515873015873)
Best params for msi
('Tuned params:', {'kernel': 'linear', 'C': 0.01, 'gamma': 'auto'})
('Tuned best acc:', 0.8763888888888889)
Best params for gender
('Tuned params:', {'kernel': 'linear', 'C': 0.01, 'gamma': 'auto'})
('Tuned best acc:', 0.8876984126984129)
Best params for msi
('Tuned params:', {'kernel': 'linear', 'C': 0.01, 'gamma': 'auto'})
('Tuned best acc:', 0.9875)


### KNN

In [68]:
# Neighbourhood sizes 1 through 19.
grid = dict(n_neighbors=range(1, 20))

# Protein features first, then RNA features; gender before MSI for each.
for features, gender_target, msi_target in (
    (prot, gender_prot, msi_prot),
    (rna, gender_rna, msi_rna),
):
    print("Best params for gender")
    bestParams(features, gender_target, KNeighborsClassifier(), grid)
    print("Best params for msi")
    bestParams(features, msi_target, KNeighborsClassifier(), grid)

Best params for gender
('Tuned params:', {'n_neighbors': 11})
('Tuned best acc:', 0.7140873015873016)
Best params for msi
('Tuned params:', {'n_neighbors': 10})
('Tuned best acc:', 0.8496031746031747)
Best params for gender
('Tuned params:', {'n_neighbors': 4})
('Tuned best acc:', 0.7253968253968253)
Best params for msi
('Tuned params:', {'n_neighbors': 3})
('Tuned best acc:', 0.9625)


### ADA Boost

In [69]:
# Ensemble sizes 40, 45, 50 and 55.
grid = dict(n_estimators=range(40, 60, 5))

# Protein features first, then RNA features; gender before MSI for each.
for features, gender_target, msi_target in (
    (prot, gender_prot, msi_prot),
    (rna, gender_rna, msi_rna),
):
    print("Best params for gender")
    bestParams(features, gender_target, AdaBoostClassifier(), grid)
    print("Best params for msi")
    bestParams(features, msi_target, AdaBoostClassifier(), grid)

Best params for gender
('Tuned params:', {'n_estimators': 45})
('Tuned best acc:', 0.9388888888888889)
Best params for msi
('Tuned params:', {'n_estimators': 40})
('Tuned best acc:', 0.8759920634920635)
Best params for gender
('Tuned params:', {'n_estimators': 50})
('Tuned best acc:', 0.9224206349206348)
Best params for msi
('Tuned params:', {'n_estimators': 50})
('Tuned best acc:', 0.9)


## Final Classification

Train every classifier with the best parameters found by the grid search and run it on the test set.
Each classifier is fitted on a different random 80% subset of the training data, so that every model sees slightly different data; the aim is to reduce overfitting.

In [70]:
def finalPredict(x_train, y_train, x_test, y_test, clf, random_state=None):
    """Fit ``clf`` on a random 80% of the training data and predict the test set.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and labels; matched to each other by position.
    x_test : array-like
        Test features to predict.
    y_test : array-like
        Reference test labels, used only to print the accuracy.
    clf : estimator
        Unfitted scikit-learn classifier; it is fitted in place.
    random_state : int, RandomState instance or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps the
        original behaviour (a different subset on every call); pass an integer
        to make the subset, and therefore the predictions of deterministic
        classifiers, reproducible.

    Returns
    -------
    array-like
        Predicted labels for ``x_test``.
    """
    # Only the 80% training part of the split is used; the held-out 20% is
    # discarded on purpose so that each model is trained on different data.
    x_train_subset, _, y_train_subset, _ = train_test_split(
        x_train,
        y_train,
        test_size=0.2,
        shuffle=True,
        random_state=random_state,
    )
    clf.fit(x_train_subset, y_train_subset)
    y_predict = clf.predict(x_test)
    # NOTE(review): y_test is compared by position - confirm that the rows of
    # x_test and y_test list the samples in the same order.
    print("Test accuracy:", accuracy_score(y_test, y_predict))
    return y_predict

Predict gender and MSI on the test set with all classifiers, using the protein data

In [99]:
gender = gender_prot
msi = msi_prot

# Load test data in the same way train data was loaded:
# transpose so samples are rows, then fill missing values with 0.
labels_test = pd.read_csv("challenge_data/test_cli.tsv", sep="\t", index_col="sample")
proteins_test = pd.read_csv("challenge_data/test_pro.tsv", sep="\t").T.fillna(0)

gender_test = labels_test["gender"]
msi_test = labels_test["msi"]

# (output slot, training target, test target, tuned classifier).
# Slots 0-3 hold gender predictions and slots 4-7 hold MSI predictions, in the
# order RandomForest, SVM, KNN, AdaBoost. Rows are listed in execution order.
prot_jobs = [
    (0, gender, gender_test, RandomForestClassifier(n_estimators=45, max_depth=50)),
    (4, msi, msi_test, RandomForestClassifier(n_estimators=40, max_depth=75)),
    (1, gender, gender_test, SVC(C=10, kernel="rbf", gamma="scale")),
    (5, msi, msi_test, SVC(C=0.01, kernel="linear", gamma="auto")),
    (2, gender, gender_test, KNeighborsClassifier(n_neighbors=11)),
    (6, msi, msi_test, KNeighborsClassifier(n_neighbors=10)),
    (3, gender, gender_test, AdaBoostClassifier(n_estimators=45)),
    (7, msi, msi_test, AdaBoostClassifier(n_estimators=40)),
]

predictions_prot = [None] * 8
for slot, train_target, test_target, model in prot_jobs:
    predictions_prot[slot] = finalPredict(prot, train_target, proteins_test, test_target, model)

('Test accuracy:', 0.4375)
('Test accuracy:', 0.825)
('Test accuracy:', 0.3375)
('Test accuracy:', 0.8625)
('Test accuracy:', 0.3625)
('Test accuracy:', 0.75)
('Test accuracy:', 0.8)
('Test accuracy:', 0.8125)


Predict gender and MSI on the test set with all classifiers, using the RNA data

In [105]:
gender = gender_rna
msi = msi_rna

# Load test data in the same way train data was loaded:
# transpose so samples are rows, then fill missing values with 0.
labels_test = pd.read_csv("challenge_data/test_cli.tsv", sep="\t", index_col="sample")
rna_test = pd.read_csv("challenge_data/test_rna.tsv", sep="\t").T.fillna(0)

gender_test = labels_test["gender"]
msi_test = labels_test["msi"]

# (output slot, training target, test target, tuned classifier).
# Slots 0-3 hold gender predictions and slots 4-7 hold MSI predictions, in the
# order RandomForest, SVM, KNN, AdaBoost. Rows are listed in execution order.
rna_jobs = [
    (0, gender, gender_test, RandomForestClassifier(n_estimators=25, max_depth=55)),
    (4, msi, msi_test, RandomForestClassifier(n_estimators=50, max_depth=40)),
    (1, gender, gender_test, SVC(C=0.01, kernel="linear", gamma="auto")),
    (5, msi, msi_test, SVC(C=0.01, kernel="linear", gamma="auto")),
    (2, gender, gender_test, KNeighborsClassifier(n_neighbors=4)),
    (6, msi, msi_test, KNeighborsClassifier(n_neighbors=3)),
    (3, gender, gender_test, AdaBoostClassifier(n_estimators=50)),
    (7, msi, msi_test, AdaBoostClassifier(n_estimators=50)),
]

predictions_rna = [None] * 8
for slot, train_target, test_target, model in rna_jobs:
    predictions_rna[slot] = finalPredict(rna, train_target, rna_test, test_target, model)

('Test accuracy:', 0.4375)
('Test accuracy:', 0.9)
('Test accuracy:', 0.6875)
('Test accuracy:', 0.9625)
('Test accuracy:', 0.425)
('Test accuracy:', 0.9)
('Test accuracy:', 0.85)
('Test accuracy:', 0.8875)


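Save the predictions separately for the RNA and the protein data, together with a mislabeling probability for each sample (the fraction of classifiers that disagree with the recorded label)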

In [106]:
def writeSingleOutput(predictions, extenstiveName, labels=None):
    """Write per-classifier predictions and mislabeling scores to a CSV file.

    Parameters
    ----------
    predictions : sequence of 8 sequences
        ``predictions[0:4]`` are the gender predictions and
        ``predictions[4:8]`` the MSI predictions, each in the order
        RandomForest, SVM, KNN, AdaBoost, with one entry per sample.
    extenstiveName : str
        Path of the CSV file to create (an existing file is overwritten).
    labels : pandas.DataFrame, optional
        Recorded labels with ``gender`` and ``msi`` columns, indexed by sample
        name and in the same row order as the predictions. Defaults to the
        notebook-level ``labels_test``.

    Output
    ------
    One row per sample: sample name, recorded gender and MSI, the eight
    predictions, then three scores. ``gender_mislabeled_prob`` and
    ``msi_mislabeld_prob`` are the fractions of the four classifiers that
    disagree with the recorded label; ``mislabeled_prob`` is the larger of the
    two.
    """
    if labels is None:
        labels = labels_test

    names = list(labels.index)
    # Look the truth columns up by name rather than by position, so the result
    # does not depend on the column order of the clinical table.
    gender_truth = list(labels.loc[:, "gender"])
    msi_truth = list(labels.loc[:, "msi"])

    # The misspelled "msi_mislabeld_prob" is kept as is, because files written
    # earlier (and any code reading them) already use that column name. The
    # stray blank that used to precede "RandomForest_gender" is removed.
    header = [
        "sample", "Test_gender", "Test_msi",
        "RandomForest_gender", "SVM_gender", "KNN_gender", "ADA_gender",
        "RandomForest_msi", "SVM_msi", "KNN_msi", "ADA_msi",
        "gender_mislabeled_prob", "msi_mislabeld_prob", "mislabeled_prob",
    ]

    # `with` guarantees the file is closed even if a row fails to serialise.
    with open(extenstiveName, "w") as extensiveOut:
        extensiveOut.write(",".join(header) + "\n")

        for i in range(len(predictions[0])):
            gender_votes = [predictions[j][i] for j in range(0, 4)]
            msi_votes = [predictions[j][i] for j in range(4, 8)]

            # Fraction of the four classifiers that disagree with the record.
            gender_mislabeled = sum(1 for vote in gender_votes if vote != gender_truth[i]) / 4.0
            msi_mislabeled = sum(1 for vote in msi_votes if vote != msi_truth[i]) / 4.0
            mislabeled = max(gender_mislabeled, msi_mislabeled)

            row = [names[i], gender_truth[i], msi_truth[i]]
            row.extend(predictions[j][i] for j in range(len(predictions)))
            row.extend([gender_mislabeled, msi_mislabeled, mislabeled])
            # str() keeps this working for non-string labels or predictions.
            extensiveOut.write(",".join(str(field) for field in row) + "\n")


In [107]:
# Write the protein-based predictions and mislabeling scores to extensive_prot.csv.
writeSingleOutput(predictions_prot, "extensive_prot.csv")

In [108]:
# Write the RNA-based predictions and mislabeling scores to extensive_rna.csv.
writeSingleOutput(predictions_rna, "extensive_rna.csv")