# Multi-omics Enabled Sample Mislabeling Correction Challenge

This notebook uses several classifiers to detect mislabeled samples.

Details about this challenge: https://precision.fda.gov/challenges

## Solution Specifics
* Models in this solution are trained on the first 40 principal components (which explain more than 90% of the variance)
* N/As are replaced with zeros

**Accuracies are slightly worse on average compared to using all or significant features**

## Solution

Import libraries

In [1]:
import os
import sys
import getopt
import re
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

Load data

In [2]:
# Clinical labels (the notebook uses the "gender" and "msi" columns), indexed by sample name.
labels = pd.read_csv("challenge_data/train_cli_corrected.tsv", sep="\t", index_col="sample")

# The misclassification tables (challenge_data/sum_tab_1.csv, challenge_data/sum_tab_2.csv)
# are no longer loaded: the *_corrected files were already fixed by hand according to them.
#
# Both feature tables are transposed after reading so that samples end up on the rows,
# and N/As are replaced with zeros. Rationale for the zero-fill: the missing values may
# not be truly missing measurements but missing genes (Y chromosome, for instance).
prot = (
    pd.read_csv("challenge_data/train_pca_pro_corrected.tsv", sep="\t", index_col=0)
    .T
    .fillna(0)
)
rna = (
    pd.read_csv("challenge_data/train_pca_rna_corrected.tsv", sep="\t", index_col=0)
    .T
    .fillna(0)
)

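Drop the training rows treated as missing: `Training_2` is removed from the protein data and `Training_61` from the RNA data. A matching copy of the labels is created for each dataset so features and labels stay aligned.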

In [3]:
# One sample is removed per dataset; the labels get a per-dataset copy with the
# same sample removed so features and targets keep the same rows.
missing_prot_sample = "Training_2"
missing_rna_sample = "Training_61"

prot = prot.drop(missing_prot_sample, axis=0)
rna = rna.drop(missing_rna_sample, axis=0)
labels_prot = labels.drop(missing_prot_sample, axis=0)
labels_rna = labels.drop(missing_rna_sample, axis=0)

## Explore Parameters

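Use a 10-fold cross-validated grid search to find the best hyper-parameters for each model, separately for each dataset (protein, RNA) and each target (gender, msi).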

In [4]:
# Target vectors for each dataset, taken from the label frame that matches its rows.
gender_prot, msi_prot = labels_prot["gender"], labels_prot["msi"]
gender_rna, msi_rna = labels_rna["gender"], labels_rna["msi"]

In [5]:
def bestParams(x, y, clf, grid):
    """Grid-search ``clf`` with 10-fold cross-validation and return the best parameters.

    Parameters
    ----------
    x : feature matrix handed to ``GridSearchCV.fit``.
    y : target labels handed to ``GridSearchCV.fit``.
    clf : unfitted estimator to tune.
    grid : dict mapping parameter names to the candidate values to try.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search. The best parameters and the
        best cross-validated score are also printed.
    """
    try:
        # On scikit-learn 0.20/0.21, iid=False selects the unweighted mean over
        # folds and silences the deprecation warning about the changing default.
        grid_search = GridSearchCV(clf, param_grid=grid, cv=10, iid=False)
    except TypeError:
        # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where
        # passing it raises TypeError. Those versions always use the unweighted
        # mean, so dropping the argument gives the same scoring.
        grid_search = GridSearchCV(clf, param_grid=grid, cv=10)
    grid_search.fit(x, y)
    print("Tuned params:", grid_search.best_params_)
    print("Tuned best acc:", grid_search.best_score_)
    return grid_search.best_params_

### Random Forest

In [8]:
# Search space: number of trees and maximum tree depth.
grid = dict(
    n_estimators=range(25, 75, 5),
    max_depth=range(40, 80, 5),
)

# Searches run in this order: protein/gender, protein/msi, RNA/gender, RNA/msi.
_rf_jobs = [
    ("gender", prot, gender_prot),
    ("msi", prot, msi_prot),
    ("gender", rna, gender_rna),
    ("msi", rna, msi_rna),
]
_rf_best = []
for _target, _features, _y in _rf_jobs:
    print("Best params for " + _target)
    _rf_best.append(bestParams(_features, _y, RandomForestClassifier(), grid))

(random_forest_prot_gender, random_forest_prot_msi,
 random_forest_rna_gender, random_forest_rna_msi) = _rf_best

Best params for gender
('Tuned params:', {'n_estimators': 55, 'max_depth': 70})
('Tuned best acc:', 0.7323412698412699)
Best params for msi
('Tuned params:', {'n_estimators': 70, 'max_depth': 45})
('Tuned best acc:', 0.8496031746031747)
Best params for gender
('Tuned params:', {'n_estimators': 45, 'max_depth': 70})
('Tuned best acc:', 0.798015873015873)
Best params for msi
('Tuned params:', {'n_estimators': 50, 'max_depth': 65})
('Tuned best acc:', 0.8732142857142857)


### SVM

In [9]:
# Search space: C over powers of ten from 10^-2 to 10^3, two kernels, two gamma modes.
grid = dict(
    C=[10 ** exponent for exponent in range(-2, 4)],
    kernel=["linear", "rbf"],
    gamma=["auto", "scale"],
)

# Searches run in this order: protein/gender, protein/msi, RNA/gender, RNA/msi.
_svm_jobs = [
    ("gender", prot, gender_prot),
    ("msi", prot, msi_prot),
    ("gender", rna, gender_rna),
    ("msi", rna, msi_rna),
]
_svm_best = []
for _target, _features, _y in _svm_jobs:
    print("Best params for " + _target)
    _svm_best.append(bestParams(_features, _y, SVC(), grid))

svm_prot_gender, svm_prot_msi, svm_rna_gender, svm_rna_msi = _svm_best

Best params for gender
('Tuned params:', {'kernel': 'linear', 'C': 0.01, 'gamma': 'auto'})
('Tuned best acc:', 0.6601190476190476)
Best params for msi
('Tuned params:', {'kernel': 'rbf', 'C': 10, 'gamma': 'scale'})
('Tuned best acc:', 0.8638888888888889)
Best params for gender
('Tuned params:', {'kernel': 'rbf', 'C': 10, 'gamma': 'scale'})
('Tuned best acc:', 0.8845238095238095)
Best params for msi
('Tuned params:', {'kernel': 'linear', 'C': 100, 'gamma': 'auto'})
('Tuned best acc:', 0.9)


### KNN

In [10]:
# Search space: neighbourhood sizes 1 through 19.
grid = dict(n_neighbors=range(1, 20))

# Searches run in this order: protein/gender, protein/msi, RNA/gender, RNA/msi.
_knn_jobs = [
    ("gender", prot, gender_prot),
    ("msi", prot, msi_prot),
    ("gender", rna, gender_rna),
    ("msi", rna, msi_rna),
]
_knn_best = []
for _target, _features, _y in _knn_jobs:
    print("Best params for " + _target)
    _knn_best.append(bestParams(_features, _y, KNeighborsClassifier(), grid))

knn_prot_gender, knn_prot_msi, knn_rna_gender, knn_rna_msi = _knn_best

Best params for gender
('Tuned params:', {'n_neighbors': 3})
('Tuned best acc:', 0.7041666666666667)
Best params for msi
('Tuned params:', {'n_neighbors': 8})
('Tuned best acc:', 0.8388888888888889)
Best params for gender
('Tuned params:', {'n_neighbors': 2})
('Tuned best acc:', 0.756547619047619)
Best params for msi
('Tuned params:', {'n_neighbors': 3})
('Tuned best acc:', 0.8746031746031747)


### ADA Boost

In [11]:
# Search space: 40, 45, 50 or 55 boosting rounds.
grid = dict(n_estimators=range(40, 60, 5))

# Searches run in this order: protein/gender, protein/msi, RNA/gender, RNA/msi.
_ada_jobs = [
    ("gender", prot, gender_prot),
    ("msi", prot, msi_prot),
    ("gender", rna, gender_rna),
    ("msi", rna, msi_rna),
]
_ada_best = []
for _target, _features, _y in _ada_jobs:
    print("Best params for " + _target)
    _ada_best.append(bestParams(_features, _y, AdaBoostClassifier(), grid))

ada_prot_gender, ada_prot_msi, ada_rna_gender, ada_rna_msi = _ada_best

Best params for gender
('Tuned params:', {'n_estimators': 50})
('Tuned best acc:', 0.6037698412698412)
Best params for msi
('Tuned params:', {'n_estimators': 50})
('Tuned best acc:', 0.7692460317460317)
Best params for gender
('Tuned params:', {'n_estimators': 50})
('Tuned best acc:', 0.7791666666666666)
Best params for msi
('Tuned params:', {'n_estimators': 55})
('Tuned best acc:', 0.8482142857142858)


## Final Classification

Train every classifier with the best parameters found by the grid search and run it on the test set.
Each classifier is fitted on its own random 80% subset of the training data, so that the models see slightly different data; this is meant to reduce overfitting.

In [18]:
def finalPredict(x_train, y_train, x_test, y_test, clf, random_state=None):
    """Fit ``clf`` on a random 80% subset of the training data and predict the test set.

    Parameters
    ----------
    x_train, y_train : training features and labels; only a shuffled 80%
        subset of them is used for fitting.
    x_test : features to predict.
    y_test : labels the predictions are scored against (used only for the
        printed accuracy).
    clf : unfitted classifier exposing ``fit`` and ``predict``.
    random_state : optional seed forwarded to ``train_test_split`` so the
        subset can be reproduced. The default ``None`` keeps the original
        unseeded behaviour.

    Returns
    -------
    The predictions returned by ``clf.predict(x_test)``.
    """
    # Only the 80% side of the split is used; the held-out 20% is discarded.
    x_train_subset, _x_held_out, y_train_subset, _y_held_out = train_test_split(
        x_train, y_train, test_size=0.2, shuffle=True, random_state=random_state)
    clf.fit(x_train_subset, y_train_subset)
    y_predict = clf.predict(x_test)
    print("Test accuracy:", accuracy_score(y_test, y_predict))
    return y_predict

In [19]:
def createPredictions(train_features, train_gender, train_msi, test_features, test_gender, test_msi, best_params):
    """Train the four tuned classifier families for both targets and predict the test set.

    ``best_params`` maps "<family>_<target>" keys (families: random_forest,
    svm, knn, ada; targets: gender, msi) to the tuned parameter dicts.

    Returns a list of eight prediction vectors: slots 0-3 hold the gender
    predictions of RandomForest, SVM, KNN and AdaBoost (in that order) and
    slots 4-7 hold the msi predictions of the same families.
    """
    # (gender slot, gender key, msi key, factory building the classifier from its tuned params)
    families = [
        (0, "random_forest_gender", "random_forest_msi",
         lambda tuned: RandomForestClassifier(n_estimators=tuned["n_estimators"],
                                              max_depth=tuned["max_depth"])),
        (1, "svm_gender", "svm_msi",
         lambda tuned: SVC(C=tuned["C"], kernel=tuned["kernel"], gamma=tuned["gamma"])),
        (2, "knn_gender", "knn_msi",
         lambda tuned: KNeighborsClassifier(n_neighbors=tuned["n_neighbors"])),
        (3, "ada_gender", "ada_msi",
         lambda tuned: AdaBoostClassifier(n_estimators=tuned["n_estimators"])),
    ]

    predictions = [None] * 8
    for slot, gender_key, msi_key, build in families:
        # For each family the gender model is trained first, then the msi model.
        gender_clf = build(best_params[gender_key])
        predictions[slot] = finalPredict(train_features, train_gender,
                                         test_features, test_gender, gender_clf)
        msi_clf = build(best_params[msi_key])
        predictions[slot + 4] = finalPredict(train_features, train_msi,
                                             test_features, test_msi, msi_clf)

    return predictions

Protein models

In [20]:
# The test data goes through the same steps as the training data:
# read, transpose so samples are on the rows, replace N/As with zeros.
labels_test = pd.read_csv("challenge_data/test_cli.tsv", sep="\t", index_col="sample")
proteins_test = (
    pd.read_csv("challenge_data/test_pca_pro.tsv", sep="\t", index_col=0)
    .T
    .fillna(0)
)

gender_test, msi_test = labels_test["gender"], labels_test["msi"]

# Tuned parameters from the protein grid searches, under the keys createPredictions reads.
best_params = dict(
    random_forest_gender=random_forest_prot_gender,
    random_forest_msi=random_forest_prot_msi,
    svm_gender=svm_prot_gender,
    svm_msi=svm_prot_msi,
    knn_gender=knn_prot_gender,
    knn_msi=knn_prot_msi,
    ada_gender=ada_prot_gender,
    ada_msi=ada_prot_msi,
)

predictions_prot = createPredictions(
    prot, gender_prot, msi_prot,
    proteins_test, gender_test, msi_test,
    best_params,
)

('Test accuracy:', 0.4125)
('Test accuracy:', 0.7875)
('Test accuracy:', 0.3875)
('Test accuracy:', 0.775)
('Test accuracy:', 0.375)
('Test accuracy:', 0.825)
('Test accuracy:', 0.4875)
('Test accuracy:', 0.7375)


RNA models

In [21]:
# The test data goes through the same steps as the training data:
# read, transpose so samples are on the rows, replace N/As with zeros.
labels_test = pd.read_csv("challenge_data/test_cli.tsv", sep="\t", index_col="sample")
rna_test = (
    pd.read_csv("challenge_data/test_pca_rna.tsv", sep="\t", index_col=0)
    .T
    .fillna(0)
)

gender_test, msi_test = labels_test["gender"], labels_test["msi"]

# Tuned parameters from the RNA grid searches, under the keys createPredictions reads.
best_params = dict(
    random_forest_gender=random_forest_rna_gender,
    random_forest_msi=random_forest_rna_msi,
    svm_gender=svm_rna_gender,
    svm_msi=svm_rna_msi,
    knn_gender=knn_rna_gender,
    knn_msi=knn_rna_msi,
    ada_gender=ada_rna_gender,
    ada_msi=ada_rna_msi,
)

predictions_rna = createPredictions(
    rna, gender_rna, msi_rna,
    rna_test, gender_test, msi_test,
    best_params,
)

('Test accuracy:', 0.4)
('Test accuracy:', 0.8)
('Test accuracy:', 0.4625)
('Test accuracy:', 0.525)
('Test accuracy:', 0.4375)
('Test accuracy:', 0.7)
('Test accuracy:', 0.4875)
('Test accuracy:', 0.5875)


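Save separate prediction files for RNA and proteins. Each row also carries a mislabeling probability: the fraction of the four classifiers whose prediction disagrees with the recorded label, for gender and for msi, plus the larger of the two.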

In [49]:
def writeSingleOutput(predictions, extenstiveName, labels=None):
    """Write per-sample predictions and mislabeling fractions to a CSV file.

    Parameters
    ----------
    predictions : sequence of eight prediction vectors as returned by
        ``createPredictions``: entries 0-3 are gender predictions and
        entries 4-7 are msi predictions. They are matched to ``labels`` by
        position.
    extenstiveName : path of the CSV file to write (parameter spelling kept
        for backward compatibility). A missing parent directory is created.
    labels : optional DataFrame indexed by sample name with "gender" and
        "msi" columns. Defaults to the notebook-level ``labels_test``.

    For each sample the file holds the recorded labels, the eight predictions
    and three fractions: the share of the four gender models that disagree
    with the recorded gender, the same for msi, and the larger of the two.
    """
    if labels is None:
        # Backward-compatible default: the test labels loaded earlier in the notebook.
        labels = labels_test

    names = list(labels.index)
    # Look the label columns up by name rather than by position, so the result
    # does not depend on the column order of the labels file.
    true_gender = list(labels.loc[:, "gender"])
    true_msi = list(labels.loc[:, "msi"])

    gender_predictions = predictions[0:4]
    msi_predictions = predictions[4:8]

    out_dir = os.path.dirname(extenstiveName)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # NOTE: the space before "RandomForest_gender" in the header, the space
    # before the first fraction in each row and the "msi_mislabeld_prob" spelling
    # are kept as-is so existing readers of these files keep working.
    header = "sample,Test_gender,Test_msi, RandomForest_gender,SVM_gender,KNN_gender,ADA_gender,RandomForest_msi,SVM_msi,KNN_msi,ADA_msi,gender_mislabeled_prob,msi_mislabeld_prob,mislabeled_prob\n"

    # The context manager closes the file even if a row fails to serialize.
    with open(extenstiveName, "w") as extensiveOut:
        extensiveOut.write(header)

        for i in range(len(predictions[0])):
            gender_mislabeled = sum(1 for p in gender_predictions if p[i] != true_gender[i]) / 4.0
            msi_mislabeled = sum(1 for p in msi_predictions if p[i] != true_msi[i]) / 4.0
            mislabeled = max(gender_mislabeled, msi_mislabeled)

            fields = [names[i], true_gender[i], true_msi[i]] + [p[i] for p in predictions]
            extensiveOut.write(",".join(str(field) for field in fields))
            extensiveOut.write(", " + str(gender_mislabeled) + "," + str(msi_mislabeled) + "," + str(mislabeled))
            extensiveOut.write("\n")


In [57]:
# Write the protein-model predictions and per-sample mislabeling fractions to CSV.
writeSingleOutput(predictions_prot, "results/extensive_prot_pca_features.csv")

In [58]:
# Write the RNA-model predictions and per-sample mislabeling fractions to CSV.
writeSingleOutput(predictions_rna, "results/extensive_rna_pca_features.csv")