In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import confusion_matrix, cohen_kappa_score, classification_report, precision_score, f1_score, recall_score
from sklearn.feature_selection import RFE, RFECV
import warnings
import random

# Seed every RNG the notebook relies on so a fresh "Restart & Run All" reproduces the numbers:
# - `random` drives the shuffles inside k_fold;
# - numpy's global RNG is what scikit-learn estimators fall back to when they are created
#   without an explicit random_state (as the random forests below are).
random.seed(1024)
np.random.seed(1024)
sns.set()
#warnings.filterwarnings("ignore")


# Data Processing

### Load Data

In [2]:
# Read the tab-separated table in one pass (low_memory=False) and preview the first five rows.
data = pd.read_csv(
    "./data/covid19.tsv",
    sep='\t',
    low_memory=False,
)
data.head()

Unnamed: 0,rowid,ccms_row_id,Algorithm,Filename,Cluster_index,Peptide,Unmodified_sequence,Charge,_dyn_#Intensity_for_cluster,_dyn_#Intensity_for_unmodified_sequence,...,PSP_site_match,DrugBank_drugs,Parent_mass,Num_PSP_Drugbank_events,Start_AA_1_based,End_AA_1_based,Num_spectra_for_cluster,Num_spectra_for_unmodified_sequence,Num_spectra_for_peptide_variant,Internal_ref_orig_intensity
0,1,1,.MODA.,specs_ms.mgf,960991,"K.[304.207]GARLIPEMDQIFTEVEMTTLE(K,304.207).V",.GARLIPEMDQIFTEVEMTTLEK.,4,36.905893,36.905893,...,,,1580.81,0,,,1,1,1,8204.159
1,2,2,.MODA.,specs_ms.mgf,763982,"I.[304.207]FTEVEMTTLE(K,304.207).V",.FTEVEMTTLEK.,3,11.686782,11.686782,...,,,1934.91,0,,,1,2,2,493689.4
2,3,3,.MSGFPLUS.,specs_ms.mgf,902201,K.[304.207]LYQPEYQEVSTEEQR.E,.LYQPEYQEVSTEEQR.,3,15.690234,15.690234,...,,,2203.09,0,,,5,6,6,195156.6
3,4,4,.MSGFPLUS.,specs_ms.mgf,935503,"K.[304.207]AANSLEAFIFETQD(K,304.207).L",.AANSLEAFIFETQDK.,3,15.016824,15.016824,...,,,2292.24,0,,,3,4,4,2877781.0
4,5,5,.MODA.,specs_ms.mgf,297961,"R.[304.207]YSHDF(N,-56.985)FH.I",.YSHDFNFH.,3,33.768015,33.768015,...,,,1313.66,0,,,3,3,3,70884.4


### Feature Processing

In [3]:
# Build the working table: keep the peptide identifier plus every
# '*intensity_for_peptide_variant*' column, then transpose so that each
# intensity column becomes a row and each peptide becomes a column.
peptide = (
    data[['Peptide'] + [c for c in data.columns if 'intensity_for_peptide_variant' in c]]
    .replace(0.0, np.nan)      # a zero intensity is treated as a missing value
    .set_index('Peptide')
    .T
)

# Row names: keep the first two dot-separated parts and strip the "_dyn_#" prefix.
peptide.index = peptide.index.map(lambda name: '.'.join(name.split('.')[:2]).replace("_dyn_#", ""))
# Class label = text before the first dot of the cleaned row name.
peptide['label'] = peptide.index.map(lambda name: name.split('.')[0])

# Missing-value handling: drop every column that contains at least one NaN.
# Alternatives that were tried and left disabled:
#   peptide = peptide.fillna(0.0)                                                          # fill NaN with 0.0
#   peptide = peptide.fillna(peptide.mean(axis=0, skipna=True, numeric_only=True).fillna(0), axis=0)
peptide = peptide.dropna(axis=1)

In [4]:
# (rows, columns) of the filtered table; the column count includes the added 'label' column.
peptide.shape

(92, 10631)

### Data Splitting

In [5]:
# Bucket every row's feature values by its class label. The last column is
# 'label', so it is sliced off before the values are stored.
healthy, severe, non_severe, non_symptomatic = [], [], [], []
bucket_for_label = {
    'Healthy': healthy,
    'Symptomatic-non-COVID-19': non_symptomatic,
    'Non-severe-COVID-19': non_severe,
    'Severe-COVID-19': severe,
}
for row in peptide.iterrows():
    target = bucket_for_label.get(row[1]['label'])
    if target is not None:          # rows carrying any other label are skipped
        target.append(list(row[1])[:-1])

# Rows are stacked class by class in this fixed order; k_fold indexes all_data
# with hard-coded row ranges that rely on it.
all_data = np.array(healthy + severe + non_severe + non_symptomatic)

def k_fold(data, k, class_sizes=(22, 18, 25, 25), train_sizes=(16, 14, 17, 17), val_sizes=(3, 2, 4, 4)):
    """Draw ``k`` random stratified train/validation/test splits of ``data``.

    Despite the name this is repeated random sub-sampling, not a partition into
    k folds: on every repetition each class's row indices are reshuffled with
    the global ``random`` module and cut into train / validation / test slices.

    Parameters
    ----------
    data : array supporting integer-list row indexing (e.g. a 2-D ndarray)
        Rows must be grouped by class, in contiguous blocks whose lengths are
        given by ``class_sizes``.
    k : int
        Number of splits to draw.
    class_sizes : sequence of int, optional
        Number of consecutive rows belonging to class 0, 1, 2, ...
    train_sizes, val_sizes : sequence of int, optional
        Rows per class assigned to the training and validation slices; the
        remaining rows of each class form the test slice.

    Returns
    -------
    x_train, y_train, x_val, y_val, x_test, y_test : lists of length ``k``
        ``x_*`` entries are row subsets of ``data``; ``y_*`` entries are integer
        label arrays (class position in ``class_sizes``) aligned with them.

    Raises
    ------
    ValueError
        If the size sequences differ in length or a class is asked for more
        train + validation rows than it has.
    """
    if not (len(class_sizes) == len(train_sizes) == len(val_sizes)):
        raise ValueError("class_sizes, train_sizes and val_sizes must have the same length")
    for size, n_train, n_val in zip(class_sizes, train_sizes, val_sizes):
        if n_train + n_val > size:
            raise ValueError("train + validation size exceeds the class size")

    # One contiguous block of row indices per class, laid out back to back.
    class_indices = []
    start = 0
    for size in class_sizes:
        class_indices.append(list(range(start, start + size)))
        start += size

    x_train, y_train, x_val, y_val, x_test, y_test = [], [], [], [], [], []
    for _ in range(k):
        # Shuffle in place, class by class; the lists persist across repetitions,
        # so every repetition reshuffles the previous permutation.
        for indices in class_indices:
            random.shuffle(indices)

        train_rows, val_rows, test_rows = [], [], []
        train_labels, val_labels, test_labels = [], [], []
        for label, (indices, n_train, n_val) in enumerate(zip(class_indices, train_sizes, val_sizes)):
            cut = n_train + n_val
            train_part = indices[:n_train]
            val_part = indices[n_train:cut]
            test_part = indices[cut:]

            train_rows += train_part
            val_rows += val_part
            test_rows += test_part
            # Labels are sized from the actual slices, so x and y can never
            # drift apart (the test labels used to reuse the validation counts).
            train_labels += [label] * len(train_part)
            val_labels += [label] * len(val_part)
            test_labels += [label] * len(test_part)

        x_train.append(data[train_rows])
        y_train.append(np.array(train_labels))
        x_val.append(data[val_rows])
        y_val.append(np.array(val_labels))
        x_test.append(data[test_rows])
        y_test.append(np.array(test_labels))

    return x_train, y_train, x_val, y_val, x_test, y_test

# Draw 20 reshuffled train/validation/test splits of the full feature matrix;
# each returned name is a list with one entry per split.
X_train, Y_train,  X_val, Y_val, X_test, Y_test = k_fold(all_data, 20)

# Evaluation

In [6]:
def evaluation(Y_true, Y_pred):
    """Print a metric summary for predicted versus true labels.

    Printed in order: the confusion matrix, accuracy, weighted-average
    precision, recall and F1, and Cohen's kappa (scalars rounded to four
    decimals), followed by blank lines as a separator. Nothing is returned.

    Parameters
    ----------
    Y_true, Y_pred : array-like
        True and predicted labels; both are converted with ``np.array``.
    """
    truth, guess = np.array(Y_true), np.array(Y_pred)

    print("Confusion Matrix:\n", confusion_matrix(truth, guess))

    # (caption, lazily evaluated metric): each score is computed right before it
    # is printed, in the listed order.
    scalar_metrics = (
        ("Accuracy:", lambda: np.mean(truth == guess)),
        ("Precision:", lambda: precision_score(truth, guess, average='weighted')),
        ("Recall:", lambda: recall_score(truth, guess, average='weighted')),
        ("F1-score", lambda: f1_score(truth, guess, average='weighted')),
        ("Cohen Kappa Score:", lambda: cohen_kappa_score(truth, guess)),
    )
    for caption, compute in scalar_metrics:
        print(caption, round(compute(), 4))

    print("\n"*3)

# Model

## Logistic Regression

### Original Test

In [7]:
# Baseline: unpenalised logistic regression on every feature, refit on each
# split; predictions and labels are pooled over all splits before scoring.
Y_train_pred, Y_train_true = [], []
Y_val_pred, Y_val_true = [], []
Y_test_pred, Y_test_true = [], []
for x_train, y_train, x_val, y_val, x_test, y_test in zip(X_train, Y_train, X_val, Y_val, X_test, Y_test):
    lr = LogisticRegression(solver='lbfgs', penalty='none')
    lr.fit(x_train, y_train)

    y_train_pred = lr.predict(x_train)
    y_val_pred = lr.predict(x_val)
    y_test_pred = lr.predict(x_test)

    Y_train_pred.extend(y_train_pred)
    Y_train_true.extend(y_train.reshape(-1))
    Y_val_pred.extend(y_val_pred)
    Y_val_true.extend(y_val.reshape(-1))
    Y_test_pred.extend(y_test_pred)
    Y_test_true.extend(y_test.reshape(-1))

for banner, truth, predicted in (
    ("Prediction Results On Training Data!\n", Y_train_true, Y_train_pred),
    ("Prediction Results On Validation Data!\n", Y_val_true, Y_val_pred),
    ("Prediction Results On Test Data!\n", Y_test_true, Y_test_pred),
):
    print(banner)
    evaluation(truth, predicted)

Prediction Results On Training Data!

Confusion Matrix:
 [[320   0   0   0]
 [  0 280   0   0]
 [  0   0 340   0]
 [  0   0   0 340]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score 1.0
Cohen Kappa Score: 1.0




Prediction Results On Validation Data!

Confusion Matrix:
 [[50  2  3  5]
 [ 0 24  8  8]
 [ 3 19 50  8]
 [ 9 15  2 54]]
Accuracy: 0.6846
Precision: 0.7134
Recall: 0.6846
F1-score 0.6926
Cohen Kappa Score: 0.5773




Prediction Results On Test Data!

Confusion Matrix:
 [[51  1  4  4]
 [ 0 13 20  7]
 [ 3 22 46  9]
 [10 13  2 55]]
Accuracy: 0.6346
Precision: 0.6469
Recall: 0.6346
F1-score 0.6394
Cohen Kappa Score: 0.5064






### L_1 Norm

In [8]:
# L1-penalised logistic regression (liblinear) on every feature, refit on each
# split; predictions and labels are pooled over all splits before scoring.
Y_train_pred, Y_train_true = [], []
Y_val_pred, Y_val_true = [], []
Y_test_pred, Y_test_true = [], []
for x_train, y_train, x_val, y_val, x_test, y_test in zip(X_train, Y_train, X_val, Y_val, X_test, Y_test):
    lr = LogisticRegression(penalty='l1', solver='liblinear')
    lr.fit(x_train, y_train)

    y_train_pred = lr.predict(x_train)
    y_val_pred = lr.predict(x_val)
    y_test_pred = lr.predict(x_test)

    Y_train_pred.extend(y_train_pred)
    Y_train_true.extend(y_train.reshape(-1))
    Y_val_pred.extend(y_val_pred)
    Y_val_true.extend(y_val.reshape(-1))
    Y_test_pred.extend(y_test_pred)
    Y_test_true.extend(y_test.reshape(-1))

for banner, truth, predicted in (
    ("Prediction Results On Training Data!\n", Y_train_true, Y_train_pred),
    ("Prediction Results On Validation Data!\n", Y_val_true, Y_val_pred),
    ("Prediction Results On Test Data!\n", Y_test_true, Y_test_pred),
):
    print(banner)
    evaluation(truth, predicted)

Prediction Results On Training Data!

Confusion Matrix:
 [[320   0   0   0]
 [  0 280   0   0]
 [  0   0 340   0]
 [  0   0   0 340]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score 1.0
Cohen Kappa Score: 1.0




Prediction Results On Validation Data!

Confusion Matrix:
 [[52  0  4  4]
 [ 1 23 10  6]
 [ 0 14 64  2]
 [12 22  2 44]]
Accuracy: 0.7038
Precision: 0.7325
Recall: 0.7038
F1-score 0.7087
Cohen Kappa Score: 0.6033




Prediction Results On Test Data!

Confusion Matrix:
 [[52  0  2  6]
 [ 0 24 14  2]
 [ 1  9 69  1]
 [ 7 17  0 56]]
Accuracy: 0.7731
Precision: 0.7887
Recall: 0.7731
F1-score 0.7771
Cohen Kappa Score: 0.6932






### L_2 Norm

In [9]:
# L2-penalised logistic regression (liblinear) on every feature, refit on each
# split; predictions and labels are pooled over all splits before scoring.
Y_train_pred, Y_train_true = [], []
Y_val_pred, Y_val_true = [], []
Y_test_pred, Y_test_true = [], []
for x_train, y_train, x_val, y_val, x_test, y_test in zip(X_train, Y_train, X_val, Y_val, X_test, Y_test):
    lr = LogisticRegression(penalty='l2', solver='liblinear')
    lr.fit(x_train, y_train)

    y_train_pred = lr.predict(x_train)
    y_val_pred = lr.predict(x_val)
    y_test_pred = lr.predict(x_test)

    Y_train_pred.extend(y_train_pred)
    Y_train_true.extend(y_train.reshape(-1))
    Y_val_pred.extend(y_val_pred)
    Y_val_true.extend(y_val.reshape(-1))
    Y_test_pred.extend(y_test_pred)
    Y_test_true.extend(y_test.reshape(-1))

for banner, truth, predicted in (
    ("Prediction Results On Training Data!\n", Y_train_true, Y_train_pred),
    ("Prediction Results On Validation Data!\n", Y_val_true, Y_val_pred),
    ("Prediction Results On Test Data!\n", Y_test_true, Y_test_pred),
):
    print(banner)
    evaluation(truth, predicted)

Prediction Results On Training Data!

Confusion Matrix:
 [[320   0   0   0]
 [  0 280   0   0]
 [  0   0 340   0]
 [  0   0   0 340]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score 1.0
Cohen Kappa Score: 1.0




Prediction Results On Validation Data!

Confusion Matrix:
 [[50  2  4  4]
 [ 0 26  8  6]
 [ 3 14 61  2]
 [13 16  2 49]]
Accuracy: 0.7154
Precision: 0.7412
Recall: 0.7154
F1-score 0.7208
Cohen Kappa Score: 0.6186




Prediction Results On Test Data!

Confusion Matrix:
 [[56  0  1  3]
 [ 0 15 19  6]
 [ 2 16 58  4]
 [14 12  0 54]]
Accuracy: 0.7038
Precision: 0.7099
Recall: 0.7038
F1-score 0.7034
Cohen Kappa Score: 0.5993






## L1_norm Feature Reduction

In [10]:
# Rank features by L1-penalised logistic-regression weights: for every training
# split, average |coef_| over the per-class rows and accumulate the result.
# NOTE(review): X_train pools reshuffled splits of all_data, so this ranking can
# see rows that later fall into the val/test parts of the new splits drawn
# below -- the downstream scores are likely optimistic; confirm before reporting.
Coeffienct = np.zeros(all_data.shape[1])
for x_train, y_train in zip(X_train, Y_train):
    lr = LogisticRegression(penalty='l1', solver='liblinear')
    lr.fit(x_train, y_train)

    coeffient = np.abs(lr.coef_).mean(axis=0)
    Coeffienct += coeffient

# Keep every feature whose accumulated weight is at least the 64th largest
# value (index 63); ties at the threshold are all kept.
C = sorted(Coeffienct, reverse=True)
threshold = C[63]
Indexes = Coeffienct >= threshold

new_data = all_data.T[Indexes].T

# Fresh set of 100 splits on the reduced feature matrix.
X_train_l1, Y_train_l1, X_val_l1, Y_val_l1, X_test_l1, Y_test_l1 = k_fold(new_data, 100)

### Logistic Regression -L2

In [11]:
# L2-penalised logistic regression on the L1-selected features, refit on each
# of the reduced-feature splits; results are pooled over all splits.
Y_train_pred, Y_train_true = [], []
Y_val_pred, Y_val_true = [], []
Y_test_pred, Y_test_true = [], []
for x_train, y_train, x_val, y_val, x_test, y_test in zip(X_train_l1, Y_train_l1, X_val_l1, Y_val_l1, X_test_l1, Y_test_l1):
    lr = LogisticRegression(penalty='l2', solver='liblinear')
    lr.fit(x_train, y_train)

    y_train_pred = lr.predict(x_train)
    y_val_pred = lr.predict(x_val)
    y_test_pred = lr.predict(x_test)

    Y_train_pred.extend(y_train_pred)
    Y_train_true.extend(y_train.reshape(-1))
    Y_val_pred.extend(y_val_pred)
    Y_val_true.extend(y_val.reshape(-1))
    Y_test_pred.extend(y_test_pred)
    Y_test_true.extend(y_test.reshape(-1))

for banner, truth, predicted in (
    ("Prediction Results On Training Data!\n", Y_train_true, Y_train_pred),
    ("Prediction Results On Validation Data!\n", Y_val_true, Y_val_pred),
    ("Prediction Results On Test Data!\n", Y_test_true, Y_test_pred),
):
    print(banner)
    evaluation(truth, predicted)

Prediction Results On Training Data!

Confusion Matrix:
 [[1600    0    0    0]
 [   0 1400    0    0]
 [   0    0 1700    0]
 [   0    0    0 1700]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score 1.0
Cohen Kappa Score: 1.0




Prediction Results On Validation Data!

Confusion Matrix:
 [[260   3   6  31]
 [  4 132  45  19]
 [  6  61 316  17]
 [ 40  43  18 299]]
Accuracy: 0.7746
Precision: 0.7824
Recall: 0.7746
F1-score 0.7772
Cohen Kappa Score: 0.695




Prediction Results On Test Data!

Confusion Matrix:
 [[274   4   2  20]
 [  3 133  40  24]
 [  3  67 315  15]
 [ 34  41  16 309]]
Accuracy: 0.7931
Precision: 0.8031
Recall: 0.7931
F1-score 0.7963
Cohen Kappa Score: 0.7203






## L2_norm Feature Reduction

In [12]:
# Rank features by L2-penalised logistic-regression weights: for every training
# split, average |coef_| over the per-class rows and accumulate the result.
# NOTE(review): X_train pools reshuffled splits of all_data, so this ranking can
# see rows that later fall into the val/test parts of the new splits drawn
# below -- the downstream scores are likely optimistic; confirm before reporting.
Coeffienct = np.zeros(all_data.shape[1])
for x_train, y_train in zip(X_train, Y_train):
    lr = LogisticRegression(penalty='l2', solver='liblinear')
    lr.fit(x_train, y_train)

    coeffient = np.abs(lr.coef_).mean(axis=0)
    Coeffienct += coeffient

# Keep every feature whose accumulated weight is at least the 100th largest
# value (index 99); ties at the threshold are all kept.
C = sorted(Coeffienct, reverse=True)
threshold = C[99]
Indexes = Coeffienct >= threshold

new_data = all_data.T[Indexes].T

# Fresh set of 100 splits on the reduced feature matrix.
X_train_l2, Y_train_l2, X_val_l2, Y_val_l2, X_test_l2, Y_test_l2 = k_fold(new_data, 100)

In [13]:
# L2-penalised logistic regression on the L2-selected features, refit on each
# of the reduced-feature splits; results are pooled over all splits.
Y_train_pred, Y_train_true = [], []
Y_val_pred, Y_val_true = [], []
Y_test_pred, Y_test_true = [], []
for x_train, y_train, x_val, y_val, x_test, y_test in zip(X_train_l2, Y_train_l2, X_val_l2, Y_val_l2, X_test_l2, Y_test_l2):
    lr = LogisticRegression(penalty='l2', solver='liblinear')
    lr.fit(x_train, y_train)

    y_train_pred = lr.predict(x_train)
    y_val_pred = lr.predict(x_val)
    y_test_pred = lr.predict(x_test)

    Y_train_pred.extend(y_train_pred)
    Y_train_true.extend(y_train.reshape(-1))
    Y_val_pred.extend(y_val_pred)
    Y_val_true.extend(y_val.reshape(-1))
    Y_test_pred.extend(y_test_pred)
    Y_test_true.extend(y_test.reshape(-1))

for banner, truth, predicted in (
    ("Prediction Results On Training Data!\n", Y_train_true, Y_train_pred),
    ("Prediction Results On Validation Data!\n", Y_val_true, Y_val_pred),
    ("Prediction Results On Test Data!\n", Y_test_true, Y_test_pred),
):
    print(banner)
    evaluation(truth, predicted)

Prediction Results On Training Data!

Confusion Matrix:
 [[1600    0    0    0]
 [   0 1400    0    0]
 [   0    0 1700    0]
 [   0    0    0 1700]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score 1.0
Cohen Kappa Score: 1.0




Prediction Results On Validation Data!

Confusion Matrix:
 [[257   3  16  24]
 [  2 130  40  28]
 [  2  67 309  22]
 [ 28  49   3 320]]
Accuracy: 0.7815
Precision: 0.7938
Recall: 0.7815
F1-score 0.7861
Cohen Kappa Score: 0.7043




Prediction Results On Test Data!

Confusion Matrix:
 [[265   1  11  23]
 [  4 130  32  34]
 [  5  64 309  22]
 [ 31  42   1 326]]
Accuracy: 0.7923
Precision: 0.8019
Recall: 0.7923
F1-score 0.7954
Cohen Kappa Score: 0.7187






## Random Forest Reduction

In [14]:
# Accumulate random-forest feature importances over every training split.
# NOTE(review): the forest is built without random_state, so the ranking varies
# between runs unless numpy's global RNG has been seeded -- confirm.
Coeffienct = np.zeros(all_data.shape[1])
for x_train, y_train in zip(X_train, Y_train):
    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=5,
        criterion='entropy',
        max_features='sqrt',
    )
    rf.fit(x_train, y_train)

    coeffient = rf.feature_importances_
    Coeffienct += coeffient


In [36]:
# Keep the features whose accumulated importance is strictly greater than the
# 49th largest value (index 48). Unlike the other selection cells, this uses a
# strict '>' comparison, so the threshold feature itself is excluded.
# NOTE(review): the importances were accumulated over splits of all_data, so
# rows that land in the val/test parts of the new splits may have influenced
# the selection -- confirm before reporting scores.
C = sorted(Coeffienct, reverse=True)
threshold = C[48]
Indexes = Coeffienct > threshold
rf_new_data = all_data.T[Indexes].T

# Fresh set of 100 splits on the reduced feature matrix.
X_train_rf, Y_train_rf, X_val_rf, Y_val_rf, X_test_rf, Y_test_rf = k_fold(rf_new_data, 100)

In [37]:
# Random forest on the forest-selected features, refit on each of the
# reduced-feature splits; results are pooled over all splits.
Y_train_pred, Y_train_true = [], []
Y_val_pred, Y_val_true = [], []
Y_test_pred, Y_test_true = [], []
for x_train, y_train, x_val, y_val, x_test, y_test in zip(X_train_rf, Y_train_rf, X_val_rf, Y_val_rf, X_test_rf, Y_test_rf):
    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=5,
        criterion='entropy',
        max_features='sqrt',
    )
    rf.fit(x_train, y_train)

    y_train_pred = rf.predict(x_train)
    y_val_pred = rf.predict(x_val)
    y_test_pred = rf.predict(x_test)

    Y_train_pred.extend(y_train_pred)
    Y_train_true.extend(y_train.reshape(-1))
    Y_val_pred.extend(y_val_pred)
    Y_val_true.extend(y_val.reshape(-1))
    Y_test_pred.extend(y_test_pred)
    Y_test_true.extend(y_test.reshape(-1))

for banner, truth, predicted in (
    ("Prediction Results On Training Data!\n", Y_train_true, Y_train_pred),
    ("Prediction Results On Validation Data!\n", Y_val_true, Y_val_pred),
    ("Prediction Results On Test Data!\n", Y_test_true, Y_test_pred),
):
    print(banner)
    evaluation(truth, predicted)

Prediction Results On Training Data!

Confusion Matrix:
 [[1600    0    0    0]
 [   0 1400    0    0]
 [   0    0 1700    0]
 [   0    0    0 1700]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score 1.0
Cohen Kappa Score: 1.0




Prediction Results On Validation Data!

Confusion Matrix:
 [[287   0   0  13]
 [  0 119  55  26]
 [ 10  14 373   3]
 [ 21  25   0 354]]
Accuracy: 0.8715
Precision: 0.8674
Recall: 0.8715
F1-score 0.8675
Cohen Kappa Score: 0.824




Prediction Results On Test Data!

Confusion Matrix:
 [[282   0   0  18]
 [  0 136  35  29]
 [ 12  20 361   7]
 [ 22  23   0 355]]
Accuracy: 0.8723
Precision: 0.8704
Recall: 0.8723
F1-score 0.8708
Cohen Kappa Score: 0.8256






In [38]:
# L2-penalised logistic regression on the forest-selected features, refit on
# each of the reduced-feature splits; results are pooled over all splits.
Y_train_pred, Y_train_true = [], []
Y_val_pred, Y_val_true = [], []
Y_test_pred, Y_test_true = [], []
for x_train, y_train, x_val, y_val, x_test, y_test in zip(X_train_rf, Y_train_rf, X_val_rf, Y_val_rf, X_test_rf, Y_test_rf):
    lr = LogisticRegression(penalty='l2', solver='liblinear')
    lr.fit(x_train, y_train)

    y_train_pred = lr.predict(x_train)
    y_val_pred = lr.predict(x_val)
    y_test_pred = lr.predict(x_test)

    Y_train_pred.extend(y_train_pred)
    Y_train_true.extend(y_train.reshape(-1))
    Y_val_pred.extend(y_val_pred)
    Y_val_true.extend(y_val.reshape(-1))
    Y_test_pred.extend(y_test_pred)
    Y_test_true.extend(y_test.reshape(-1))

for banner, truth, predicted in (
    ("Prediction Results On Training Data!\n", Y_train_true, Y_train_pred),
    ("Prediction Results On Validation Data!\n", Y_val_true, Y_val_pred),
    ("Prediction Results On Test Data!\n", Y_test_true, Y_test_pred),
):
    print(banner)
    evaluation(truth, predicted)

Prediction Results On Training Data!

Confusion Matrix:
 [[1600    0    0    0]
 [   0 1399    1    0]
 [   0    0 1700    0]
 [   0   57    0 1643]]
Accuracy: 0.9909
Precision: 0.9913
Recall: 0.9909
F1-score 0.991
Cohen Kappa Score: 0.9879




Prediction Results On Validation Data!

Confusion Matrix:
 [[263   0   1  36]
 [  0 165  20  15]
 [  2  14 377   7]
 [ 25  60   0 315]]
Accuracy: 0.8615
Precision: 0.8668
Recall: 0.8615
F1-score 0.8629
Cohen Kappa Score: 0.8123




Prediction Results On Test Data!

Confusion Matrix:
 [[275   0   2  23]
 [  1 178   7  14]
 [  6  12 373   9]
 [ 26  52   1 321]]
Accuracy: 0.8823
Precision: 0.888
Recall: 0.8823
F1-score 0.8834
Cohen Kappa Score: 0.8408






# GBDT (currently run with SVC)

In [35]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# A default-parameter SVC on the forest-selected features, refit on each split.
# The estimator is still bound to the name `gb`; the gradient-boosting
# configuration it replaced is kept for reference:
#   GradientBoostingClassifier(n_estimators=8, max_depth=1, learning_rate=0.01)
Y_train_pred, Y_train_true = [], []
Y_val_pred, Y_val_true = [], []
Y_test_pred, Y_test_true = [], []
for x_train, y_train, x_val, y_val, x_test, y_test in zip(X_train_rf, Y_train_rf, X_val_rf, Y_val_rf, X_test_rf, Y_test_rf):
    gb = SVC()
    gb.fit(x_train, y_train)

    y_train_pred = gb.predict(x_train)
    y_val_pred = gb.predict(x_val)
    y_test_pred = gb.predict(x_test)

    Y_train_pred.extend(y_train_pred)
    Y_train_true.extend(y_train.reshape(-1))
    Y_val_pred.extend(y_val_pred)
    Y_val_true.extend(y_val.reshape(-1))
    Y_test_pred.extend(y_test_pred)
    Y_test_true.extend(y_test.reshape(-1))

for banner, truth, predicted in (
    ("Prediction Results On Training Data!\n", Y_train_true, Y_train_pred),
    ("Prediction Results On Validation Data!\n", Y_val_true, Y_val_pred),
    ("Prediction Results On Test Data!\n", Y_test_true, Y_test_pred),
):
    print(banner)
    evaluation(truth, predicted)

Prediction Results On Training Data!

Confusion Matrix:
 [[1545    0    0   55]
 [ 235  449  467  249]
 [ 139   58 1347  156]
 [ 388   52   48 1212]]
Accuracy: 0.7114
Precision: 0.7278
Recall: 0.7114
F1-score 0.6898
Cohen Kappa Score: 0.6122




Prediction Results On Validation Data!

Confusion Matrix:
 [[269   0   0  31]
 [ 30  49  74  47]
 [ 64  34 248  54]
 [129  36  23 212]]
Accuracy: 0.5985
Precision: 0.6003
Recall: 0.5985
F1-score 0.5842
Cohen Kappa Score: 0.4541




Prediction Results On Test Data!

Confusion Matrix:
 [[265   0   0  35]
 [ 38  43  81  38]
 [ 47  22 285  46]
 [129  26  29 216]]
Accuracy: 0.6223
Precision: 0.6208
Recall: 0.6223
F1-score 0.6039
Cohen Kappa Score: 0.4836






# PCA

In [21]:
# PCA fitted on each split's training rows only (64 components), then an
# L2-penalised logistic regression on the projected data; the same fitted
# projection is applied to the validation and test rows.
Y_train_pred, Y_train_true = [], []
Y_val_pred, Y_val_true = [], []
Y_test_pred, Y_test_true = [], []
for x_train, y_train, x_val, y_val, x_test, y_test in zip(X_train, Y_train, X_val, Y_val, X_test, Y_test):
    pca = PCA(n_components=64)
    pca.fit(x_train)
    x_train_pca = pca.transform(x_train)
    x_val_pca = pca.transform(x_val)
    x_test_pca = pca.transform(x_test)

    lr = LogisticRegression(penalty='l2', solver='liblinear')
    lr.fit(x_train_pca, y_train)

    y_train_pred = lr.predict(x_train_pca)
    y_val_pred = lr.predict(x_val_pca)
    y_test_pred = lr.predict(x_test_pca)

    Y_train_pred.extend(y_train_pred)
    Y_train_true.extend(y_train.reshape(-1))
    Y_val_pred.extend(y_val_pred)
    Y_val_true.extend(y_val.reshape(-1))
    Y_test_pred.extend(y_test_pred)
    Y_test_true.extend(y_test.reshape(-1))

for banner, truth, predicted in (
    ("Prediction Results On Training Data!\n", Y_train_true, Y_train_pred),
    ("Prediction Results On Validation Data!\n", Y_val_true, Y_val_pred),
    ("Prediction Results On Test Data!\n", Y_test_true, Y_test_pred),
):
    print(banner)
    evaluation(truth, predicted)

Prediction Results On Training Data!

Confusion Matrix:
 [[320   0   0   0]
 [  0 280   0   0]
 [  0   0 340   0]
 [  0   0   0 340]]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score 1.0
Cohen Kappa Score: 1.0




Prediction Results On Validation Data!

Confusion Matrix:
 [[50  4  1  5]
 [ 0 30  6  4]
 [ 3 26 49  2]
 [15 21  0 44]]
Accuracy: 0.6654
Precision: 0.742
Recall: 0.6654
F1-score 0.6789
Cohen Kappa Score: 0.5599




Prediction Results On Test Data!

Confusion Matrix:
 [[55  1  1  3]
 [ 0 22 14  4]
 [ 2 27 46  5]
 [13 19  1 47]]
Accuracy: 0.6538
Precision: 0.7038
Recall: 0.6538
F1-score 0.6648
Cohen Kappa Score: 0.5408






# LDA

In [27]:
# Rank features by LDA weights: for every training split, average |coef_| over
# its rows and accumulate the result.
# NOTE(review): X_train pools reshuffled splits of all_data, so this ranking can
# see rows that later fall into the val/test parts of the new splits drawn
# below -- the downstream scores are likely optimistic; confirm before reporting.
Coeffienct = np.zeros(all_data.shape[1])
for x_train, y_train in zip(X_train, Y_train):
    lda = LinearDiscriminantAnalysis(n_components=1)
    lda.fit(x_train, y_train)

    coeffient = np.abs(lda.coef_).mean(axis=0)
    Coeffienct += coeffient

# Keep every feature whose accumulated weight is at least the 64th largest
# value (index 63); ties at the threshold are all kept.
C = sorted(Coeffienct, reverse=True)
threshold = C[63]
Indexes = Coeffienct >= threshold

new_data = all_data.T[Indexes].T

# Fresh set of 100 splits on the reduced feature matrix.
X_train_lda, Y_train_lda, X_val_lda, Y_val_lda, X_test_lda, Y_test_lda = k_fold(new_data, 100)

In [28]:
# L1-penalised logistic regression on the LDA-selected features, refit on each
# of the reduced-feature splits; results are pooled over all splits.
Y_train_pred, Y_train_true = [], []
Y_val_pred, Y_val_true = [], []
Y_test_pred, Y_test_true = [], []
for x_train, y_train, x_val, y_val, x_test, y_test in zip(X_train_lda, Y_train_lda, X_val_lda, Y_val_lda, X_test_lda, Y_test_lda):
    lr = LogisticRegression(penalty='l1', solver='liblinear')
    lr.fit(x_train, y_train)

    y_train_pred = lr.predict(x_train)
    y_val_pred = lr.predict(x_val)
    y_test_pred = lr.predict(x_test)

    Y_train_pred.extend(y_train_pred)
    Y_train_true.extend(y_train.reshape(-1))
    Y_val_pred.extend(y_val_pred)
    Y_val_true.extend(y_val.reshape(-1))
    Y_test_pred.extend(y_test_pred)
    Y_test_true.extend(y_test.reshape(-1))

for banner, truth, predicted in (
    ("Prediction Results On Training Data!\n", Y_train_true, Y_train_pred),
    ("Prediction Results On Validation Data!\n", Y_val_true, Y_val_pred),
    ("Prediction Results On Test Data!\n", Y_test_true, Y_test_pred),
):
    print(banner)
    evaluation(truth, predicted)

Prediction Results On Training Data!

Confusion Matrix:
 [[1501   40    1   58]
 [   1  479  625  295]
 [   1   20 1615   64]
 [ 132   79  267 1222]]
Accuracy: 0.7527
Precision: 0.7681
Recall: 0.7527
F1-score 0.7342
Cohen Kappa Score: 0.6671




Prediction Results On Validation Data!

Confusion Matrix:
 [[244   8   1  47]
 [  1   6 139  54]
 [  4  34 282  80]
 [ 68  63 159 110]]
Accuracy: 0.4938
Precision: 0.4516
Recall: 0.4938
F1-score 0.4633
Cohen Kappa Score: 0.3011




Prediction Results On Test Data!

Confusion Matrix:
 [[249  11   0  40]
 [  1   7 130  62]
 [  6  36 264  94]
 [ 65  76 138 121]]
Accuracy: 0.4931
Precision: 0.4574
Recall: 0.4931
F1-score 0.4698
Cohen Kappa Score: 0.3024






In [7]:
# LDA fitted on each split's training rows (3 discriminant components), then an
# unpenalised logistic regression on the projected data; the same fitted
# projection is applied to the validation and test rows.
Y_train_pred, Y_train_true = [], []
Y_val_pred, Y_val_true = [], []
Y_test_pred, Y_test_true = [], []
for x_train, y_train, x_val, y_val, x_test, y_test in zip(X_train, Y_train, X_val, Y_val, X_test, Y_test):
    lda = LinearDiscriminantAnalysis(n_components=3)
    lda.fit(x_train, y_train)
    x_train_lda = lda.transform(x_train)
    x_val_lda = lda.transform(x_val)
    x_test_lda = lda.transform(x_test)

    lr = LogisticRegression(penalty='none', max_iter=1000)
    lr.fit(x_train_lda, y_train)

    y_train_pred = lr.predict(x_train_lda)
    y_val_pred = lr.predict(x_val_lda)
    y_test_pred = lr.predict(x_test_lda)

    Y_train_pred.extend(y_train_pred)
    Y_train_true.extend(y_train.reshape(-1))
    Y_val_pred.extend(y_val_pred)
    Y_val_true.extend(y_val.reshape(-1))
    Y_test_pred.extend(y_test_pred)
    Y_test_true.extend(y_test.reshape(-1))

for banner, truth, predicted in (
    ("Prediction Results On Training Data!\n", Y_train_true, Y_train_pred),
    ("Prediction Results On Validation Data!\n", Y_val_true, Y_val_pred),
    ("Prediction Results On Test Data!\n", Y_test_true, Y_test_pred),
):
    print(banner)
    evaluation(truth, predicted)

Prediction Results On Training Data!

Confusion Matrix:
 [[276   7  20  17]
 [  3 234  23  20]
 [ 15   9 281  35]
 [ 16  28  22 274]]
Accuracy: 0.832
Precision: 0.8328
Recall: 0.832
F1-score 0.8323
Cohen Kappa Score: 0.7755




Prediction Results On Validation Data!

Confusion Matrix:
 [[46  0  7  7]
 [ 0 16 15  9]
 [10 13 47 10]
 [19 14 23 24]]
Accuracy: 0.5115
Precision: 0.5037
Recall: 0.5115
F1-score 0.4983
Cohen Kappa Score: 0.3399




Prediction Results On Test Data!

Confusion Matrix:
 [[46  0  7  7]
 [ 0 21 12  7]
 [10 20 38 12]
 [26 16 11 27]]
Accuracy: 0.5077
Precision: 0.5148
Recall: 0.5077
F1-score 0.4991
Cohen Kappa Score: 0.3438






In [77]:
# Rebuild the peptide table (without the 'label' column) so that its column
# names can be used to name the selected features.
# The zero -> NaN replacement and the NaN-column drop mirror the Feature
# Processing cell: all_data, and therefore the boolean mask `Indexes`, only
# covers the columns that survive that drop, so the names must be filtered the
# same way to line up with the mask position by position.
# Note: this overwrites the global `peptide` with a frame that has no 'label'.
peptide = data[['Peptide'] + [c for c in data.columns if 'intensity_for_peptide_variant' in c]]
peptide = peptide.replace(0.0, np.nan)
peptide = peptide.set_index('Peptide')
peptide = peptide.T
peptide.index = peptide.index.map(lambda x: '.'.join(x.split('.')[:2]).replace("_dyn_#", ""))
peptide = peptide.dropna(axis=1)

In [82]:
# Peptide names picked out by the boolean mask `Indexes`. Several selection cells
# assign `Indexes`, so the result reflects whichever of them was executed last.
# NOTE(review): the mask must have one entry per column of `peptide` -- confirm
# the two were built from the same filtered set of columns.
np.array(list(peptide.columns))[Indexes]

array(['R.{290.173}[304.207]EGT(C,57.021)PEAPTDE(C,57.021)(K,304.207)PV(K,304.207).W',
       'K.[304.207](V,290.171)DNALQSGNSQESVTEQDS(K,304.207).D',
       'K.[304.207](P,238.418)(K,304.207)DTLMISR.T',
       'R.{291.173}[304.207]ILGGHLDA(K,304.207).G',
       'K.[304.207](Y,116.074)LYETTLE(K,304.207).C',
       'K.[304.207](K,376.233)VPQVSTPTLVEVSR.N',
       'K.[304.207]AEFAEVS(K,282.164)LVT.D',
       'K.{311.984}[304.207]YI(C,57.021)ENQDSI.S',
       'K.[304.207]L(K,331.18)E(C,57.021)(C,57.021)E(K,304.207)PLLE(K,304.207).S',
       'S.[236.123]LHTLFGD(K,304.207).L',
       'K.[304.207]SLHTLFGD(K,332.213)L(C,57.021)TVA.T',
       'K.[304.207]SLHTLFGD(K,330.707)L(C,57.021)TVA.T',
       'D.[304.207](K,203.075)SLHTLFGD(K,304.207).L',
       'K.[304.207](T,291.174)(C,57.021)VADESAEN(C,57.021)D(K,304.207).S',
       'K.[304.207](S,144.078)LHTLFGD(K,304.207).L',
       'K.[304.207](T,-13.036)(C,57.021)VADESAEN(C,57.021)D(K,304.207).S',
       'R.[304.207](F,-13.032)(K,304.207)DLGEENF(K