In [24]:
# Tutorial: https://www.analyticsvidhya.com/blog/2021/01/a-guide-to-the-naive-bayes-algorithm/
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [25]:
# Configuration: input files, gene panels and train/test split settings.

# Input CSV files (relative paths).
discovery_data_file = '../../data/Supp_Table_6_filtered_lfq_discovery.csv'
validation_data_file = '../../data/Supp_Table_3_lfq_intensities_validation.csv'
clinical_data_file = '../../data/Supp_Table_1_clinical_data.csv'

# Two alternative nine-gene panels that can serve as feature columns.
nine_prot_classifier = [
    'ENPP3', 'IVL', 'S100A2',
    'MYH11', 'SERPINB5', 'NNMT',
    'CLCA4', 'CD109', 'S100A14',
]
my_classifier = [
    'S100A2', 'S100A14', 'SERPINB5',
    'S100P', 'NNMT', 'MIF',
    'HSPA2', 'TPPP3', 'SYNGR2',
]

# Panel selected for this run (same list object, not a copy).
classifier = nine_prot_classifier

# Fraction of samples held out for testing, and the seed that fixes the split.
test_size, random_state = 0.15, 109

In [26]:
# Read discovery data
#
# Loads the discovery table, strips the per-protein metadata columns,
# transposes it so that every row is one sample and every column one gene,
# and encodes the class label column as 0 (Healthy) / 1 (Patient).

gene_df = pd.read_csv(discovery_data_file, sep=';', header=0)
gene_df = gene_df.drop(columns=['Razor + unique peptides', 'Unique peptides','Q-value', 
                      'Score', 'Intensity', 'MS/MS count', 'Protein IDs',
       'Majority protein IDs', 'Protein names', 'Column1', 'Column2',
       'Column3', 'Column4', 'Column5', 'Column6', 'Column7', 'Column8',
       'Column9', 'Column10', 'Column11', 'Column12'])
# Give the first row without a gene name the name 'labels' (presumably the row
# that carries the Healthy/Patient annotation -- verify against the CSV).
# Only the 'Gene names' column is filled: a frame-wide fillna(limit=1) would
# also write the string "labels" into the first missing intensity of every
# sample column.
gene_df['Gene names'] = gene_df['Gene names'].fillna('labels', limit=1)
gene_df = gene_df.set_index('Gene names')
# After transposing, samples are rows and genes (plus 'labels') are columns.
gene_df = gene_df.transpose()
mapping = {'Healthy': 0, 'Patient': 1}
# Nested-dict form: the mapping is applied to the 'labels' column only.
gene_df = gene_df.replace({'labels': mapping})

display(gene_df.head(5))

Gene names,A1BG,A2M,AARS,ABCE1,ABCF1,ABHD14B,ABI1,ABR,ACADVL,ACAP2,...,YWHAE,YWHAG,YWHAH,YWHAQ,YWHAZ,ZAK;pk,ZC3HAV1,ZNF185,ZYX,labels
LFQ intensity BUL_103,25.39842,26.66044,23.51749,22.38195,20.65106,23.94708,22.80967,19.92877,23.40912,20.6044,...,27.00389,24.42918,23.72848,25.59167,27.46972,21.48316,21.66006,21.59405,23.42522,0
LFQ intensity BUL_30,28.04843,29.83939,24.725,21.4001,18.52887,25.35803,19.56752,18.04529,20.12199,19.46165,...,28.6475,25.80799,24.62271,27.62819,29.19315,21.95957,19.84275,20.02877,20.17072,0
LFQ intensity BUL_40,24.33402,25.49622,23.27213,22.20973,19.87752,23.89171,22.45669,21.20985,22.55173,20.05775,...,28.13482,25.42211,25.38281,26.26785,28.2209,19.78749,22.28619,22.10188,21.40671,0
LFQ intensity BUL_47,26.15509,29.6653,18.98055,19.4373,20.06988,23.32896,21.12136,17.5265,21.38506,20.32839,...,26.75646,23.68603,24.00967,24.367,28.00657,19.62387,18.94378,21.4315,22.04613,1
LFQ intensity BUL_48,25.64636,26.88064,22.68656,21.88852,20.69693,24.13485,22.49685,20.13295,22.7297,20.86507,...,27.03444,24.14499,24.0748,25.49063,27.62415,22.0415,20.7982,22.42206,22.2678,1


In [27]:
# Read Validation Data
#
# Builds a samples x genes table from the validation intensities and attaches
# a 0/1 'labels' column taken from the clinical data table.

gene_df = pd.read_csv(validation_data_file, sep=';', header=0)
gene_df = gene_df.drop(columns=['Razor + unique peptides', 'Unique peptides','Q-value', 
                      'Score', 'Intensity', 'MS/MS count', 'Protein IDs',
       'Majority protein IDs', 'Protein names', 'Column1', 'Column2',
       'Column3', 'Column4', 'Column5', 'Column6', 'Column7', 'Column8',
       'Column9', 'Column10', 'Column11', 'Column12', 'Column13', 'Sequence coverage [%]', 'Mol. weight [kDa]' ])
# Sample columns excluded from the analysis (each listed once).
gene_df = gene_df.drop(columns=['LFQ intensity MUL_38','LFQ intensity BUL_109','LFQ intensity UL_19',
    'LFQ intensity UL_22','LFQ intensity UL_3','LFQ intensity UL_23','LFQ intensity BUL_101','LFQ intensity UL_37',
    'LFQ intensity UL_35','LFQ intensity MUL_69'])

# NOTE: despite its name this holds the column labels of the table *before*
# it is transposed ('Gene names' plus the remaining columns), not gene symbols.
gene_names = list(gene_df.columns)

# NOTE(review): missing intensities are imputed with 0, and a missing gene name
# becomes the column label 0 as well -- confirm that this is intended.
gene_df = gene_df.fillna(value=0)
gene_df = gene_df.set_index('Gene names')
# After transposing, samples are rows and genes are columns.
gene_df = gene_df.transpose()


# Use the path from the configuration cell instead of repeating the literal.
clinical_data_df = pd.read_csv(clinical_data_file, sep=';', header=1)
# .copy() makes the filtered rows an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
clinical_data_df = clinical_data_df[clinical_data_df["Set"] == 'Validation'].copy()
# Build the same 'LFQ intensity <sample>' labels that index gene_df.
clinical_data_df['Sample ID'] = clinical_data_df['Sample ID'].str.replace('-','_')
sample_col_list = clinical_data_df['Sample ID'].tolist()
sample_col_list = [('LFQ intensity '+ sample) for sample in sample_col_list]
clinical_data_df['samples'] = sample_col_list
clinical_data_df = clinical_data_df[['samples', 'Signature prediction2']]
# 'LFQ intensity UL_37' is one of the sample columns dropped from the intensity
# table above, so it is removed from the clinical data as well.
clinical_data_df = clinical_data_df[clinical_data_df["samples"] != 'LFQ intensity UL_37']
clinical_data_df = clinical_data_df.set_index('samples')
clinical_data_df = clinical_data_df.rename(columns={'Signature prediction2': 'labels'})

# Left join on the sample index: every row of gene_df is kept.
gene_df = gene_df.join(clinical_data_df)
mapping = {'Healthy': 0, 'Patient': 1}
# Nested-dict form: the mapping is applied to the 'labels' column only.
gene_df = gene_df.replace({'labels': mapping})

display(gene_df.head(5))

Unnamed: 0,A1BG,A2M,A2ML1,A4GALT,AAAS,AACS,AAGAB,AAK1,AAK1.1,AAMDC,...,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,labels
LFQ intensity BUL_1,31.395758,36.772396,27.792532,0.0,0.0,22.145134,0.0,0.0,0.0,23.470604,...,0.0,27.319414,29.749304,22.809332,0.0,31.016443,24.910404,28.295309,24.061346,1
LFQ intensity BUL_10,32.131454,35.44976,31.344879,25.675083,0.0,0.0,0.0,0.0,0.0,24.782917,...,0.0,29.630342,25.440632,0.0,0.0,0.0,0.0,25.624609,24.216824,1
LFQ intensity BUL_102,32.093391,34.85944,29.944473,0.0,23.829971,0.0,0.0,0.0,0.0,24.514847,...,0.0,29.479427,27.57766,0.0,0.0,31.627102,0.0,28.808054,26.070568,1
LFQ intensity BUL_11,33.692703,33.655441,26.212952,0.0,0.0,0.0,0.0,0.0,0.0,24.164536,...,0.0,25.615097,28.372355,0.0,0.0,31.508053,0.0,24.047102,24.458326,1
LFQ intensity BUL_111,30.580734,33.720184,31.121702,24.418991,0.0,22.818605,0.0,0.0,0.0,0.0,...,0.0,29.538399,25.75799,0.0,0.0,29.498859,0.0,0.0,0.0,1


In [28]:
# Target vector: the 'labels' column of the sample table.
y = gene_df.loc[:, 'labels']
# Feature matrix: all columns whose name is in the configured gene panel.
# A gene name that occurs more than once in the table contributes every
# matching column, so X can be wider than the panel list.
X = gene_df.loc[:, classifier]
display(X)

Unnamed: 0,ENPP3,IVL,S100A2,S100A2.1,MYH11,MYH11.1,MYH11.2,SERPINB5,NNMT,CLCA4,CD109,S100A14
LFQ intensity BUL_1,33.281540,27.796001,28.780222,27.951216,29.613686,35.250782,0.0,26.934383,26.234308,29.090860,26.787310,31.119478
LFQ intensity BUL_10,27.685320,30.505499,29.826689,32.442554,0.000000,25.852957,0.0,31.157681,27.504230,32.288872,25.001614,30.841986
LFQ intensity BUL_102,19.782415,29.457148,30.362648,26.164497,0.000000,23.531719,0.0,30.237516,25.266315,32.681709,21.803846,31.451353
LFQ intensity BUL_11,25.356859,26.189409,28.247896,0.000000,0.000000,25.869902,0.0,28.497166,25.257710,27.101694,21.098219,29.326696
LFQ intensity BUL_111,24.107492,30.764538,27.413771,0.000000,0.000000,23.519894,0.0,31.816769,27.345959,33.091076,20.947851,32.394047
...,...,...,...,...,...,...,...,...,...,...,...,...
LFQ intensity UL_5,22.596325,0.000000,23.113943,0.000000,26.645264,34.242748,0.0,24.089340,25.462763,0.000000,22.862022,25.186691
LFQ intensity UL_53,34.695351,29.105721,25.904606,0.000000,0.000000,29.537327,0.0,27.156149,25.237904,29.102720,24.973906,29.314646
LFQ intensity UL_7,20.557400,26.744324,27.017334,0.000000,28.833612,34.999256,0.0,29.092827,25.565231,25.346787,23.522888,27.633118
LFQ intensity UL_8,33.147057,0.000000,0.000000,0.000000,0.000000,32.744747,0.0,0.000000,28.388374,21.186163,23.377428,26.520119


In [29]:
# Splitting the dataset into the Training set and Test set
# (hold-out fraction and seed come from the configuration cell)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

# Feature Scaling: the scaler is fitted on the training split only and the
# same transformation is then applied to the test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Training the Naive Bayes model on the Training set.
# The model gets its own name: `classifier` holds the list of panel genes from
# the configuration cell, and rebinding it here would break the feature
# selection cell (`gene_df[classifier]`) whenever that cell is run again.
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predicting the Test set results
y_pred = nb_model.predict(X_test)

In [30]:
# Evaluate the predictions on the held-out test split.
from sklearn.metrics import confusion_matrix, accuracy_score
# scikit-learn metrics module used for accuracy, precision and recall
from sklearn import metrics

# Making the Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 in the order (Healthy, Patient), so the
# four-way unpacking below also works when the small test split happens to
# contain only one class.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(cm)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Model Precision: what fraction of the samples predicted positive are truly positive?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: what fraction of the truly positive samples are predicted positive?
print("Recall:",metrics.recall_score(y_test, y_pred))


# Row-major order of the 2x2 matrix: true negatives, false positives,
# false negatives, true positives.
tn, fp, fn, tp = cm.ravel()

# Model Sensitivity (identical to recall), printed next to the paper's value
print(f"Sensitivity im Paper 70% (== recall): {tp/ (tp+fn)}")

# Model Specificity, printed next to the paper's value
print(f"Specifity im Paper 76.2%: {tn / (tn+fp)}")

[[13  0]
 [ 8  2]]
Accuracy: 0.6521739130434783
Precision: 1.0
Recall: 0.2
Sensitivity im Paper 70% (== recall): 0.2
Specifity im Paper 76.2%: 1.0
