<a href="https://colab.research.google.com/github/ErenB02/Proteomics_Project/blob/main/MSC_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [322]:
##Libraries
from sklearn import model_selection
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np


# **Cord Blood Analysis Models**


### **Import datasets**


In [323]:
"""MSc Project 2025
   Eren Boybay
   01/08/2025"""

##BASELINE (cord blood)
#Full set of proteins
BL_prot = pd.read_csv("BL_prot_full.csv")

# #Full set of peptides
# BL_pep_full = pd.read_csv(url)
BL_pep = pd.read_csv("BL_pep_full.csv")


# #Proteins + Clinical (combined)
# BL_prot_comb = pd.read_csv(url)


# #Peptides + Clinical (combined)
# BL_pep_comb = pd.read_csv(url)



In [324]:
# #Check column classes and format
print(BL_prot.value_counts('Condition'))
print(BL_prot.iloc[0:5, 0:5])

#Remove BioReplicate Column as it isnt needed
print(BL_prot.shape)
del BL_prot['BioReplicate']
print(BL_prot.shape)

Condition
Control    42
Case       22
Name: count, dtype: int64
       KLKB1       HEP2       A2MG        IC1       A2AP
0  23.726412  23.995709  23.184520  22.173751  22.113733
1  24.327342  23.748462  24.511435  23.598822  23.028244
2  24.783129  24.488957  24.415013  24.568131  25.042863
3  23.847265  24.124445  24.272973  23.822803  23.993504
4  24.368995  23.986996  24.373690  23.814763  23.623496
(64, 38)
(64, 37)


In [325]:
# #Check column classes and format (peptide)
print(BL_pep.value_counts('Condition'))
print(BL_pep.iloc[0:5, 0:5])

#Remove BioReplicate Column as it isnt needed
print(BL_pep.shape)
del BL_pep['BioReplicate']
print(BL_pep.shape)

Condition
Control    42
Case       22
Name: count, dtype: int64
   BioReplicate  GGDVASMYTPNAQYCQMR  TGAVSGHSLK  QCGHQISACHR  VSSVEECQK
0             1           13.292609   13.060020    13.927130  11.903882
1             2           14.851895   14.655922    15.227240  14.703525
2             3           15.444724   13.627306    14.437362  11.945444
3             4           14.411643   14.575835    11.869979  15.891048
4             5           15.286774   14.721847    12.899735  16.821662
(64, 1090)
(64, 1089)


### **Pre-processing & Feature Selection**

In [326]:
#Remove trailing space (found in 'Case ')
BL_prot['Condition'] = BL_prot['Condition'].str.strip()
#Convert cat variables to numerical ones (Case = 1, Control  0)
BL_prot.replace({'Condition': {'Control': 0, 'Case': 1}}, inplace=True)


#Repeat for peptide
BL_pep['Condition'] = BL_pep['Condition'].str.strip()
#Convert cat variables to numerical ones (Case = 1, Control  0)
BL_pep.replace({'Condition': {'Control': 0, 'Case': 1}}, inplace=True)


  BL_prot.replace({'Condition': {'Control': 0, 'Case': 1}}, inplace=True)
  BL_pep.replace({'Condition': {'Control': 0, 'Case': 1}}, inplace=True)


*Features and Target*

In [327]:
#Features (all proteins)
X_prot =  BL_prot.iloc[:,:36]
#Label (Case - Control (object))
y_prot = BL_prot['Condition']
print(X_prot.shape)
print(y_prot.shape)


#Features (all peptides)
X_pep =  BL_pep.iloc[:,:1088]
y_pep = BL_pep['Condition']
print(X_pep.shape)
print(y_pep.shape)

(64, 36)
(64,)
(64, 1088)
(64,)


In [328]:
#Check for NaN values
print(f"There are {X_prot.isna().sum().sum()} NaN values present in the peptide features")
print(f"There are {X_pep.isna().sum().sum()} NaN values present in the peptide features")

X_pep = X_pep.fillna(0)


There are 0 NaN values present in the peptide features
There are 2 NaN values present in the peptide features


### **Model Training and Testing**



In [329]:
##Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


lrc = LogisticRegression(penalty="l1", solver="liblinear", random_state=25)

#5 fold cross validation
lr_prot_roc = cross_val_score(lrc, X_prot, y_prot, cv=5, scoring='roc_auc')
lr_pep_roc = cross_val_score(lrc, X_pep, y_pep, cv=5, scoring='roc_auc')


print(f"ROC-AUC scores in protein data: {lr_prot_roc}")
print(f"Average ROC-AUC scores for protein data: {np.mean(lr_prot_roc):.2f}")

print(f"ROC-AUC scores in peptide data: {lr_pep_roc}")
print(f"Average ROC-AUC scores for peptide data: {np.mean(lr_pep_roc):.3f}")



ROC-AUC scores in protein data: [0.41666667 0.38888889 0.6        0.6        0.46875   ]
Average ROC-AUC scores for protein data: 0.49
ROC-AUC scores in peptide data: [0.80555556 0.52777778 0.625      0.775      0.5625    ]
Average ROC-AUC scores for peptide data: 0.659


In [331]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier


rfc = RandomForestClassifier(criterion="gini", random_state=25)

#5 fold cross validation
rf_prot_roc = cross_val_score(rfc, X_prot, y_prot, cv=5, scoring='roc_auc')
rf_pep_roc = cross_val_score(rfc, X_pep, y_pep, cv=5, scoring='roc_auc')


print(f"ROC-AUC scores in protein data: {rf_prot_roc}")
print(f"Average ROC-AUC scores for protein data: {np.mean(rf_prot_roc):.2f}")

print(f"ROC-AUC scores in peptide data: {rf_pep_roc}")
print(f"Average ROC-AUC scores for peptide data: {np.mean(rf_pep_roc):.3f}")


ROC-AUC scores in protein data: [0.48611111 0.125      0.8875     0.75       0.59375   ]
Average ROC-AUC scores for protein data: 0.57
ROC-AUC scores in peptide data: [0.54166667 0.18055556 0.65       0.6625     0.6875    ]
Average ROC-AUC scores for peptide data: 0.544


*Plotting results*

In [None]:
import matplotlib.pyplot as plt


# **Serum Analysis Models**

In [None]:
##PiRAMiD (serum)
#Full set of proteins
PM_prot_full = pd.read_csv(url)

#Full set of peptides
PM_pep_full = pd.read_csv(url)

#Proteins + Clinical (combined)
PM_prot_comb = pd.read_csv(url)
#Peptides + Clinical (combined)
PM_pep_comb = pd.read_csv(url)


NameError: name 'url' is not defined