# Proteome profiling preprocessing

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

In [None]:
# Load the raw (not yet preprocessed) proteome profiling table.
proteome = pd.read_csv("proteomeprofiling.csv")

# Remove the annotation/bookkeeping columns in a single call.
# Presumably every remaining column other than 'peptide_target' is a
# sample ID -- verify against the input file.
non_sample_columns = ["AGID", "Unnamed: 0", "lab_id", "catalog_number", "set_id"]
proteome = proteome.drop(columns=non_sample_columns)

# Use peptide_target (protein names) as the row index.
proteome = proteome.set_index('peptide_target')

# Transpose: the remaining columns become rows and each peptide_target
# value becomes a column (feature).
prot = proteome.transpose()

In [None]:
# Drop features (columns) whose fraction of missing (NaN) values across the
# rows is greater than 10%. Only NaN counts here; zeros are not treated as
# missing.
na_percentages = prot.isna().mean(axis=0)
too_sparse = na_percentages > 0.10
features_to_drop = na_percentages.loc[too_sparse].index
df = prot.drop(columns=features_to_drop)

# Report how many missing values are still present after the filter.
missing_per_feature = df.isna().sum()
missing_values = missing_per_feature.sum()
print(missing_values)

In [None]:
# Fill the remaining missing values with K-nearest-neighbours imputation (k = 5).
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(df)

# Wrap the imputed values back into a DataFrame with the original labels.
df_imputed = pd.DataFrame(
    imputed_values,
    index=df.index,
    columns=df.columns,
)

# Sanity check: count of missing values left after imputation.
missing_values_after = df_imputed.isna().sum().sum()
print(missing_values_after)

# Save the preprocessed proteome profiling dataframe.
# NOTE(review): this is the same path the raw data is read from at the top of
# the notebook, so the raw file is overwritten and a second run would no
# longer find the annotation columns it drops. Consider writing to a separate
# output file -- confirm what downstream code expects before changing it.
df_imputed.to_csv("proteomeprofiling.csv")