In [1]:
#
# Library
#

import pandas as pd
import numpy as np
import sys
from functools import reduce

utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
if utilsPath not in sys.path:
    sys.path.append(utilsPath)

In [2]:
#
# Constants
#

xq_path = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Proteomics\ALDH4\WorkingFiles\Xq_minus_X_norm.tsv"
xm_path = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\ALDH4\WorkingFiles\Xm_norm.tsv"

In [3]:
xq = pd.read_csv(xq_path, sep='\t', index_col=0)
xm = pd.read_csv(xm_path, sep='\t', index_col=0)

In [4]:
mdata = pd.read_csv(
    r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\ALDH4\WorkingFiles\main_metadata.tsv",
    sep='\t'
    )
mdata.index = mdata['Seqn']

In [5]:
#
# Take significant features using F-test: feature vs Group 
#

sxq = pd.read_csv(r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Analysis\01-BasicStats\ALDH4\Xq_stats.tsv", 
                  sep='\t', index_col=0, header=[0,1,2])

sxm = pd.read_csv(r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Analysis\01-BasicStats\ALDH4\Xm_stats.tsv", 
                  sep='\t', index_col=0, header=[0,1,2])

In [6]:
pThr = 0.05

qsig = sxq.index[np.logical_and.reduce([
    sxq[('Group', 'anova', 'pvalue')] < pThr,
    np.logical_or.reduce([
        sxq[('Group', 'anova', 'PBS_vs_A12')] < pThr,
        sxq[('Group', 'anova', 'B1-8_vs_A12')] < pThr
    ])
])]

print(f"Proteins selected: {qsig.shape[0]} / {sxq.shape[0]}")

msig = sxm.index[np.logical_and.reduce([
    sxm[('Group', 'anova', 'pvalue')] < pThr,
    np.logical_or.reduce([
        sxm[('Group', 'anova', 'PBS_vs_A12')] < pThr,
        sxm[('Group', 'anova', 'B1-8_vs_A12')] < pThr
    ])
])]
print(f"Features selected: {msig.shape[0]} / {sxm.shape[0]}")

Proteins selected: 279 / 4118
Features selected: 226 / 2935


In [7]:
#
# Select features using machine learning
#

from sklearn.feature_selection import SelectFromModel

In [8]:
#
# Feature selection using Random Forest 
#

# from sklearn.feature_selection import SelectFromModel
# from sklearn.ensemble import RandomForestClassifier


# sel = SelectFromModel(RandomForestClassifier(n_estimators = 100, random_state=0))
# sel.fit(xq, mdata.loc[xq.index, 'Control'])
# qrfc = xq.columns[(sel.get_support())]
# print(f"Novel proteins: {(~np.isin(qrfc, qsig)).sum()} / {len(qrfc)}")

# sel = SelectFromModel(RandomForestClassifier(n_estimators = 100, random_state=0))
# sel.fit(xm, mdata.loc[xm.index, 'Control'])
# mrfc = xm.columns[(sel.get_support())]
# print(f"Novel features: {(~np.isin(mrfc, msig)).sum()} / {len(mrfc)}")

In [9]:
from sklearn.svm import LinearSVC
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import LeaveOneOut

distributions = dict(C=uniform(loc=0, scale=2), penalty=['l1'])

lsvc = LinearSVC(dual=False, tol=10**5, loss='squared_hinge', random_state=0)
clf = RandomizedSearchCV(
    estimator=lsvc, 
    param_distributions=distributions,
    cv=LeaveOneOut().split(xq),
    refit=True,
    random_state=0
    )
search = clf.fit(xq, mdata.loc[xq.index, 'Control'])
sel = SelectFromModel(search.best_estimator_)
qsvc = xq.columns[sel.get_support()]

print(f"Accuracy: {search.best_estimator_.score(xq, mdata.loc[xq.index, 'Control'])}")
print('Besta params', search.best_params_)
print(f"Number of proteins selected: {len(qsvc)} / {xq.shape[1]}")
#print('Selected features:', xq.columns[sel.get_support()])

print()

clf = RandomizedSearchCV(
    estimator=lsvc, 
    param_distributions=distributions,
    cv=LeaveOneOut().split(xm),
    refit=True,
    random_state=0
    )
search = clf.fit(xm, mdata.loc[xm.index, 'Control'])
sel = SelectFromModel(search.best_estimator_)
msvc = xm.columns[sel.get_support()]

print(f"Accuracy: {search.best_estimator_.score(xm, mdata.loc[xm.index, 'Control'])}")
print('Besta params', search.best_params_)
print(f"Number of proteins selected: {len(msvc)} / {xm.shape[1]}")

Accuracy: 1.0
Besta params {'C': 1.0976270078546495, 'penalty': 'l1'}
Number of proteins selected: 50 / 4118

Accuracy: 1.0
Besta params {'C': 1.0976270078546495, 'penalty': 'l1'}
Number of proteins selected: 93 / 2935


In [10]:
#
# Features selected until now
#

tmp = reduce(np.union1d, [qsig, qsvc]).shape[0]
print(f"Proteins selected: {tmp} | 1000-{tmp} = {1000-tmp}")
tmp = reduce(np.union1d, [msig, msvc]).shape[0]
print(f"Features selected: {tmp} | 1000-{tmp} = {1000-tmp}")

Proteins selected: 317 | 1000-317 = 683
Features selected: 304 | 1000-304 = 696


In [11]:
#
# Sparse PCA
#

from sklearn.decomposition import SparsePCA

def mySPCA(x, n_components=5, alpha=10):
    transformer = SparsePCA(n_components=n_components, random_state=0, verbose=False, n_jobs=-1, alpha=alpha)
    transformer.fit(x)

    print('Number of non-zero features:')
    _ = [print(f'Component {n+1}: {i} features') for n,i in enumerate((transformer.components_!=0).sum(axis=1))]
    print(f'Total number of features: {np.logical_or.reduce(transformer.components_!=0).sum()}')

    # Explained variance in the latent space

    P = transformer.components_.T  # loadings
    T = transformer.transform(x)  # score
    Xc = x - x.mean(axis=0)  # center data

    explained_variance = np.trace(P @ T.T @ T @ P.T)
    total_variance = np.trace(Xc.T @ Xc)
    print('Fraction explained variance in latent space:', explained_variance / total_variance)

    return transformer, transformer.feature_names_in_[(transformer.components_!=0).any(axis=0)]


In [12]:
spca_q, qpca = mySPCA(xq, 5, 2.8)

Number of non-zero features:
Component 1: 250 features
Component 2: 274 features
Component 3: 82 features
Component 4: 112 features
Component 5: 3 features
Total number of features: 717
Fraction explained variance in latent space: 0.10157821848798573


In [13]:
spca_m, mpca = mySPCA(xm, 5, 3.2)


Number of non-zero features:
Component 1: 249 features
Component 2: 169 features
Component 3: 90 features
Component 4: 67 features
Component 5: 178 features
Total number of features: 748
Fraction explained variance in latent space: 0.14657044326799226


In [14]:
#
# Summary
#

In [15]:
#
# Check PCA
#

from PCA_UMAP import PCA_UMAP, PCA_Var

In [16]:
tmp = reduce(np.union1d, [qsig, qsvc, qpca])
print(f'Proteins selected by F-test: {qsig.shape[0]}')
#print(f'Proteins selected by RFC: {qrfc.shape[0]}')
print(f'Proteins selected by SVC: {qsvc.shape[0]}')
print(f'Proteins selected by sPCA: {qpca.shape[0]}')
print(f'Total number of features: {tmp.shape[0]}')
PCA_Var(xq[tmp], mdata, [], ['Group', 'Ig', 'Control','TMT'], n_comp=10)

Proteins selected by F-test: 279
Proteins selected by SVC: 50
Proteins selected by sPCA: 717
Total number of features: 999


Unnamed: 0,%Var PCA,Group,Ig,Control,TMT
1,33.108358,0.3449,0.5204,0.4441,0.6761
2,24.985595,0.0826,0.7555,0.0389,0.6601
3,12.644785,0.0053,0.0015,0.0369,0.9555
4,6.670894,0.6187,0.8934,0.4431,0.9148
5,4.703899,0.1327,0.3677,0.0409,0.7737
6,3.918167,0.0733,0.0778,0.8098,0.7458
7,2.595158,0.9004,0.7382,0.9186,0.9352
8,2.117437,0.6661,0.3605,0.5829,0.9877
9,1.946206,0.5865,0.296,0.6679,0.8954
10,1.765661,0.9372,0.9122,0.7217,0.7446


In [17]:
tmp = reduce(np.union1d, [msig, msvc, mpca])
print(f'Features selected by F-test: {msig.shape[0]}')
#print(f'Features selected by RFC: {mrfc.shape[0]}')
print(f'Features selected by SVC: {msvc.shape[0]}')
print(f'Features selected by sPCA: {mpca.shape[0]}')
print(f'Total number of features: {tmp.shape[0]}')
PCA_Var(xm[tmp], mdata, [], ['Group', 'Ig', 'Control'], n_comp=10)

Features selected by F-test: 226
Features selected by SVC: 93
Features selected by sPCA: 748
Total number of features: 992


Unnamed: 0,%Var PCA,Group,Ig,Control
1,30.619708,0.5013,0.7144,0.4319
2,24.226354,0.0088,0.5522,0.0033
3,12.121718,0.3733,0.4089,0.5758
4,7.52417,0.7221,0.8369,0.4291
5,4.455311,0.0001,0.0,0.0429
6,3.719203,0.2187,0.4845,0.3169
7,2.847143,0.758,0.9617,0.5298
8,2.25791,0.6435,0.3397,0.6384
9,1.916394,0.3337,0.904,0.1759
10,1.75911,0.9693,0.8046,0.9408


In [18]:
#
# Write selected features
#

qres = []
for i,j in zip([qsig, qsvc, qpca], ['sig', 'svc', 'pca']):
    tmp = pd.DataFrame(index=i)
    tmp[j] = True
    qres.append(tmp)

qres = reduce(lambda df1, df2: df1.join(df2, how='outer'), qres).fillna(False)

mres = []
for i,j in zip([msig, msvc, mpca], ['sig', 'svc', 'pca']):
    tmp = pd.DataFrame(index=i)
    tmp[j] = True
    mres.append(tmp)

mres = reduce(lambda df1, df2: df1.join(df2, how='outer'), mres).fillna(False)

In [19]:
qres.to_csv('qfilt.tsv', sep='\t', index=True)
mres.to_csv('mfilt.tsv', sep='\t', index=True)

In [20]:
# Check if filtered features were identified by Alessia...

mres
f2i = pd.read_csv(r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\ALDH4\WorkingFiles\f2i.tsv", sep='\t')
alid = pd.read_excel(r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\ALDH4\OriginalFiles\RBR_identifications.xlsx").loc[:,['Features code', 'ID', 'Apex m/z', 'RT [min]']]

tmp = pd.merge(
    f2i, alid,
    how='right',
    on=['Apex m/z', 'RT [min]']
    #left_on='Name', right_on='Features code'
).set_index('fid').join(mres, how='left')