In [16]:
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
import scipy.stats as ss
# from collections import Counter
import math 
from scipy import stats
import numpy as np 
import pandas as pd

import seaborn as sns
sns.set(style="ticks")
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
flatui = sns.color_palette(flatui)

import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
# Configuration
num_feats = 30
data_file = '../data/Supp_Table_6_filtered_lfq_discovery.csv'

In [18]:
gene_df = pd.read_csv(data_file, sep=';', header=0)
gene_df = gene_df.drop(columns=['Razor + unique peptides', 'Unique peptides','Q-value', 
                      'Score', 'Intensity', 'MS/MS count', 'Protein IDs',
       'Majority protein IDs', 'Protein names', 'Column1', 'Column2',
       'Column3', 'Column4', 'Column5', 'Column6', 'Column7', 'Column8',
       'Column9', 'Column10', 'Column11', 'Column12'])
gene_df = gene_df.fillna(value="labels", limit=1)
gene_df = gene_df.set_index('Gene names')
gene_df = gene_df.transpose()

y = gene_df['labels']== 'Healthy'

gene_df = gene_df.iloc[:, :-1]
gene_df = gene_df.astype(float)

features = gene_df.columns
traindf = pd.DataFrame(gene_df,columns=features)
X = traindf.copy()
feature_name = list(X.columns)

#display(X)

In [19]:
# uniquify column names

from collections import Counter # Counter counts the number of occurrences of each item
from itertools import tee, count

def uniquify(seq, suffs = count(1)):
    """Make all the items unique by adding a suffix (1, 2, etc).

    `seq` is mutable sequence of strings.
    `suffs` is an optional alternative suffix iterable.
    """
    not_unique = [k for k,v in Counter(seq).items() if v>1] # so we have: ['name', 'zip']
    # suffix generator dict - e.g., {'name': <my_gen>, 'zip': <my_gen>}
    suff_gens = dict(zip(not_unique, tee(suffs, len(not_unique))))  
    for idx,s in enumerate(seq):
        try:
            suffix = str(next(suff_gens[s]))
        except KeyError:
            # s was unique
            continue
        else:
            seq[idx] += suffix
            
cols = X.columns.tolist()
uniquify(cols)
X.columns = cols

In [20]:
# Pearson

def cor_selector(X, y,num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        if y.ndim != X[i].ndim:
            print("ERROR")
            print(f"{y.ndim} & {X[i].ndim}")
            print(X[i])
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]

    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()

    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature
cor_support, cor_feature = cor_selector(X, y,num_feats)
print(str(len(cor_feature)), 'selected features')

30 selected features


In [21]:
# Chi squared

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

print(chi_feature)

30 selected features
['ASRGL1', 'BGN', 'CEACAM5', 'CILP', 'CLCA4', 'DPYD', 'DSG1', 'DSG3', 'FBLN12', 'FSTL1', 'GDA', 'HSPA2', 'IVL', 'KRIT1', 'LGALS7', 'MIF', 'NNMT', 'PCBD1', 'PKHD1L1', 'PPIA', 'RHPN2', 'S100A14', 'S100A2', 'S100P', 'SERPINB13', 'SERPINB5', 'SFN', 'SPRR3', 'TPPP3', 'USP9X']


In [22]:
# Recursive feature elemination

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

Fitting estimator with 1658 features.
Fitting estimator with 1648 features.
Fitting estimator with 1638 features.
Fitting estimator with 1628 features.
Fitting estimator with 1618 features.
Fitting estimator with 1608 features.
Fitting estimator with 1598 features.
Fitting estimator with 1588 features.
Fitting estimator with 1578 features.
Fitting estimator with 1568 features.
Fitting estimator with 1558 features.
Fitting estimator with 1548 features.
Fitting estimator with 1538 features.
Fitting estimator with 1528 features.
Fitting estimator with 1518 features.
Fitting estimator with 1508 features.
Fitting estimator with 1498 features.
Fitting estimator with 1488 features.
Fitting estimator with 1478 features.
Fitting estimator with 1468 features.
Fitting estimator with 1458 features.
Fitting estimator with 1448 features.
Fitting estimator with 1438 features.
Fitting estimator with 1428 features.
Fitting estimator with 1418 features.
Fitting estimator with 1408 features.
Fitting esti

In [23]:
# Lasso : select from model

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'), max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

7 selected features


In [24]:
# Random Forrest

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
embeded_rf_selector.fit(X, y)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

30 selected features


In [25]:
# Light GBM / XGBoost

from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
embeded_lgb_selector.fit(X, y)

embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

30 selected features


In [26]:
# all together

# put all selection together
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgb_support})

# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)

# display the beast features
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(num_feats)

  return reduction(axis=axis, out=out, **passkwargs)


Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,LightGBM,Total
1,S100A14,True,True,True,True,True,False,5
2,NNMT,True,True,True,True,True,False,5
3,MIF,True,True,True,True,True,False,5
4,SPRR3,True,True,True,False,True,False,4
5,SERPINB5,True,True,True,True,False,False,4
6,S100A2,True,True,True,True,False,False,4
7,IVL,True,True,True,False,True,False,4
8,CEACAM5,True,True,True,False,True,False,4
9,ASRGL1,True,True,True,False,True,False,4
10,TPPP3,False,True,True,False,True,False,3


In [27]:
nine_protein_classifier = ['ENPP3', 'IVL', 'S100A2', 'MYH11', 'SERPINB5', 'NNMT', 'CLCA4', 'CD109', 'S100A14']
features = feature_selection_df['Feature'].tolist()

print(f"Classifier from paper: {nine_protein_classifier}")
print(f"My classifier: {features[:9]}")

print(f"Genes that are equal in Paper and my suggestion: {set(nine_protein_classifier) & set(features[:9])}")



Classifier from paper: ['ENPP3', 'IVL', 'S100A2', 'MYH11', 'SERPINB5', 'NNMT', 'CLCA4', 'CD109', 'S100A14']
My classifier: ['S100A14', 'NNMT', 'MIF', 'SPRR3', 'SERPINB5', 'S100A2', 'IVL', 'CEACAM5', 'ASRGL1']
Genes that are equal in Paper and my suggestion: {'S100A14', 'S100A2', 'IVL', 'SERPINB5', 'NNMT'}
