## Data import

In [1]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('../data/dataset.tsv', sep='\t')

In [2]:
data = dataset.drop(columns=['age'], inplace=False)
labels = dataset['age']

In [4]:
data.describe()

Unnamed: 0,NM_173803,NM_014423,NM_001103167,NR_134623,NR_024490,NM_018397,NM_001037671,NR_106859,NM_133458,NM_001080424,...,NR_029666,NM_080390,NM_001177675,NR_125786,NM_001291993,NM_021923,NM_017417,NM_004354,NR_024278,NM_145282
count,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,...,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,0.124474,18.345647,0.414218,0.00197,1.006414,0.547263,0.0,27.668767,6.133226,1.655526,...,0.0,0.098594,0.003947,0.0,5.590233,23.029218,0.005812,6.852872,1.189158,0.002639
std,0.078508,3.426647,0.171433,0.01309,0.316317,0.808279,0.0,7.176748,1.397833,0.531323,...,0.0,0.209719,0.012712,0.0,1.114522,10.267254,0.013509,2.82179,0.450767,0.013269
min,0.0,9.253,0.042,0.0,0.505,0.014,0.0,9.928,3.503,0.518,...,0.0,0.0,0.0,0.0,3.285,2.703,0.0,1.505,0.392,0.0
25%,0.069,16.064,0.297,0.0,0.777,0.106,0.0,22.684,5.052,1.343,...,0.0,0.0,0.0,0.0,4.823,15.905,0.0,5.019,0.918,0.0
50%,0.108,17.884,0.4,0.0,0.946,0.218,0.0,27.585,5.961,1.593,...,0.0,0.046,0.0,0.0,5.642,23.009,0.0,6.177,1.067,0.0
75%,0.163,20.292,0.506,0.0,1.175,0.534,0.0,32.172,6.822,2.001,...,0.0,0.104,0.0,0.0,6.233,29.242,0.0,8.148,1.413,0.0
max,0.389,29.769,1.014,0.097,1.973,3.964,0.0,45.641,10.299,4.561,...,0.0,1.657,0.089,0.0,8.883,50.808,0.091,16.739,2.793,0.112


In [5]:
len(data.columns)

27142

## Removal of useless features

In [3]:
const_cols = data.columns[data.std() == 0.0]

In [4]:
data_non_zero_var = data.drop(columns=const_cols, inplace=False)

In [5]:
len(data_non_zero_var.columns)

23091

## Log transform

In [12]:
data_non_zero_var_no_zeros = data_non_zero_var.replace(0, 1e4)

In [14]:
data_non_zero_var_no_zeros.min().min() 

0.001

Minimum non-zero value in the dataframe is 1e-3. So let us add an amount of 1e-5 before log transform

In [15]:
data_log_transform = np.log2(data_non_zero_var + 1e-5)
data_log_transform.head()

Unnamed: 0,NM_173803,NM_014423,NM_001103167,NR_134623,NR_024490,NM_018397,NR_106859,NM_133458,NM_001080424,NM_001271816,...,NM_022495,NR_027141,NM_080390,NM_001177675,NM_001291993,NM_021923,NM_017417,NM_004354,NR_024278,NM_145282
0,-2.910393,4.085765,-1.369557,-16.60964,-0.675742,-3.120169,4.978699,2.459434,0.387374,1.099639,...,4.094321,2.676947,-16.60964,-16.60964,2.120687,4.869971,-4.643496,2.618006,-0.921363,-16.60964
1,-2.231007,4.067984,-2.573381,-16.60964,-0.512493,-3.69881,4.806943,2.694437,0.440431,0.96496,...,3.829343,2.815577,-16.60964,-16.60964,2.496209,4.980071,-16.60964,2.486202,0.137517,-16.60964
2,-4.21063,4.018457,-1.286269,-16.60964,0.182705,-1.815986,4.695716,2.759158,0.580155,0.911508,...,3.912363,2.711497,-4.755941,-16.60964,2.61636,5.441152,-16.60964,2.59216,-0.040957,-16.60964
3,-3.921172,3.939698,-2.756233,-16.60964,-0.545803,-1.717809,4.230819,2.529324,0.755323,0.946364,...,3.832587,2.484914,-16.60964,-5.505697,1.907664,5.299062,-16.60964,2.335715,0.546966,-16.60964
4,-2.68029,4.076645,-1.988447,-3.473771,-0.31113,-3.411042,4.207972,2.336857,0.468854,1.49314,...,3.816396,2.546959,-3.457831,-16.60964,2.272922,4.216611,-16.60964,1.923723,0.308023,-16.60964


## Removing correlated features

In [16]:
corr_matrix = data_log_transform.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
corr_cols = [column for column in upper.columns if any(upper[column] > 0.95)]

In [8]:
corr_cols

['NM_018055',
 'NM_178556',
 'NM_014702',
 'NR_110251',
 'NM_000517',
 'NM_080668',
 'NR_046536',
 'NM_001858',
 'NM_001142557',
 'NR_021489',
 'NM_012400',
 'NM_001004686',
 'NM_001004484',
 'NM_001286725',
 'NR_138471',
 'NM_001190907',
 'NR_132987',
 'NM_007317',
 'NM_001310155',
 'NM_005030',
 'NM_002105',
 'NM_001067',
 'NM_001100391',
 'NM_001286378',
 'NM_206886',
 'NR_038395',
 'NM_001004685',
 'NM_014070',
 'NM_001040694',
 'NM_001345926',
 'NR_031698',
 'NM_182681',
 'NM_001297655',
 'NR_104621',
 'NM_001134888',
 'NM_182508',
 'NR_125902',
 'NR_003923',
 'NM_005420',
 'NM_022809',
 'NR_034161',
 'NR_038331',
 'NM_005913',
 'NM_017779',
 'NM_001305792',
 'NM_199127',
 'NM_003026',
 'NM_001038640',
 'NM_001098173',
 'NR_031753',
 'NR_126008',
 'NM_001308470',
 'NM_001276351',
 'NM_001166131',
 'NM_022346',
 'NR_109777',
 'NM_014865',
 'NR_128706',
 'NM_001195388',
 'NM_001080407',
 'NR_046756',
 'NR_036580',
 'NM_001286123',
 'NR_030411',
 'NR_030361',
 'NM_003035',
 'NM_00100

In [17]:
len(corr_cols)

1608

In [18]:
data_uncorr = data_log_transform.drop(columns=corr_cols, inplace=False)
len(data_uncorr.columns)

21483

## MinMax scaling

In [19]:
data_uncorr_min = data_uncorr.min()
data_uncorr_max = data_uncorr.max()

data_uncorr_minmax = (data_uncorr - data_uncorr_min)/(data_uncorr_max - data_uncorr_min)
data_uncorr_minmax.describe()

Unnamed: 0,NM_173803,NM_014423,NM_001103167,NR_134623,NR_024490,NM_018397,NR_106859,NM_133458,NM_001080424,NM_001271816,...,NM_022495,NR_027141,NM_080390,NM_001177675,NM_001291993,NM_021923,NM_017417,NM_004354,NR_024278,NM_145282
count,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,...,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,0.845681,0.570988,0.689957,0.022284,0.472448,0.511688,0.648298,0.496459,0.511881,0.636003,...,0.5644,0.460483,0.466053,0.099867,0.514268,0.688468,0.175175,0.596859,0.532424,0.041645
std,0.177776,0.160495,0.143315,0.147259,0.220975,0.218995,0.181902,0.205904,0.145155,0.208194,...,0.174068,0.191817,0.378868,0.281645,0.204026,0.185279,0.348955,0.164833,0.182008,0.19248
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.836371,0.472078,0.614323,0.0,0.316183,0.35849,0.541678,0.339535,0.43795,0.484321,...,0.478829,0.314848,0.0,0.0,0.386047,0.604109,0.0,0.499984,0.43335,0.0
50%,0.878758,0.563926,0.707837,0.0,0.460599,0.48621,0.66991,0.492957,0.516427,0.664802,...,0.570956,0.425991,0.701787,0.0,0.543714,0.729975,0.0,0.586164,0.509948,0.0
75%,0.917702,0.672029,0.78167,0.0,0.619675,0.644905,0.770749,0.61806,0.621252,0.81446,...,0.675168,0.554009,0.769654,0.0,0.643857,0.811688,0.0,0.701127,0.652983,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Variance thresholding

In [20]:
from sklearn.feature_selection import VarianceThreshold

var_thresh = VarianceThreshold(0.01)
data_signif_var = var_thresh.fit_transform(data_uncorr)
data_signif_var.shape

(133, 21483)

In [18]:
data_signif_var_cols = var_thresh.get_feature_names_out(data_uncorr.columns)
data_signif_var_cols

array(['NM_014423', 'NM_001103167', 'NR_024490', ..., 'NM_021923',
       'NM_004354', 'NR_024278'], dtype=object)

In [21]:
data_minmax_signif_var = var_thresh.fit_transform(data_uncorr_minmax)
data_minmax_signif_var.shape

(133, 21080)

## Using sklearn's feature selection

This will be more important during the next phase of work, where we (will) train models. These features can then be used to verify if the selection is indeed fruitful for the task at hand. 

In [32]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

feature_selector = SelectKBest(score_func=f_regression, k=1000)
data_condensed = feature_selector.fit_transform(data_uncorr_minmax, labels)
cols_condensed = feature_selector.get_feature_names_out(data_uncorr_minmax.columns)
cols_condensed

array(['NM_014423', 'NM_018397', 'NM_015603', 'NR_134896', 'NM_147150',
       'NM_030791', 'NM_021216', 'NM_001251974', 'NM_030587',
       'NM_001143914', 'NM_006110', 'NM_005439', 'NM_001286810',
       'NM_080626', 'NM_005099', 'NM_006428', 'NM_001168347', 'NM_001233',
       'NR_015449', 'NM_001322212', 'NM_005483', 'NR_126526',
       'NM_001195125', 'NM_015950', 'NM_021964', 'NM_002180', 'NM_014856',
       'NM_005834', 'NM_016061', 'NM_032112', 'NM_001135664', 'NR_031592',
       'NR_033339', 'NM_003713', 'NM_030648', 'NM_175854', 'NM_001203261',
       'NM_001919', 'NM_018178', 'NM_021807', 'NM_001329238', 'NM_021999',
       'NM_001024957', 'NM_175573', 'NM_172232', 'NM_172251', 'NM_004124',
       'NM_001319143', 'NM_002105', 'NM_139178', 'NM_007104', 'NM_053001',
       'NM_145251', 'NM_152230', 'NM_003213', 'NM_018946', 'NM_001288580',
       'NM_018216', 'NM_001826', 'NR_028028', 'NM_001031746',
       'NM_001023563', 'NM_001164239', 'NM_002076', 'NM_032430',
       'NM_0

In [56]:
feature_selector = SelectKBest(score_func=mutual_info_regression, k=50)
data_condensed = feature_selector.fit_transform(data_non_zero_var, labels)
feature_selector.get_feature_names_out(data_non_zero_var.columns)

array(['NM_147150', 'NR_126526', 'NM_001310', 'NM_012106', 'NM_006848',
       'NM_198793', 'NM_018368', 'NM_001267548', 'NM_001321636',
       'NM_019600', 'NM_021076', 'NM_001134396', 'NR_030366',
       'NM_001278728', 'NM_006755', 'NM_001286581', 'NR_037661',
       'NM_001201479', 'NM_015229', 'NM_001288778', 'NM_030665',
       'NM_033285', 'NM_021228', 'NM_001136200', 'NM_014551',
       'NM_001278163', 'NM_013995', 'NM_001136562', 'NM_001300899',
       'NM_144564', 'NM_001007189', 'NM_005632', 'NM_014595',
       'NM_001293234', 'NM_145030', 'NM_012249', 'NM_014205',
       'NM_001142575', 'NM_014713', 'NM_001153', 'NM_002167',
       'NM_001017923', 'NM_004421', 'NM_012234', 'NM_015649', 'NM_022470',
       'NM_016255', 'NM_006114', 'NM_130787', 'NM_000852'], dtype=object)

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector, RFE
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

simple_estimator = LinearRegression()

In [34]:
feature_selector = RFE(estimator=simple_estimator, n_features_to_select=50, step=0.1)
data_condensed_rec = feature_selector.fit_transform(data_uncorr_minmax, labels)
feature_selector.get_feature_names_out(data_uncorr_minmax.columns)

array(['NR_038870', 'NM_004666', 'NM_182970', 'NM_001244752',
       'NM_001271805', 'NR_038912', 'NR_108106', 'NR_027755',
       'NM_001317942', 'NR_029379', 'NM_021018', 'NM_152996', 'NR_109875',
       'NR_132979', 'NM_021193', 'NM_180990', 'NR_120503', 'NR_027374',
       'NR_029378', 'NM_001321827', 'NR_002833', 'NR_003031', 'NR_036251',
       'NM_153038', 'NM_001001671', 'NR_135674', 'NR_135521',
       'NM_001159522', 'NM_019618', 'NR_106914', 'NM_015363', 'NR_120360',
       'NM_005320', 'NM_001672', 'NM_000624', 'NM_001098475', 'NM_181643',
       'NM_014212', 'NM_020997', 'NR_003716', 'NM_001277348', 'NR_038342',
       'NR_027330', 'NM_021046', 'NR_036444', 'NR_027508', 'NR_104164',
       'NR_051984', 'NM_000898', 'NM_001242750'], dtype=object)