## Data import

In [1]:
from functools import partial

import numpy as np
import pandas as pd

# Load the tab-separated dataset from the project's data directory.
dataset = pd.read_csv(
    filepath_or_buffer='../data/dataset.tsv',
    delimiter='\t',
)

In [2]:
# Target: the `age` column. Predictors: every other column of the dataset.
labels = dataset['age']
data = dataset.drop(columns='age')

In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()

Unnamed: 0,NM_173803,NM_014423,NM_001103167,NR_134623,NR_024490,NM_018397,NM_001037671,NR_106859,NM_133458,NM_001080424,...,NR_029666,NM_080390,NM_001177675,NR_125786,NM_001291993,NM_021923,NM_017417,NM_004354,NR_024278,NM_145282
count,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,...,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,0.124474,18.345647,0.414218,0.00197,1.006414,0.547263,0.0,27.668767,6.133226,1.655526,...,0.0,0.098594,0.003947,0.0,5.590233,23.029218,0.005812,6.852872,1.189158,0.002639
std,0.078508,3.426647,0.171433,0.01309,0.316317,0.808279,0.0,7.176748,1.397833,0.531323,...,0.0,0.209719,0.012712,0.0,1.114522,10.267254,0.013509,2.82179,0.450767,0.013269
min,0.0,9.253,0.042,0.0,0.505,0.014,0.0,9.928,3.503,0.518,...,0.0,0.0,0.0,0.0,3.285,2.703,0.0,1.505,0.392,0.0
25%,0.069,16.064,0.297,0.0,0.777,0.106,0.0,22.684,5.052,1.343,...,0.0,0.0,0.0,0.0,4.823,15.905,0.0,5.019,0.918,0.0
50%,0.108,17.884,0.4,0.0,0.946,0.218,0.0,27.585,5.961,1.593,...,0.0,0.046,0.0,0.0,5.642,23.009,0.0,6.177,1.067,0.0
75%,0.163,20.292,0.506,0.0,1.175,0.534,0.0,32.172,6.822,2.001,...,0.0,0.104,0.0,0.0,6.233,29.242,0.0,8.148,1.413,0.0
max,0.389,29.769,1.014,0.097,1.973,3.964,0.0,45.641,10.299,4.561,...,0.0,1.657,0.089,0.0,8.883,50.808,0.091,16.739,2.793,0.112


In [3]:
# Number of feature columns before any filtering.
data.shape[1]

27142

## Removal of constant (zero-variance) features

In [4]:
# A column is constant exactly when its smallest and largest values coincide.
# Comparing min and max is an exact test, whereas `std() == 0.0` relies on a
# floating-point result: for a constant non-zero column the computed standard
# deviation can come out as a tiny positive number instead of exactly 0.0,
# and such a column would then not be detected.
# Like `std()`, `min()`/`max()` skip NaN, and an all-NaN column is not flagged
# (NaN == NaN is False).
const_cols = data.columns[(data.min() == data.max()).to_numpy()]

In [5]:
# New frame without the constant columns; `data` itself is left untouched.
data_non_zero_var = data.drop(columns=const_cols)

In [11]:
# Number of columns left after dropping the constant ones.
data_non_zero_var.shape[1]

23091

## Setting a variance threshold (on the raw data)

In [21]:
from sklearn.feature_selection import VarianceThreshold

# Keep only the columns of the raw `data` whose variance exceeds 0.01.
# NOTE(review): `var_thresh` and `data_signif_var` are assigned again in the
# "Variance thresholding" section further down, so after a full run these
# names no longer refer to the objects created here.
var_thresh = VarianceThreshold(threshold=0.01)
data_signif_var = var_thresh.fit_transform(data)

In [22]:
# (rows, retained columns) of the variance-filtered array.
data_signif_var.shape

(133, 15849)

In [24]:
# Map the selector's retained columns back to their names in `data`.
cols_signif_var = var_thresh.get_feature_names_out(input_features=data.columns)
cols_signif_var

array(['NM_014423', 'NM_001103167', 'NR_024490', ..., 'NM_021923',
       'NM_004354', 'NR_024278'], dtype=object)

## Removing correlated features

In [7]:
# Absolute pairwise correlation between all non-constant features.
corr_matrix = data_non_zero_var.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair
# of features is looked at once; every other cell becomes NaN. The mask is
# created as bool directly rather than as a float matrix that is cast later.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Flag a column when its absolute correlation with any column that precedes it
# exceeds 0.95. NaN cells compare as False, so they never trigger a flag.
# The column-wise `.any(axis=0)` is vectorized; the builtin `any()` over each
# Series iterated element by element in Python.
corr_cols = upper.columns[(upper > 0.95).any(axis=0).to_numpy()].tolist()

In [8]:
# Names of the columns flagged as highly correlated with another column.
corr_cols

['NM_018055',
 'NM_178556',
 'NM_014702',
 'NR_110251',
 'NM_000517',
 'NM_080668',
 'NR_046536',
 'NM_001858',
 'NM_001142557',
 'NR_021489',
 'NM_012400',
 'NM_001004686',
 'NM_001004484',
 'NM_001286725',
 'NR_138471',
 'NM_001190907',
 'NR_132987',
 'NM_007317',
 'NM_001310155',
 'NM_005030',
 'NM_002105',
 'NM_001067',
 'NM_001100391',
 'NM_001286378',
 'NM_206886',
 'NR_038395',
 'NM_001004685',
 'NM_014070',
 'NM_001040694',
 'NM_001345926',
 'NR_031698',
 'NM_182681',
 'NM_001297655',
 'NR_104621',
 'NM_001134888',
 'NM_182508',
 'NR_125902',
 'NR_003923',
 'NM_005420',
 'NM_022809',
 'NR_034161',
 'NR_038331',
 'NM_005913',
 'NM_017779',
 'NM_001305792',
 'NM_199127',
 'NM_003026',
 'NM_001038640',
 'NM_001098173',
 'NR_031753',
 'NR_126008',
 'NM_001308470',
 'NM_001276351',
 'NM_001166131',
 'NM_022346',
 'NR_109777',
 'NM_014865',
 'NR_128706',
 'NM_001195388',
 'NM_001080407',
 'NR_046756',
 'NR_036580',
 'NM_001286123',
 'NR_030411',
 'NR_030361',
 'NM_003035',
 'NM_00100

In [9]:
# How many columns were flagged as highly correlated.
len(corr_cols)

1605

In [10]:
# Remove the flagged columns (returns a new frame) and report how many remain.
data_uncorr = data_non_zero_var.drop(columns=corr_cols)
data_uncorr.shape[1]

21486

## MinMax scaling

In [14]:
# Per-column extremes of the de-correlated data.
data_uncorr_min, data_uncorr_max = data_uncorr.min(), data_uncorr.max()

# Rescale every column to [0, 1]: subtract the column minimum, then divide by
# the column range (max - min).
data_uncorr_range = data_uncorr_max - data_uncorr_min
data_uncorr_minmax = (data_uncorr - data_uncorr_min) / data_uncorr_range
data_uncorr_minmax.describe()

Unnamed: 0,NM_173803,NM_014423,NM_001103167,NR_134623,NR_024490,NM_018397,NR_106859,NM_133458,NM_001080424,NM_001271816,...,NM_022495,NR_027141,NM_080390,NM_001177675,NM_001291993,NM_021923,NM_017417,NM_004354,NR_024278,NM_145282
count,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,...,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,0.319984,0.443198,0.38294,0.020309,0.341562,0.135003,0.496759,0.387026,0.281357,0.498178,...,0.44007,0.335555,0.059501,0.044352,0.411796,0.422539,0.063868,0.351048,0.332011,0.023563
std,0.201821,0.167023,0.176371,0.134948,0.215475,0.204628,0.200956,0.205685,0.131418,0.225383,...,0.173487,0.188993,0.126566,0.142832,0.199093,0.213434,0.148454,0.18523,0.187741,0.118473
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.177378,0.331985,0.262346,0.0,0.185286,0.023291,0.357181,0.227928,0.204056,0.312448,...,0.339788,0.195061,0.0,0.0,0.274741,0.274441,0.0,0.230668,0.219075,0.0
50%,0.277635,0.420696,0.368313,0.0,0.300409,0.051646,0.494414,0.361683,0.265892,0.497494,...,0.429577,0.284113,0.027761,0.0,0.421043,0.422118,0.0,0.306682,0.281133,0.0
75%,0.419023,0.538068,0.477366,0.0,0.456403,0.131646,0.622854,0.488376,0.366807,0.692147,...,0.543346,0.403015,0.062764,0.0,0.526617,0.551689,0.0,0.436064,0.425239,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Variance thresholding

In [15]:
from sklearn.feature_selection import VarianceThreshold

# Fit a variance filter (cutoff 0.01) on the unscaled, de-correlated data and
# show the shape of the reduced array.
var_thresh = VarianceThreshold(threshold=0.01)
data_signif_var = var_thresh.fit_transform(data_uncorr)
data_signif_var.shape

(133, 15330)

In [18]:
# Names of the `data_uncorr` columns retained by the fitted selector.
data_signif_var_cols = var_thresh.get_feature_names_out(input_features=data_uncorr.columns)
data_signif_var_cols

array(['NM_014423', 'NM_001103167', 'NR_024490', ..., 'NM_021923',
       'NM_004354', 'NR_024278'], dtype=object)

In [19]:
# Use a dedicated selector for the min-max scaled data. Refitting the shared
# `var_thresh` here would overwrite the state fitted on `data_uncorr`, so any
# later call to `var_thresh.get_feature_names_out(data_uncorr.columns)` would
# silently describe the scaled data instead.
# NOTE(review): the same 0.01 cutoff is applied to both the unscaled and the
# [0, 1]-scaled data; confirm that this is intended.
var_thresh_minmax = VarianceThreshold(0.01)
data_minmax_signif_var = var_thresh_minmax.fit_transform(data_uncorr_minmax)
data_minmax_signif_var.shape

(133, 21034)

## Using sklearn's feature selection

This step will become more important in the next phase of the work, where we will train models: the features selected here can then be used to check whether the selection actually helps with the task at hand (regression on `age`).

In [55]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# Rank the non-constant features against `labels` with the univariate
# f_regression score and keep the 50 best-scoring ones.
feature_selector = SelectKBest(f_regression, k=50)
data_condensed = feature_selector.fit_transform(data_non_zero_var, labels)

# Names of the selected columns.
feature_selector.get_feature_names_out(input_features=data_non_zero_var.columns)

array(['NM_147150', 'NM_005439', 'NR_033339', 'NM_139178', 'NR_028028',
       'NM_001310', 'NM_015713', 'NM_001101801', 'NM_012106', 'NM_002028',
       'NM_001146032', 'NR_037425', 'NM_019600', 'NM_002035', 'NM_005659',
       'NM_017789', 'NM_002952', 'NM_173647', 'NM_001278728', 'NM_006755',
       'NM_183352', 'NM_001102575', 'NM_001202559', 'NM_001284368',
       'NM_015055', 'NM_004547', 'NM_002311', 'NM_001199933',
       'NM_001317743', 'NM_016404', 'NM_006351', 'NM_003941',
       'NM_001134231', 'NM_138554', 'NM_014372', 'NM_175617', 'NM_016217',
       'NM_001136562', 'NM_001199662', 'NM_001300899', 'NM_014595',
       'NM_004911', 'NM_001145337', 'NM_015188', 'NM_006858', 'NM_001023',
       'NM_022470', 'NM_001287222', 'NM_030571', 'NM_000852'],
      dtype=object)

In [56]:
# mutual_info_regression is stochastic (it adds small random noise to
# continuous features), so without a fixed seed the 50 selected features can
# change from one run to the next. Binding `random_state` makes the selection
# reproducible under Restart & Run All.
feature_selector = SelectKBest(
    score_func=partial(mutual_info_regression, random_state=0),
    k=50,
)
data_condensed = feature_selector.fit_transform(data_non_zero_var, labels)

# Names of the selected columns.
feature_selector.get_feature_names_out(data_non_zero_var.columns)

array(['NM_147150', 'NR_126526', 'NM_001310', 'NM_012106', 'NM_006848',
       'NM_198793', 'NM_018368', 'NM_001267548', 'NM_001321636',
       'NM_019600', 'NM_021076', 'NM_001134396', 'NR_030366',
       'NM_001278728', 'NM_006755', 'NM_001286581', 'NR_037661',
       'NM_001201479', 'NM_015229', 'NM_001288778', 'NM_030665',
       'NM_033285', 'NM_021228', 'NM_001136200', 'NM_014551',
       'NM_001278163', 'NM_013995', 'NM_001136562', 'NM_001300899',
       'NM_144564', 'NM_001007189', 'NM_005632', 'NM_014595',
       'NM_001293234', 'NM_145030', 'NM_012249', 'NM_014205',
       'NM_001142575', 'NM_014713', 'NM_001153', 'NM_002167',
       'NM_001017923', 'NM_004421', 'NM_012234', 'NM_015649', 'NM_022470',
       'NM_016255', 'NM_006114', 'NM_130787', 'NM_000852'], dtype=object)