## Data import

In [1]:
import pandas as pd
import numpy as np

# Load the tab-separated dataset from the repo-relative data directory.
# `delimiter` is pandas' alias for `sep`.
dataset = pd.read_csv(
    '../data/dataset.tsv',
    delimiter='\t',
)

In [2]:
# Separate the prediction target (`age`) from the feature matrix (all other
# columns). `drop` returns a new frame, so `dataset` itself is left untouched.
labels = dataset['age']
data = dataset.drop(columns='age')

In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the
# feature matrix, used as a quick sanity check of value ranges.
data.describe()

Unnamed: 0,NM_173803,NM_014423,NM_001103167,NR_134623,NR_024490,NM_018397,NM_001037671,NR_106859,NM_133458,NM_001080424,...,NR_029666,NM_080390,NM_001177675,NR_125786,NM_001291993,NM_021923,NM_017417,NM_004354,NR_024278,NM_145282
count,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,...,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,0.124474,18.345647,0.414218,0.00197,1.006414,0.547263,0.0,27.668767,6.133226,1.655526,...,0.0,0.098594,0.003947,0.0,5.590233,23.029218,0.005812,6.852872,1.189158,0.002639
std,0.078508,3.426647,0.171433,0.01309,0.316317,0.808279,0.0,7.176748,1.397833,0.531323,...,0.0,0.209719,0.012712,0.0,1.114522,10.267254,0.013509,2.82179,0.450767,0.013269
min,0.0,9.253,0.042,0.0,0.505,0.014,0.0,9.928,3.503,0.518,...,0.0,0.0,0.0,0.0,3.285,2.703,0.0,1.505,0.392,0.0
25%,0.069,16.064,0.297,0.0,0.777,0.106,0.0,22.684,5.052,1.343,...,0.0,0.0,0.0,0.0,4.823,15.905,0.0,5.019,0.918,0.0
50%,0.108,17.884,0.4,0.0,0.946,0.218,0.0,27.585,5.961,1.593,...,0.0,0.046,0.0,0.0,5.642,23.009,0.0,6.177,1.067,0.0
75%,0.163,20.292,0.506,0.0,1.175,0.534,0.0,32.172,6.822,2.001,...,0.0,0.104,0.0,0.0,6.233,29.242,0.0,8.148,1.413,0.0
max,0.389,29.769,1.014,0.097,1.973,3.964,0.0,45.641,10.299,4.561,...,0.0,1.657,0.089,0.0,8.883,50.808,0.091,16.739,2.793,0.112


In [3]:
# Number of feature columns before any filtering.
data.shape[1]

27142

## Removal of useless features

In [4]:
# Identify features that carry no information: columns whose smallest and
# largest observed values coincide (NaNs are skipped by min/max).
#
# This replaces an exact `data.std() == 0.0` comparison. The standard deviation
# of a constant non-zero column is not guaranteed to be exactly 0.0 because the
# column mean is subject to floating-point rounding, so such columns could slip
# through. Comparing min and max involves no arithmetic and is therefore exact.
# Columns that are entirely NaN are not flagged (NaN == NaN is False), which
# matches the previous std-based behaviour.
const_cols = data.columns[data.min() == data.max()]

In [5]:
# Feature matrix without the constant columns. `drop` returns a copy, so `data`
# keeps all original columns for the later cells that still use it.
data_non_zero_var = data.drop(columns=const_cols)

In [30]:
# Number of columns for which a mean can be computed on the *unfiltered* frame.
# NOTE(review): this inspects `data`, not `data_non_zero_var`, so it does not
# show the effect of removing the constant columns -- confirm which was intended.
data.mean().size

27142

## Setting a variance threshold 

In [21]:
from sklearn.feature_selection import VarianceThreshold

# Keep only features whose variance exceeds 0.01. The selector is fitted on the
# full feature matrix `data`; a positive threshold also removes the constant
# columns. Note that the cut-off is applied to the raw, unscaled values.
var_thresh = VarianceThreshold(threshold=0.01)
data_signif_var = var_thresh.fit(data).transform(data)

In [22]:
# (number of samples, number of features that passed the variance threshold)
data_signif_var.shape

(133, 15849)

In [24]:
# Recover the names of the columns retained by the variance filter; the
# transformed matrix is a bare array and no longer carries column labels.
cols_signif_var = var_thresh.get_feature_names_out(input_features=data.columns)
cols_signif_var

array(['NM_014423', 'NM_001103167', 'NR_024490', ..., 'NM_021923',
       'NM_004354', 'NR_024278'], dtype=object)

## Using sklearn's feature selection

This will become more important in the next phase of work, when we train models. The selected features can then be used to verify whether the selection is actually useful for the task at hand.

In [55]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# Univariate selection: score every non-constant feature against the target
# with f_regression and keep the 50 best-scoring ones.
feature_selector = SelectKBest(f_regression, k=50)
data_condensed = feature_selector.fit(data_non_zero_var, labels).transform(data_non_zero_var)
# Display the names of the selected features.
feature_selector.get_feature_names_out(input_features=data_non_zero_var.columns)

array(['NM_147150', 'NM_005439', 'NR_033339', 'NM_139178', 'NR_028028',
       'NM_001310', 'NM_015713', 'NM_001101801', 'NM_012106', 'NM_002028',
       'NM_001146032', 'NR_037425', 'NM_019600', 'NM_002035', 'NM_005659',
       'NM_017789', 'NM_002952', 'NM_173647', 'NM_001278728', 'NM_006755',
       'NM_183352', 'NM_001102575', 'NM_001202559', 'NM_001284368',
       'NM_015055', 'NM_004547', 'NM_002311', 'NM_001199933',
       'NM_001317743', 'NM_016404', 'NM_006351', 'NM_003941',
       'NM_001134231', 'NM_138554', 'NM_014372', 'NM_175617', 'NM_016217',
       'NM_001136562', 'NM_001199662', 'NM_001300899', 'NM_014595',
       'NM_004911', 'NM_001145337', 'NM_015188', 'NM_006858', 'NM_001023',
       'NM_022470', 'NM_001287222', 'NM_030571', 'NM_000852'],
      dtype=object)

In [56]:
# Univariate selection: score every non-constant feature by its estimated
# mutual information with the target and keep the 50 best-scoring ones.
#
# mutual_info_regression is randomised (it adds small noise to continuous
# features), so without a fixed random_state the selected set can change from
# run to run. Binding random_state makes the cell reproducible under
# "Restart & Run All".
#
# NOTE: `feature_selector` and `data_condensed` are reused from the
# f_regression cell, so that cell's results are overwritten here.
from functools import partial

mi_score = partial(mutual_info_regression, random_state=0)
feature_selector = SelectKBest(score_func=mi_score, k=50)
data_condensed = feature_selector.fit_transform(data_non_zero_var, labels)
# Display the names of the selected features.
feature_selector.get_feature_names_out(data_non_zero_var.columns)

array(['NM_147150', 'NR_126526', 'NM_001310', 'NM_012106', 'NM_006848',
       'NM_198793', 'NM_018368', 'NM_001267548', 'NM_001321636',
       'NM_019600', 'NM_021076', 'NM_001134396', 'NR_030366',
       'NM_001278728', 'NM_006755', 'NM_001286581', 'NR_037661',
       'NM_001201479', 'NM_015229', 'NM_001288778', 'NM_030665',
       'NM_033285', 'NM_021228', 'NM_001136200', 'NM_014551',
       'NM_001278163', 'NM_013995', 'NM_001136562', 'NM_001300899',
       'NM_144564', 'NM_001007189', 'NM_005632', 'NM_014595',
       'NM_001293234', 'NM_145030', 'NM_012249', 'NM_014205',
       'NM_001142575', 'NM_014713', 'NM_001153', 'NM_002167',
       'NM_001017923', 'NM_004421', 'NM_012234', 'NM_015649', 'NM_022470',
       'NM_016255', 'NM_006114', 'NM_130787', 'NM_000852'], dtype=object)