In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif, f_regression

In [2]:
# Percentage of features kept by the univariate feature-selection step.
uni_feat_percent = 30
# Threshold handed to filter_features() as its `min_occurence_ratio` argument.
occurrence_ratio = 0.90

# Create the directory that receives the processed outputs.
# Only the "already exists" case is reported as such; any other OSError
# (e.g. insufficient permissions) now propagates instead of being mislabeled.
try:
    os.mkdir("processed_data")
except FileExistsError:
    print("Directory already exists")

Directory already exists


In [3]:
# Patient identifiers of the two groups: the first column of each CSV,
# converted to strings.
neg_ids, pos_ids = (
    pd.read_csv(ids_path).values[:, 0].astype(str)
    for ids_path in ("input_data/patient_ids_neg.csv", "input_data/patient_ids_pos.csv")
)

In [36]:
def filter_features(X, min_occurence_ratio):
    """Drop constant and zero-heavy feature columns from a samples-by-features table.

    Two column filters are applied in order; the table shape is printed before
    the first filter and after each one.

    1. Constant features: a column is dropped when every row equals the value
       found in the first row.
    2. Zero-heavy features: a column is kept only when its number of exact
       zeros is strictly less than ``n_rows * min_occurence_ratio``.

    NOTE(review): despite its name, ``min_occurence_ratio`` acts as an upper
    bound on the fraction of zero entries. With 0.9, a feature only has to be
    non-zero in more than 10% of the rows, not in at least 90% of them.
    Confirm which of the two was intended.

    Parameters
    ----------
    X : pandas.DataFrame
        Rows are samples, columns are features.
    min_occurence_ratio : float
        Zero-fraction threshold used by the second filter (see above).

    Returns
    -------
    pandas.DataFrame
        The columns of ``X`` that pass both filters; rows are unchanged.
    """
    print("Original dimensions: {}".format(X.shape))

    if X.shape[0] == 0:
        # Without rows there is no first row to compare against (X.iloc[0, :]
        # would raise IndexError); every feature is vacuously constant, so
        # none survive the first filter.
        X = X.iloc[:, :0]
    else:
        is_constant = (X == X.iloc[0, :]).all(axis=0).values
        X = X.iloc[:, ~is_constant]
    print("Dimensions after filtering out constant features: {}".format(X.shape))

    # Keep features whose zero count stays strictly below the threshold.
    zero_counts = (X == 0).sum(axis=0)
    max_zero_count = X.shape[0] * min_occurence_ratio
    X = X.iloc[:, (zero_counts < max_zero_count).values]
    print("Dimensions after filtering based on an occurence threshold of {}: {}".format(min_occurence_ratio, X.shape))

    # TODO(review): earlier notes at this point described an additional filter
    # (dropping rows where fewer than 5% of the counts exceed 20); it is not
    # implemented.
    return X

In [5]:
print('Data loading')
print('')
# Feature table; the first CSV column becomes the index.
X_train_df = pd.read_csv(filepath_or_buffer='input_data/X_train.csv',
                         index_col=0)
# Outcome table, indexed by its 'Heliusnr' column.
y_train_df = pd.read_csv(filepath_or_buffer='input_data/aTPO_no_meds.csv',
                         index_col='Heliusnr')
# Forward-fill missing values down each column. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill'), and reassigning avoids in-place mutation.
# NOTE(review): this runs before the table is transposed in a later cell, so
# gaps are filled from the previous row of the file as loaded - confirm that
# this is the intended direction.
X_train_df = X_train_df.ffill()
print('Finished')

Data loading

Finished


In [9]:
# Swap rows and columns of the feature table: the CSV's column headers become
# the row index. NOTE: re-running this cell flips the table back.
X_train_df = X_train_df.T
# Order the outcome table by its index.
y_train_df = y_train_df.sort_index()

In [10]:
# Select the rows of each patient group by identifier.
X_train_df_pos, X_train_df_neg = (
    X_train_df.loc[group_ids] for group_ids in (pos_ids, neg_ids)
)
# Stack the two groups row-wise: rows selected with pos_ids come first,
# followed by rows selected with neg_ids.
X_train_df_total = pd.concat([X_train_df_pos, X_train_df_neg])

In [38]:
# Remove constant and zero-heavy features from the combined table.
X_train_df_filtered = filter_features(
    X=X_train_df_total,
    min_occurence_ratio=occurrence_ratio,
)

Original dimensions: (286, 21443)
Dimensions after filtering out constant features: (286, 14462)
Dimensions after filtering based on an occurence threshold of 0.9: (286, 1497)


In [40]:
# Binary label vector aligned with the rows of X_train_df_total: label 0 for
# every row of the first concatenated group, label 1 for every row of the
# second. Group sizes are read from the data instead of a hard-coded 143.
# NOTE(review): X_train_df_total is built as [pos, neg], so rows selected with
# pos_ids receive label 0 and rows selected with neg_ids receive label 1.
# Confirm this orientation is intended before y is used downstream.
y_binary = np.concatenate((np.zeros(len(X_train_df_pos)), np.ones(len(X_train_df_neg))))

In [42]:
# Univariate feature selection: score every feature against the binary labels
# with f_classif and keep the top `uni_feat_percent` percent.
# `percentile` is passed by keyword because current scikit-learn declares it
# keyword-only; the positional form raises TypeError there.
b = SelectPercentile(f_classif, percentile=uni_feat_percent)
b.fit(X_train_df_filtered, y_binary)
# Keep only the selected columns, preserving the DataFrame's labels.
X_train_df_fs = X_train_df_filtered.iloc[:, b.get_support(indices=True)]
y = y_binary
# shape[1] is the column count and, unlike len(df.iloc[0]), also works when
# the frame has no rows.
print("Number of features after feature selection: {}".format(X_train_df_fs.shape[1]))

Number of features after feature selection: 449




In [43]:
# Plain NumPy view of the selected features (row/column labels dropped).
X_numpy = X_train_df_fs.to_numpy()

In [48]:
# Persist the selected feature matrix and the label vector as .npy files.
np.save(file="processed_data/X.npy", arr=X_numpy)
np.save(file="processed_data/y.npy", arr=y)
# Also write the labelled table (index and column names included) as CSV.
X_train_df_fs.to_csv(path_or_buf="processed_data/X.csv")

In [46]:
# Show the selected feature matrix (NumPy array) as the cell output.
X_numpy

array([[ 3,  0,  4, ..., 75,  0,  0],
       [ 2,  1,  8, ..., 21,  0,  0],
       [ 0,  0,  0, ..., 20,  0,  1],
       ...,
       [ 0,  0,  0, ...,  6,  0,  0],
       [ 1,  0,  0, ..., 13,  0,  0],
       [ 0,  0, 36, ..., 22,  0,  0]], dtype=int64)

In [47]:
# Show the selected feature table as the cell output.
# NOTE(review): the rendered output includes the row identifiers; check
# whether they may be shared before committing the notebook with outputs.
X_train_df_fs

Unnamed: 0,Zotu6706,Zotu547,Zotu85,Zotu77,Zotu96,Zotu170,Zotu18330,Zotu26,Zotu41,Zotu16290,...,Zotu57,Zotu760,Zotu389,Zotu568,Zotu18600,Zotu736,Zotu574,Zotu127,Zotu10619,Zotu10209
210006,3,0,4,1,0,28,0,27,11,0,...,56,0,2,0,0,0,0,75,0,0
147900,2,1,8,4,14,0,0,105,13,0,...,28,0,5,0,0,2,4,21,0,0
126264,0,0,0,27,95,75,0,11,105,0,...,25,0,5,0,0,0,1,20,0,1
220555,0,0,0,8,30,0,0,7,22,0,...,70,0,0,0,0,0,1,15,0,0
101565,0,0,24,12,4,0,0,34,1,0,...,91,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128288,0,9,5,1,140,0,3,114,1745,0,...,5,0,4,2,0,0,0,8,0,0
230268,1,0,0,0,11,0,0,11,51,0,...,222,0,1,0,1,0,0,20,0,2
228433,0,0,0,0,0,0,0,1,32,0,...,13,0,0,0,0,0,0,6,0,0
153854,1,0,0,27,6,0,0,28,29,0,...,52,0,0,0,0,0,0,13,0,0
