In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif, f_regression

In [2]:
# Percentage of features kept by the univariate selection step
# (passed to SelectPercentile further down).
uni_feat_percent = 30
# Threshold handed to filter_features as its min_occurence_ratio argument.
occurrence_ratio = 0.90

# Create the output directory on first run. The existence check is explicit so
# that real failures (permissions, read-only file system, a regular file with
# the same name, ...) surface as errors instead of being reported as
# "Directory already exists".
if os.path.isdir("processed_data"):
    print("Directory already exists")
else:
    os.mkdir("processed_data")

Directory already exists


In [3]:
# Read both ID tables first, then take the first column of each as an array of
# strings; these arrays are used as row labels when selecting samples.
neg_id_table = pd.read_csv("input_data/patient_ids_neg.csv")
pos_id_table = pd.read_csv("input_data/patient_ids_pos.csv")

neg_ids = neg_id_table.values[:, 0].astype(str)
pos_ids = pos_id_table.values[:, 0].astype(str)

In [4]:
def filter_features(X, min_occurence_ratio):
    """Drop uninformative feature columns from a samples-by-features matrix.

    Two filters are applied in order, and the matrix shape is printed before
    and after each of them:

    1. Columns whose entries all equal the entry in the first row (constant
       features) are removed.
    2. Of the remaining columns, only those whose number of zero entries is
       strictly below ``n_rows * min_occurence_ratio`` are kept.

    Note that, despite its name, ``min_occurence_ratio`` bounds the share of
    *zeros*: with a value of 0.9 a feature survives when it is non-zero in
    more than 10% of the rows.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; converted with ``np.asarray``.
    min_occurence_ratio : float
        Fraction of the row count used as the exclusive upper bound on the
        number of zeros a column may contain.

    Returns
    -------
    numpy.ndarray
        ``X`` restricted to the surviving columns, in their original order.
        A matrix with zero rows yields a result with zero columns.
    """
    X = np.asarray(X)
    print("Original dimensions: {}".format(X.shape))
    # Filters out the features that are constant. Slicing the first row as
    # X[:1] (rather than X[0, :]) broadcasts identically but does not raise
    # when X has no rows.
    is_constant = np.all(X == X[:1], axis=0)
    X = X[:, ~is_constant]
    print("Dimensions after filtering out constant features: {}".format(X.shape))
    # Keeps the features whose zero count stays strictly below
    # n_rows * min_occurence_ratio.
    zero_counts = (X == 0).sum(axis=0)
    X = X[:, zero_counts < X.shape[0] * min_occurence_ratio]
    print("Dimensions after filtering based on an occurence threshold of {}: {}".format(min_occurence_ratio, X.shape))
    # NOTE: no per-sample (row-wise) count filtering is performed here; only
    # columns are ever removed.
    return X

In [5]:
print('Data loading')
print('')
# Feature table: the first CSV column becomes the index.
X_train_df = pd.read_csv(filepath_or_buffer='input_data/X_train.csv',
                         index_col=0)
# Outcome table, indexed by the 'Heliusnr' column.
y_train_df = pd.read_csv(filepath_or_buffer='input_data/aTPO_no_meds.csv',
                         index_col='Heliusnr')
# Forward-fill missing values down each column (pandas' default axis).
# DataFrame.ffill() is the supported spelling of fillna(method='ffill'), which
# is deprecated in current pandas; rebinding instead of inplace=True gives the
# same resulting frame.
X_train_df = X_train_df.ffill()
print('Finished')

Data loading

Finished


In [6]:
# Swap rows and columns of the feature table so that the former column labels
# become the row index.
# NOTE: this rebinding is not idempotent -- running the cell a second time
# without reloading the data flips the table back.
X_train_df = X_train_df.T
# Order the outcome table by its row index (sort_index works on rows by default).
y_train_df = y_train_df.sort_index()

In [7]:
# Pull out the rows labelled by each ID list, then stack them vertically:
# rows selected via pos_ids come first, rows selected via neg_ids second.
X_train_df_pos = X_train_df.loc[pos_ids, :]
X_train_df_neg = X_train_df.loc[neg_ids, :]
X_train_df_total = pd.concat((X_train_df_pos, X_train_df_neg), axis="index")

In [8]:
# One label per row of X_train_df_total, which stacks the pos_ids rows first
# and the neg_ids rows second. The group sizes are read from the frames rather
# than hard-coded (they were fixed at 143 each), so the label vector cannot
# drift out of step with the feature matrix if the ID lists change.
# NOTE(review): the first block (rows selected via pos_ids) is labelled 0 and
# the second block (neg_ids) is labelled 1 -- confirm this encoding is intended.
y_binary = np.concatenate((np.zeros(len(X_train_df_pos)), np.ones(len(X_train_df_neg))))

In [9]:
# Plain NumPy view of the stacked feature table (rows = samples, in the same
# order as X_train_df_total).
X_full = X_train_df_total.to_numpy()

In [10]:
# Filtering out features
# filter_features returns only the reduced matrix, not which columns survived.
# The same keep-mask (non-constant columns whose zero count is below
# n_rows * occurrence_ratio) is therefore rebuilt here and applied to the
# DataFrame as well. Without this, column positions that index into the
# filtered matrix would later be looked up against the unfiltered column list
# and point at the wrong feature names.
raw_values = X_train_df_total.values
kept_columns = ~np.all(raw_values == raw_values[0, :], axis=0)
kept_columns[kept_columns] = (
    (raw_values[:, kept_columns] == 0).sum(axis=0)
    < raw_values.shape[0] * occurrence_ratio
)

X_full = filter_features(X_full, occurrence_ratio)

# Guard against the mask and filter_features disagreeing.
assert X_full.shape[1] == int(kept_columns.sum()), (
    "column mask does not match the output of filter_features"
)
# Keep the frame's columns aligned one-to-one with the columns of X_full.
# Filtering an already-filtered frame keeps every column, so re-running this
# cell is harmless.
X_train_df_total = X_train_df_total.loc[:, kept_columns]

Original dimensions: (286, 21443)
Dimensions after filtering out constant features: (286, 14462)
Dimensions after filtering based on an occurence threshold of 0.9: (286, 1497)


In [11]:
# Containers for the outcome of the feature-selection step; each key starts
# out with its own empty list.
selected_features = {"Features": [], "Scores": [], "p_values": []}

list_feat_names = []
list_feat_importances = []

# Containers for ROC/AUC results.
auc_dict = {"TPR": [], "FPR": [], "Thresholds": [], "AUC": []}

In [12]:
# Univariate selection: keep the top `uni_feat_percent` percent of the columns
# of X_full, ranked by the f_classif score against y_binary.
# `percentile` is passed by keyword: current scikit-learn releases accept it
# only as a keyword argument.
b = SelectPercentile(f_classif, percentile=uni_feat_percent)
b.fit(X_full, y_binary)

# Positions of the selected columns, computed once and reused below.
support_idx = b.get_support(indices=True)

X = X_full[:, support_idx]
y = y_binary

# NOTE(review): support_idx holds column positions within X_full, so the name
# lookup below is only meaningful while X_train_df_total has exactly the same
# column layout as X_full.
selected_features["Features"] = X_train_df_total.columns[support_idx]
feat_names = selected_features["Features"]
selected_features["Scores"] = b.scores_[support_idx]
selected_features["p_values"] = b.pvalues_[support_idx]
print("Number of features after feature selection: {}".format(X.shape[1]))

Number of features after feature selection: 449




In [15]:
# Labelled (DataFrame) counterpart of the selected feature matrix: take the
# columns of the stacked table at the positions chosen by the selector.
selected_positions = b.get_support(indices=True)
X_train_df = X_train_df_total.iloc[:, selected_positions]

In [17]:
# Persist the selected feature matrix and the labels as .npy files ...
for npy_path, array in (("processed_data/X.npy", X), ("processed_data/y.npy", y)):
    np.save(npy_path, array)
# ... and the labelled version of the selected features as CSV.
X_train_df.to_csv("processed_data/X.csv")