## Pre-processing
Reads (currently) the muscle and thyroid expression matrices, cleans them, performs an 80-20 train-test split, and selects the top-1000 genes by univariate correlation with the label (age), computed on the training set.

In [40]:
import numpy as np
import pandas as pd
import sys
import argparse
import matplotlib.pyplot as plt 
%matplotlib inline
plt.style.use('ggplot')
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import os
from sklearn.feature_selection import SelectKBest, f_regression, VarianceThreshold
from sklearn.metrics import mean_squared_error
import math
# Working directory at start-up; the default input/output paths of the argument parser are built from it.
cur_path = os.getcwd()

In [52]:
# Configuration: input expression matrices, phenotype table and output directory.
# Every option defaults to a path under the current working directory.
parser = argparse.ArgumentParser(description='Process display arguments')
# NOTE(review): the value of this option is never read in this file; presumably it
# only exists to swallow the "-f <connection file>" argument a Jupyter kernel is
# started with, so that parse_args() does not reject it -- confirm.
parser.add_argument("-f", "--jupyter-json")
parser.add_argument("-muscle-file", "--muscle-file", default=cur_path+"/data/GTEx_Analysis_v7_eQTL_expression_matrices/Muscle_Skeletal.v7.normalized_expression.bed")
parser.add_argument("-thyroid-file", "--thyroid-file", default=cur_path+"/data/GTEx_Analysis_v7_eQTL_expression_matrices/Thyroid.v7.normalized_expression.bed")
parser.add_argument("-label-file", "--label-file", default=cur_path+"/data/GTEx_v7_Annotations_SubjectPhenotypesDS.txt")
parser.add_argument("-output-dir", "--output-dir", default=cur_path+"/processed_data")
parser.add_argument("-phenotypes-file", "--phenotypes-file", default=cur_path+"/data/GTEx_v7_Annotations_SubjectPhenotypesDS.txt")
args = parser.parse_args()

# The original called os.mkdir(outdir) without ever binding `outdir`, which raised
# NameError whenever the directory was missing. Bind the name explicitly.
outdir = args.output_dir
# makedirs(exist_ok=True) also creates missing parent directories and is a no-op
# when the directory already exists.
os.makedirs(outdir, exist_ok=True)

In [42]:
def gen_matrix(file_path):
    """Load a tab-separated expression table as a numeric rows-by-gene matrix.

    The file is read with every cell as a string, the "#chr", "start" and
    "end" columns are discarded, and "gene_id" becomes the index. The table
    is then transposed, so the remaining input columns become rows and the
    gene ids become columns, and each column is converted with
    ``pd.to_numeric``.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        A DataFrame indexed by the remaining input column names, with one
        numeric column per gene id and no name on the column axis.
    """
    raw = pd.read_csv(file_path, header=0, sep='\t', dtype=str)
    expression = (
        raw.drop(columns=["#chr", "start", "end"])
        .set_index("gene_id")
        .transpose()
    )
    # After the transpose the column axis still carries the "gene_id" label; clear it.
    expression.columns.name = None
    return expression.apply(pd.to_numeric)

# Build one matrix per tissue from the configured input paths (muscle first, then thyroid).
muscle_df, thyroid_df = (
    gen_matrix(path) for path in (args.muscle_file, args.thyroid_file)
)

In [19]:
# Restrict both tissues to the rows and columns they have in common.
# sorted() pins the order: iterating a set of strings can differ from one
# interpreter run to the next (hash randomization), and the unshuffled
# train/test split performed later depends on row order, so without it the
# split membership was not reproducible.
inter_instances = sorted(set(muscle_df.index) & set(thyroid_df.index))
inter_features = sorted(set(muscle_df.columns) & set(thyroid_df.columns))
# Both frames end up with identical row and column order.
muscle_df = muscle_df.loc[inter_instances, inter_features]
thyroid_df = thyroid_df.loc[inter_instances, inter_features]

In [46]:
# Unshuffled 80/20 split of each tissue. shuffle=False keeps rows in their existing
# order, and both frames were reindexed to the same row order above, so the two
# tissues are split at the same rows.
(train_muscle, test_muscle), (train_thyroid, test_thyroid) = (
    train_test_split(frame, test_size=0.2, shuffle=False)
    for frame in (muscle_df, thyroid_df)
)

In [71]:
# Phenotype table, indexed by subject id, with the SEX and DTHHRDY columns removed.
labels = (
    pd.read_csv(args.phenotypes_file, header=0, sep='\t', dtype=str)
    .set_index("SUBJID")
    .drop(columns=["SEX", "DTHHRDY"])
)
labels.index.name = None
# Only the first character of AGE is kept, as an int.
# NOTE(review): presumably AGE is a bracket string such as "20-29", making this the
# decade digit rather than an age in years -- confirm against the phenotype file.
labels["AGE"] = labels["AGE"].map(lambda age_text: int(age_text[0:1]))
# Keep (and order) only the subjects present in both expression matrices.
labels = labels.loc[inter_instances]

In [146]:
def select_features(train_x, train_y, k=517):
    """Return the positions of the k columns scoring highest against the label.

    Columns of ``train_x`` are scored one at a time with ``f_regression``
    against ``train_y`` and the ``k`` best are kept by ``SelectKBest``.

    Args:
        train_x: Training feature matrix (rows are samples, columns are features).
        train_y: Object exposing ``.values`` (e.g. a DataFrame or Series) holding
            the target; it is flattened to 1-D before fitting.
        k: Number of features to keep. Defaults to 517, the value that was
            previously hard-coded.

    Returns:
        Integer array of the selected column positions in ``train_x``.
    """
    selector = SelectKBest(f_regression, k=k)
    selector.fit(train_x, train_y.values.ravel())
    # indices=True yields column positions rather than a boolean mask.
    return selector.get_support(indices=True)

# Select features per tissue using training rows only, with labels aligned on the training index.
muscle_features = select_features(train_muscle, labels.loc[train_muscle.index])
thyroid_features = select_features(train_thyroid, labels.loc[train_thyroid.index])
# Union of the column positions chosen for either tissue. sorted() gives an explicit,
# ascending order instead of leaving the exported column order to set iteration order.
f_features = sorted(set(muscle_features) | set(thyroid_features))

In [153]:
# Write the reduced matrices (selected columns only) and the labels as CSV files.
# The original referenced `outdir` without binding it, which raised NameError;
# take the directory from the parsed arguments and make sure it exists.
outdir = args.output_dir
os.makedirs(outdir, exist_ok=True)

# Output file name -> frame to export; each frame is reduced to the selected column positions.
exports = {
    "full_muscle.csv": muscle_df,
    "train_muscle.csv": train_muscle,
    "test_muscle.csv": test_muscle,
    "full_thyroid.csv": thyroid_df,
    "train_thyroid.csv": train_thyroid,
    "test_thyroid.csv": test_thyroid,
}
for file_name, frame in exports.items():
    frame[frame.columns[f_features]].to_csv(os.path.join(outdir, file_name))

labels.to_csv(os.path.join(outdir, "labels.csv"))