In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

In [2]:
# Folder holding the raw input tables, relative to the notebook.
dir_path = './data/'

# One CSV per omics layer plus the drug-response table. In every file the
# first (unnamed) column is read as 'Unnamed: 0' and becomes the row index,
# which later cells treat as the sample identifier.
omics_dir = ['methylation.csv', 'expression.csv', 'cnv.csv', 'mutation.csv']
drug_dir = 'drug_response(IC50).csv'

data_omics = [
    pd.read_csv(dir_path + file_name).set_index('Unnamed: 0')
    for file_name in omics_dir
]
data_drug = pd.read_csv(dir_path + drug_dir).set_index('Unnamed: 0')

# Overlapping set of samples

In [3]:
# Gather the sample ids (row index) of every omics table and of the drug table.
sample_list = [frame.index.to_list() for frame in data_omics]
sample_list.append(data_drug.index.to_list())

# Samples present in every table. No scaling/normalisation happens here.
sample_set = set(sample_list[0]).intersection(*sample_list[1:])

# Keep only the shared samples in each table and sort rows by sample id.
shared_samples = list(sample_set)
data_omics = [
    frame.loc[frame.index.isin(shared_samples)].sort_index()
    for frame in data_omics
]
data_drug = data_drug.loc[data_drug.index.isin(shared_samples)].sort_index()

# Report the resulting shapes (omics tables first, drug table as cell output).
for frame in data_omics:
    print(frame.shape)
data_drug.shape

(543, 20219)
(543, 19160)
(543, 21840)
(543, 445)


(543, 310)

# Filter features with more than 30% missing values (the per-sample filter is defined but not applied)

In [4]:
def get_fewer_samples(data, threshold):
    """Return a copy of ``data`` without rows that have too many missing values.

    A row is removed when its count of missing values is strictly greater
    than ``threshold`` times the number of columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose rows are filtered.
    threshold : float
        Largest tolerated fraction of missing values in a single row.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the retained rows, in their original order.
        ``data`` itself is not modified.
    """
    # Count missing values for all rows at once instead of looping with iloc.
    missing_per_row = data.isna().sum(axis=1)
    keep = (missing_per_row <= data.shape[1] * threshold).to_numpy()
    # A positional boolean mask judges every row on its own; dropping by label
    # would also remove clean rows that share an index label with a bad one.
    return data.loc[keep].copy()

def get_fewer_feature(data, threshold):
    """Return a copy of ``data`` without columns that have too many missing values.

    A column is removed when its count of missing values is strictly greater
    than ``threshold`` times the number of rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are filtered.
    threshold : float
        Largest tolerated fraction of missing values in a single column.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the retained columns, in their original order.
        ``data`` itself is not modified.
    """
    # One vectorised pass over all columns; dropping columns one at a time
    # re-copies the whole frame for every removed feature.
    missing_per_column = data.isna().sum()
    keep = (missing_per_column <= len(data) * threshold).to_numpy()
    # Positional mask so that repeated column labels are handled per column.
    return data.loc[:, keep].copy()

# Largest tolerated fraction of missing values per feature.
threshold = 0.3

# Remove sparsely measured features from every omics table and the drug table.
data_omics = [get_fewer_feature(frame, threshold) for frame in data_omics]
data_drug = get_fewer_feature(data_drug, threshold)

# Report the resulting shapes (omics tables first, drug table as cell output).
for frame in data_omics:
    print(frame.shape)
data_drug.shape

(543, 19412)
(543, 19160)
(543, 21840)
(543, 445)


(543, 31)

# Delete zero-variance (constant) features

In [5]:
def Variance_FS(data, thres):
    """Keep only the columns of ``data`` selected by a variance threshold.

    Fits ``VarianceThreshold(threshold=thres)`` on ``data`` and returns the
    columns that the fitted selector marks as supported.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table to filter.
    thres : float
        Threshold passed to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        New frame with the supported columns, in their original order.
    """
    selector = VarianceThreshold(threshold=thres)
    selector.fit(data)
    # get_support() yields one boolean per column of ``data``; selecting with
    # it directly avoids rebuilding the kept-column index once per column.
    return data.loc[:, selector.get_support()]

# Run the variance filter with threshold 0 on every omics table; the drug
# table is not filtered here.
data_omics = [Variance_FS(frame, 0) for frame in data_omics]

# Report the resulting shapes (omics tables first, drug table as cell output).
for frame in data_omics:
    print(frame.shape)
data_drug.shape

(543, 19412)
(543, 19144)
(543, 21840)
(543, 223)


(543, 31)

In [6]:
# Summary statistics of each omics table and of the drug table.
data_omics_des = [frame.describe() for frame in data_omics]
data_drug_des = data_drug.describe()

# Missing values are imputed with the per-feature mean

In [7]:
# Fill each missing entry of the omics tables with the mean of its column.
data_omics = [frame.fillna(frame.mean()) for frame in data_omics]

In [8]:
import os

# Create the output folder. exist_ok=True replaces the separate existence
# check, so there is no gap between testing for the folder and creating it.
out_dir = dir_path + 'preprocessing'
os.makedirs(out_dir, exist_ok=True)

# Write the processed omics tables. The names follow the order in which the
# tables were loaded into data_omics (methylation, expression, cnv, mutation).
omics_out_names = ['methylation.csv', 'expression.csv', 'cnv.csv', 'mutation.csv']
for position, file_name in enumerate(omics_out_names):
    data_omics[position].to_csv(out_dir + '/' + file_name)

# Write the processed drug-response table.
data_drug.to_csv(out_dir + '/IC50(log).csv')