# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import copy
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import LeaveOneGroupOut
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Reference snippet (kept disabled) showing how data can be written to disk with pickle:
# pickle_out = open("datasets_cleaned.pkl","wb")
# pickle.dump(datasets1, pickle_out)
# pickle_out.close()
def load_obj(name):
    '''
    Unpickle and return the object stored in the file `<name>.pkl`.

    name: path of the pickle file, given without its '.pkl' extension
    Returns the object that was pickled into that file.
    '''
    path = name + '.pkl'
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    with open(path, 'rb') as handle:
        obj = pickle.load(handle)
    return obj

## Loading Data

In [2]:
# CSV files to load, one per dataset; the names are relative paths.
filenames = [
    'cellfreerna.csv',
    'plasmaluminex.csv',
    'serumluminex.csv',
    'microbiome.csv',
    'immunesystem.csv',
    'metabolomics.csv',
    'plasmasomalogic.csv',
]

In [3]:
# Read every CSV into `datasets`, keyed by the file name up to its first '.'.
# The `featureweeks` column is stored once under 'response' (taken from the
# first file read) and removed from each feature table.
datasets = {}
for file in filenames:
    label = file[:file.find('.')]
    table = pd.read_csv(file).rename(columns={'Unnamed: 0':'PatientID'})
    table = table.set_index('PatientID', drop=True)
    if 'response' not in datasets:
        datasets['response'] = table['featureweeks']
    datasets[label] = table.drop('featureweeks', axis=1)
    print('Successfully loaded the', label, 'dataset' )

Successfully loaded the cellfreerna dataset
Successfully loaded the plasmaluminex dataset
Successfully loaded the serumluminex dataset
Successfully loaded the microbiome dataset
Successfully loaded the immunesystem dataset
Successfully loaded the metabolomics dataset
Successfully loaded the plasmasomalogic dataset


Some time after my initial exploration of the data, I noticed that the datasets do not all share the same index. For most datasets the index is formatted as PTLG00X_1 through PTLG00X_4, where X represents the patient's id and the number after the underscore represents the trimester. For other datasets, however, it is formatted as PTLG00X_BL followed by PTLG00X_1 through PTLG00X_3.

In [4]:
# Show the row labels of the cellfreerna table (suffixes _1 through _4 in the output below).
datasets['cellfreerna'].index

Index(['PTLG002_1', 'PTLG003_1', 'PTLG004_1', 'PTLG005_1', 'PTLG007_1',
       'PTLG008_1', 'PTLG009_1', 'PTLG010_1', 'PTLG012_1', 'PTLG015_1',
       'PTLG016_1', 'PTLG018_1', 'PTLG019_1', 'PTLG020_1', 'PTLG022_1',
       'PTLG024_1', 'PTLG029_1', 'PTLG002_2', 'PTLG003_2', 'PTLG004_2',
       'PTLG005_2', 'PTLG007_2', 'PTLG008_2', 'PTLG009_2', 'PTLG010_2',
       'PTLG012_2', 'PTLG015_2', 'PTLG016_2', 'PTLG018_2', 'PTLG019_2',
       'PTLG020_2', 'PTLG022_2', 'PTLG024_2', 'PTLG029_2', 'PTLG002_3',
       'PTLG003_3', 'PTLG004_3', 'PTLG005_3', 'PTLG007_3', 'PTLG008_3',
       'PTLG009_3', 'PTLG010_3', 'PTLG012_3', 'PTLG015_3', 'PTLG016_3',
       'PTLG018_3', 'PTLG019_3', 'PTLG020_3', 'PTLG022_3', 'PTLG024_3',
       'PTLG029_3', 'PTLG002_4', 'PTLG003_4', 'PTLG004_4', 'PTLG005_4',
       'PTLG007_4', 'PTLG008_4', 'PTLG009_4', 'PTLG010_4', 'PTLG012_4',
       'PTLG015_4', 'PTLG016_4', 'PTLG018_4', 'PTLG019_4', 'PTLG020_4',
       'PTLG022_4', 'PTLG024_4', 'PTLG029_4'],
      dtype='obje

In [5]:
# Show the row labels of the immunesystem table (suffixes _BL and _1 through _3 in the output below).
datasets['immunesystem'].index

Index(['PTLG002_BL', 'PTLG003_BL', 'PTLG004_BL', 'PTLG005_BL', 'PTLG007_BL',
       'PTLG008_BL', 'PTLG009_BL', 'PTLG010_BL', 'PTLG012_BL', 'PTLG015_BL',
       'PTLG016_BL', 'PTLG018_BL', 'PTLG019_BL', 'PTLG020_BL', 'PTLG022_BL',
       'PTLG024_BL', 'PTLG029_BL', 'PTLG002_1', 'PTLG003_1', 'PTLG004_1',
       'PTLG005_1', 'PTLG007_1', 'PTLG008_1', 'PTLG009_1', 'PTLG010_1',
       'PTLG012_1', 'PTLG015_1', 'PTLG016_1', 'PTLG018_1', 'PTLG019_1',
       'PTLG020_1', 'PTLG022_1', 'PTLG024_1', 'PTLG029_1', 'PTLG002_2',
       'PTLG003_2', 'PTLG004_2', 'PTLG005_2', 'PTLG007_2', 'PTLG008_2',
       'PTLG009_2', 'PTLG010_2', 'PTLG012_2', 'PTLG015_2', 'PTLG016_2',
       'PTLG018_2', 'PTLG019_2', 'PTLG020_2', 'PTLG022_2', 'PTLG024_2',
       'PTLG029_2', 'PTLG002_3', 'PTLG003_3', 'PTLG004_3', 'PTLG005_3',
       'PTLG007_3', 'PTLG008_3', 'PTLG009_3', 'PTLG010_3', 'PTLG012_3',
       'PTLG015_3', 'PTLG016_3', 'PTLG018_3', 'PTLG019_3', 'PTLG020_3',
       'PTLG022_3', 'PTLG024_3', 'PTLG029_3'],


Since this data was loaded with the author's function, I was rather confused. 

Upon further investigation, it seems that there are some discrepancies between the labels and the values of an R vector. Based on intuition, the values of the response, and the author's code, I hesitantly conclude that each dataset is correctly ordered, so I will simply change the indices to PTLG00X_1 through PTLG00X_4. In practice I would most likely contact the author, as this could severely affect my results if incorrect. I make this change below:

In [6]:
# Use the cellfreerna row labels (PTLG00X_1 .. PTLG00X_4) as the reference index
# and relabel every entry of `datasets` whose index differs from it.
correctIndex = datasets['cellfreerna'].index
for name, data in datasets.items():
    # Compare the complete index. `Index.any()` collapses a whole index to a
    # single truthy value, so the previous `a.any() != b.any()` test never
    # compared the two indices label by label.
    if not data.index.equals(correctIndex):
        # Relabelling is positional: the rows are assumed to already be in the
        # same order in every dataset, so the lengths must agree.
        if len(data.index) != len(correctIndex):
            raise ValueError('{} has {} rows, expected {}'.format(name, len(data.index), len(correctIndex)))
        data.index = correctIndex

Below are the shapes for each dataset.

In [7]:
# Report the shape of every entry of `datasets`.
for name, data in datasets.items():
    print('{} has dimensions {}'.format(name, data.shape))

response has dimensions (68,)
cellfreerna has dimensions (68, 37101)
plasmaluminex has dimensions (68, 62)
serumluminex has dimensions (68, 62)
microbiome has dimensions (68, 18548)
immunesystem has dimensions (68, 534)
metabolomics has dimensions (68, 3485)
plasmasomalogic has dimensions (68, 1300)


## Duplicate Data Investigation

Let's check for duplicates in the rest of the datasets:

In [8]:
def checkDuplicates(data, name):
    '''
    Print how many rows and how many columns of `data` are exact duplicates.

    data: pandas DataFrame to inspect
    name: label of the dataset, used only in the printed message
    Returns None; the counts are printed, not returned.
    '''
    duplicate_rows = data.duplicated().sum()
    # Columns are compared by transposing so that they become rows.
    duplicate_cols = data.T.duplicated().sum()
    message = 'There are {} duplicate rows and {} duplicate columns in the {} dataset'
    print(message.format(duplicate_rows, duplicate_cols, name))

In [9]:
# Run the duplicate check on every feature table; the 'response' entry is skipped.
for name, data in datasets.items():
    if name != 'response':
        checkDuplicates(data, name)

There are 7 duplicate rows and 7184 duplicate columns in the cellfreerna dataset
There are 0 duplicate rows and 0 duplicate columns in the plasmaluminex dataset
There are 6 duplicate rows and 0 duplicate columns in the serumluminex dataset
There are 29 duplicate rows and 16627 duplicate columns in the microbiome dataset
There are 0 duplicate rows and 0 duplicate columns in the immunesystem dataset
There are 0 duplicate rows and 0 duplicate columns in the metabolomics dataset
There are 3 duplicate rows and 0 duplicate columns in the plasmasomalogic dataset


I am wary of removing duplicate rows due to my lack of domain expertise, though I will take note that nearly half of the rows in the microbiome dataset are duplicated. However I do believe it is safe to remove duplicate columns as I believe they do not provide any extra information:

In [10]:
# Drop columns that are exact copies of an earlier column (the first occurrence
# is kept). A boolean column mask replaces transpose().drop_duplicates().transpose():
# it keeps the same columns, avoids materialising extra transposed copies of
# these very wide tables, and leaves the dtypes of the surviving columns untouched.
for name in ['cellfreerna', 'microbiome']:
    duplicated_columns = datasets[name].transpose().duplicated().values
    datasets[name] = datasets[name].loc[:, ~duplicated_columns]

## Feature Selection

### Sparsity

First I will check for and remove extremely sparse features, as the data loading function written by the author only excludes completely sparse features.

In [12]:
def sparseFeatures(data, threshold, lengthOnly = False, name = 'dataset'):
    '''
    Find the columns of `data` whose share of exact zeros exceeds `threshold`.

    data: pandas DataFrame object
    threshold: fraction (e.g. 0.8, not 80) of zero values within a feature that
               must be exceeded for the feature to qualify as sparse
    lengthOnly: if True, return only the number of sparse features
    name: dataset label used in the message printed when nothing is sparse
    Returns a nested list [[column, zeroCount], ...] of sparse features and their
    respective zero counts, or the number of such features when lengthOnly is True.
    When there are no sparse features a message is printed and [] is returned
    (0 when lengthOnly is True).
    '''
    n = data.shape[0]
    # One vectorised pass over the frame instead of a Python-level sum per column;
    # pairing by position also keeps duplicate column names working.
    zeroCounts = (data == 0).sum(axis=0)
    sparsefeatures = [[column, int(count)]
                      for column, count in zip(data.columns, zeroCounts.values)
                      if count > threshold*n]
    if len(sparsefeatures) == 0:
        print('There are no sparse features in {} dataset'.format(name))
        # Keep the return type consistent with lengthOnly (previously always []).
        return 0 if lengthOnly else []
    if lengthOnly:
        return len(sparsefeatures)
    return sparsefeatures

In [13]:
# Remove from every feature table the columns that are zero in more than 80%
# of the rows; the 'response' entry is skipped.
for name, data in datasets.items():
    if name == 'response':
        continue
    sf = sparseFeatures(data, 0.8, name=name)
    if sf:
        print('{} dataset has {} sparse features. These will be removed.'.format(name, len(sf)))
        sparse_columns = [feature for feature, _ in sf]
        datasets[name] = data.drop(sparse_columns, axis = 1)

cellfreerna dataset has 11718 sparse features. These will be removed.
There are no sparse features in plasmaluminex dataset
There are no sparse features in serumluminex dataset
There are no sparse features in microbiome dataset
There are no sparse features in immunesystem dataset
There are no sparse features in metabolomics dataset
There are no sparse features in plasmasomalogic dataset


As you can see the cellfreerna dataset had nearly 12 thousand features each having more than 80% zeros.

### Variance Thresholding

Let's investigate the variance of these features:

In [14]:
from sklearn.feature_selection import VarianceThreshold

In [15]:
# Separate the target from the feature tables. The guard keeps the cell
# re-runnable: once the key has been popped, an unconditional
# datasets.pop('response', None) would overwrite `response` with None.
if 'response' in datasets:
    response = datasets.pop('response')

In [16]:
# Flag near-constant features. `selected` maps a dataset name to a boolean
# mask (one entry per column) that is True where the variance reported by
# VarianceThreshold is at most VARIANCE_CUTOFF. Datasets without any such
# feature get no entry.
VARIANCE_CUTOFF = 0.0001
selected = {}
for name, data in datasets.items():
    selector = VarianceThreshold()
    selector.fit(data)
    lowVariance = selector.variances_ <= VARIANCE_CUTOFF
    if lowVariance.any():
        selected[name] = lowVariance
        # The message used to say 0.00001 although the cutoff applied is 0.0001.
        print('The {} dataset has {} features with a variance of at most {}'.format(name, lowVariance.sum(), VARIANCE_CUTOFF))

The microbiome dataset has 703 features with a variance less than 0.00001
The immunesystem dataset has 10 features with a variance less than 0.00001


With such a low variance, they most likely do not contribute much to the response. Let's drop those:

In [17]:
# Drop the flagged low-variance columns from every dataset that has an entry in
# `selected`, instead of hard-coding the two names reported by one particular run.
for name, lowVariance in selected.items():
    datasets[name] = datasets[name].loc[:, ~lowVariance]

### Feature Selection via Random Forest

In [18]:
# Keep an independent deep copy of the cleaned tables; `datasets` itself is
# truncated and reduced to the selected features in the cells that follow.
datasets1 = copy.deepcopy(datasets)

In [19]:
# Keep only the first 51 rows of every table and of the response.
# NOTE(review): 51 looks like 17 patients x 3 timepoints, i.e. everything except
# the *_4 rows at the end of the index — confirm.
for name in list(datasets):
    datasets[name] = datasets[name].iloc[:51, :]
response = response.iloc[:51]

In [20]:
# Select features per dataset with a random forest that is evaluated by
# leave-one-patient-out cross-validation.
#   selectedFeatures[name] -> Index of the retained column names
#   selectedFrom[name]     -> list of (feature, importance) pairs for every column
RANDOM_STATE = 0  # fixed seed so that the selected features are reproducible
params = {
    "n_estimators" : [10], #,50,100],
    # None uses all features at each split, which is what "auto" meant for a
    # regressor; "auto" itself is rejected by recent scikit-learn releases.
    "max_features" : [None], #, "log2", "sqrt"], ## grid removed due to computational limitations
    "bootstrap"    : [True] #, False]
}
selectedFeatures = {}
selectedFrom = {}
for name, data in datasets.items():
    X = data
    y = response

    #LOPOCV: one group per patient, taken from the part of each row label
    # before the underscore (e.g. 'PTLG002' in 'PTLG002_1')
    groups = np.asarray(X.index.str.split('_').str[0])
    cv = list(LeaveOneGroupOut().split(X, y, groups=groups))

    #Tuning Model
    # `iid` is no longer passed: recent scikit-learn releases do not accept it,
    # and it cannot change the winner while the grid holds a single combination.
    rf = RandomForestRegressor(random_state=RANDOM_STATE)
    grid = GridSearchCV(rf, params, scoring='neg_mean_squared_error', cv=cv)
    grid.fit(X, y)
    rf_tuned = grid.best_estimator_
    selectedfrom = list(zip(X.columns, rf_tuned.feature_importances_))

    # cellfreerna uses a fixed importance cutoff, every other dataset the mean importance.
    threshold = 0.00009 if name == 'cellfreerna' else 'mean'
    # prefit=True makes the selector read the importances of rf_tuned itself.
    # Without it, fit() trained a fresh clone of the forest, so the selection
    # did not correspond to the importances stored in selectedFrom.
    sfm = SelectFromModel(rf_tuned, threshold=threshold, prefit=True)
    selected = X.columns[sfm.get_support()]
    selectedFeatures[name] = selected
    selectedFrom[name] = selectedfrom
    print('{} features were selected for the {} dataset.'.format(len(selected),name))

113 features were selected for the cellfreerna dataset.
14 features were selected for the plasmaluminex dataset.
19 features were selected for the serumluminex dataset.
55 features were selected for the microbiome dataset.
57 features were selected for the immunesystem dataset.
75 features were selected for the metabolomics dataset.
40 features were selected for the plasmasomalogic dataset.


In [21]:
# Reduce every table to the columns chosen for it in `selectedFeatures`.
for name in datasets:
    keep = selectedFeatures[name]
    datasets[name] = datasets[name].loc[:, keep]

In [22]:
# Persist the reduced feature tables. The context manager closes the file even
# if pickling raises, which the explicit open()/close() pair did not guarantee.
with open("datasets_selected.pkl","wb") as pickle_out:
    pickle.dump(datasets, pickle_out)

In [23]:
# Persist the (truncated) response. The context manager closes the file even
# if pickling raises, which the explicit open()/close() pair did not guarantee.
with open("response.pkl","wb") as pickle_out:
    pickle.dump(response, pickle_out)