# DSC180b Capstone Project

## Importing Packages

In [None]:
import json
import os

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# import torch as t  # TODO: this import fails (module not found) in the current environment; cause unknown, possibly a memory issue
# import skmultilearn

# import biom
# from qiime2.plugins import feature_table
# from qiime2 import Artifact
# from qiime2.plugins.metadata.methods import distance_matrix


## Loading Data

In [None]:
## Obtaining file paths
# Parse the JSON data-parameter config into `file_paths`.
with open("config/data-params.json") as config_file:
    file_paths = json.load(config_file)

In [None]:
# Read the raw feature table, then use its 'Unnamed: 0' column as the row index.
raw_feature_table = pd.read_csv('data/raw/feature_table.csv')
raw_feature_table = raw_feature_table.set_index('Unnamed: 0')
raw_feature_table.head()

In [None]:
# The metadata file is tab-separated; its first column becomes the row index.
metadata_read_options = {'sep': '\t', 'index_col': 0}
raw_metadata = pd.read_csv('data/raw/11666_metadata.txt', **metadata_read_options)
raw_metadata.head()

## Cleaning Metadata

Subset metadata based on existing samples in feature table

In [None]:
# Keep only the metadata rows whose labels appear in the feature table index,
# ordered like the feature table.
feature_table_samples = raw_feature_table.index
raw_metadata = raw_metadata.loc[feature_table_samples]

Keep relevant diseases (classes) and features in metadata

In [None]:
# Raw metadata column -> short name, for the disease (class) columns.
diseases_cols = dict(
    abdominal_obesity_ncep_v2='obesity',
    diabetes2_v2='diabetes',
    dyslipidemia_v2='dyslipidemia',
    hypertension2_v2='hypertension',
    ckd_v2='ckd',
    precvd_v2='precvd',
    elevated_bp_selfmeds_v2='elevated_bp',
)

# Raw metadata column -> short name, for the non-disease feature columns.
other_feature_cols = dict(
    age_v2='age',
    center='center',
    gender='gender',
    host_body_mass_index='BMI',
)

# Disease columns first, then the other features.
subset_cols = {**diseases_cols, **other_feature_cols}

# Select the raw columns of interest and give them their short names.
selected_raw_columns = raw_metadata[list(subset_cols)]
metadata = selected_raw_columns.rename(columns=subset_cols)
metadata

Remove missing values

In [None]:
# A row counts as missing when ANY of its columns is NaN or holds one of the
# placeholder strings. The placeholder checks previously used .all(axis=1),
# which only matched rows where every column was a placeholder, so rows with a
# single 'not applicable' / 'not provided' value were kept.
nan_rows = metadata[metadata.isna().any(axis=1)].index
na_rows = metadata[(metadata == 'not applicable').any(axis=1)].index
np_rows = metadata[(metadata == 'not provided').any(axis=1)].index

# A row may match more than one condition, so labels can repeat here;
# drop() removes each listed label regardless.
drop_rows = np.concatenate((nan_rows, na_rows, np_rows))

filtered_metadata = metadata.drop(drop_rows)
filtered_metadata

Map the values in each class to binary: a 1 in a column means the sample has the corresponding disease, and a 0 means it does not.

In [None]:
# Take the renamed disease columns and cast them to integers.
disease_labels = list(diseases_cols.values())
disease_metadata = filtered_metadata[disease_labels].astype(int)

# Print the distinct codes found in each disease column.
for col, column_values in disease_metadata.items():
    print(col, column_values.unique())

In [None]:
## DISEASE VALUE MAPS

# diabetes: codes 1 and 2 -> 0, code 3 -> 1
diabetes_binary = {1: 0, 2: 0, 3: 1}
# ckd: code 1 -> 0, codes 2 through 5 -> 1
ckd_binary = {code: int(code > 1) for code in range(1, 6)}

# Recode the two multi-valued columns in place; Series.map turns any value
# that is not a key of the mapping into NaN.
for column_name, value_map in (('diabetes', diabetes_binary), ('ckd', ckd_binary)):
    disease_metadata[column_name] = disease_metadata[column_name].map(value_map)

# `diseases` is another name for the same DataFrame object, not a copy.
diseases = disease_metadata
diseases.head()

## Cleaning Feature Table

Subset feature table based on existing samples in cleaned metadata

In [None]:
# Keep only the feature-table rows whose labels remain in the cleaned disease table.
cleaned_samples = diseases.index
raw_feature_table = raw_feature_table.loc[cleaned_samples]

In [None]:
# Sum every column over all rows, then count how often each total occurs.
column_totals = raw_feature_table.sum(axis=0)
column_totals.value_counts()

Remove noise by keeping only the sequences with more than 100,000 total reads (summed across all samples)

In [None]:
# Minimum column total (exclusive) a column needs in order to be kept.
threshold = 100_000

# Boolean mask over the columns: True where the column's total exceeds the threshold.
filtered_cols = raw_feature_table.sum(axis=0) > threshold

# Select the columns directly with the mask. The previous
# `.T.loc[filtered_cols].T` transposed the whole table twice, which costs two
# full copies and can change column dtypes when they are not all identical.
features = raw_feature_table.loc[:, filtered_cols]
features

## Metadata EDA

In [None]:
# New frame holding the disease columns plus a per-row total
# (how many diseases each sample has).
eda_df = diseases.assign(total_diseases=diseases.sum(axis=1))
eda_df.tail()

In [None]:
## DISEASE PER SAMPLE COUNTS

# Number of samples for each distinct value of `total_diseases`.
multiple_disease_counts = eda_df['total_diseases'].value_counts()

fig, ax = plt.subplots(figsize=(10, 5))

# Set the title and axis labels in one call, then draw the bars.
ax.set(
    title='Disease per Sample Counts',
    ylabel='Number of Samples',
    xlabel='Number of Diseases',
)
ax.bar(multiple_disease_counts.index, multiple_disease_counts.values)


In [None]:
## SIZE OF EACH CLASS (DISEASE)

# Column sums over every column except the `total_diseases` helper column.
is_disease_column = eda_df.columns != 'total_diseases'
disease_counts = eda_df.loc[:, is_disease_column].sum(axis=0)

fig, ax = plt.subplots(figsize=(15, 5))

# Set the title and axis labels in one call, then draw the bars.
ax.set(
    title='Samples per Disease',
    ylabel='Number of Samples',
    xlabel='Diseases',
)
ax.bar(disease_counts.index, disease_counts.values)

## Data Preparation for Machine Learning

In [None]:
## In case we use pytorch

# The file-level `import torch as t` is commented out, so `t` was undefined
# here and this cell raised NameError. Import it locally and fall back
# gracefully when PyTorch cannot be imported.
try:
    import torch as t
except ImportError:
    t = None
    device = None
    print('PyTorch is not available; skipping device setup.')
else:
    # setting device on GPU if available, else CPU
    device = t.device('cuda' if t.cuda.is_available() else 'cpu')
    print('Using device:', device)
    print()

    # Additional info when using cuda
    if device.type == 'cuda':
        print(t.cuda.get_device_name(0))
        print('Memory Usage:')
        # Byte counts converted to GB (divide by 1024**3), rounded to 1 decimal.
        print('Allocated:', round(t.cuda.memory_allocated(0)/1024**3,1), 'GB')
        print('Cached:   ', round(t.cuda.memory_reserved(0)/1024**3,1), 'GB')

## Machine Learning Models

### Binary Relevance

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
def init_gbc_model(loss='exponential', learning_rate=0.1, n_estimators=150,
                   max_depth=3, random_state=0):
    """Function for initializing an unfitted GradientBoostingClassifier

    Every argument is forwarded unchanged to the GradientBoostingClassifier
    constructor. Calling the function with no arguments gives the same
    configuration that was previously hard-coded.

    Args:
        loss (str): Loss function to optimize
        learning_rate (float): Learning rate
        n_estimators (int): Number of boosting stages
        max_depth (int): Maximum depth of the individual estimators
        random_state (int): random_state

    Returns:
        GradientBoostingClassifier: Initialized (unfitted) classifier
    """
    return GradientBoostingClassifier(loss=loss,
                                      learning_rate=learning_rate,
                                      n_estimators=n_estimators,
                                      max_depth=max_depth,
                                      random_state=random_state)



def init_skf(n_splits=10, shuffle=True, skf_random_state=0):
    """Function for initializing the StratifiedKFold cross-validator

    Calling the function with no arguments gives the same configuration that
    was previously hard-coded (10 shuffled splits, random_state 0).

    Args:
        n_splits (int): Number of splits to separate data into
        shuffle (boolean): Whether to shuffle data before splitting
        skf_random_state (int): random_state; only used when shuffle is True

    Returns:
        StratifiedKFold: Initialized StratifiedKFold cross-validator
    """
    # StratifiedKFold rejects a random_state when shuffle is False, so only
    # forward the seed when shuffling is actually requested.
    random_state = skf_random_state if shuffle else None

    return StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

# TODO: maybe build a hyperparameter-tuning pipeline later on

In [None]:
# Binary relevance: train one independent classifier per disease column.
# For each disease, keep the fold model with the highest validation accuracy
# and record that accuracy.
classifiers = {}  # disease name -> fitted classifier from the best fold
metrics = {}      # disease name -> validation accuracy of that classifier

X = features
for disease in diseases:
    skf = init_skf()

    y = diseases[disease]

    best_model = None
    best_acc = float('-inf')

    print('Training {} Classifier...'.format(disease))
    for split_num, (train_index, val_index) in enumerate(skf.split(X, y), start=1):

        train_X, train_y = X.iloc[train_index], y.iloc[train_index]
        val_X, val_y = X.iloc[val_index], y.iloc[val_index]

        # Build a fresh estimator for every fold. Previously one instance was
        # refit on each fold, so `best_model` pointed at that single object
        # and always ended up as the model from the LAST fold, whichever fold
        # had scored best.
        clf = init_gbc_model()
        clf.fit(train_X, train_y)

        preds = clf.predict(val_X)
#         preds = clf.predict_proba(val_X)[:,1] #predict probability of positive class predict

        # Fraction of validation samples predicted correctly.
        acc = np.mean(preds==val_y)

        # `>=` means a later fold wins ties.
        if acc >= best_acc:
            best_model = clf
            best_acc = acc

        print('Finished training split {}'.format(split_num))

    classifiers[disease] = best_model
    metrics[disease] = best_acc


In [None]:
# Display the `classifiers` dict (disease name -> classifier stored by the training loop).
classifiers

In [None]:
# Display the `metrics` dict (disease name -> best validation accuracy recorded by the training loop).
metrics

### Classifier Chain

### Label Powerset

### Adapted Algorithms