# DSC180B EDA

# Importing Packages

In [None]:
import json
from src.data import make_dataset
from src.features import build_features


In [None]:
from qiime2.plugins import feature_table
from qiime2 import Artifact
from qiime2.plugins.sample_classifier.pipelines import classify_samples
from qiime2.plugins.feature_table.methods import filter_samples
from qiime2 import Metadata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import biom
import seaborn as sns
# %matplotlib inline 

## Loading Data

In [None]:
## Obtaining file paths
# The config maps names such as "feature_table_path" and "metadata_path" to
# locations on disk. JSON is UTF-8 by specification, so the encoding is given
# explicitly instead of being left to the platform default.
with open("config/data-params.json", encoding="utf-8") as fh:
    file_paths = json.load(fh)

In [None]:
# Read the feature table and the sample metadata from the configured paths.
# NOTE(review): later cells call `table.view(biom.Table)` and index `metadata`
# by column name, so these are presumably a QIIME 2 Artifact and a pandas
# DataFrame respectively -- confirm against src.data.make_dataset.
table = make_dataset.read_feature_table(file_paths["feature_table_path"])
metadata = make_dataset.read_metadata(file_paths["metadata_path"])

In [None]:
# View the feature table as a biom.Table and print a preview of it.
biom_table = table.view(biom.Table)
print(biom_table.head())

In [None]:
# Preview the full metadata before any columns are selected.
metadata.head()

## Subset of Metadata

In [None]:
# Columns kept from the full metadata.
# Disease outcome columns:
disease_cols = [
    'abdominal_obesity_ncep_v2',
    'ckd_v2',
    'diabetes2_v2',
    'hypertension2_v2',
    'precvd_v2',
    'elevated_bp_selfmeds_v2',
    'dyslipidemia_v2',
]
# Disease outcomes first, followed by the remaining participant columns.
features = [
    *disease_cols,
    'gender_v2',
    'agegroup_c6_v2',
    'bmigrp_c6_v2',
    'center',
    'us_born_v2',
    'income_c5_v2',
]

In [None]:
# Columns that are converted to floats further down.
numeric_col = [
    'abdominal_obesity_ncep_v2',
    'ckd_v2',
    'diabetes2_v2',
    'hypertension2_v2',
    'precvd_v2',
    'elevated_bp_selfmeds_v2',
    'dyslipidemia_v2',
    'agegroup_c6_v2',
    'bmigrp_c6_v2',
    'us_born_v2',
    'income_c5_v2',
]
# Columns whose values are kept as they are, apart from missing-value cleanup.
categorical_col = [
    'center',
    'gender_v2',
]

In [None]:
# Take an explicit copy of the selected columns: later cells assign into
# `sub_metadata` with `.loc`, and writing into a bare selection of `metadata`
# triggers pandas' SettingWithCopyWarning.
sub_metadata = metadata[features].copy()
sub_metadata.head()

## Missing Data Analysis

In [None]:
def missing_values(col, type='categorical'):
    """Return ``col`` with a single representation of missing values.

    Entries equal to 'not applicable' or 'not provided' become ``np.nan``.
    When ``type`` is 'numeric', every remaining non-null entry is also
    converted with ``np.float64``; for any other ``type`` the remaining
    entries are left unchanged.
    """
    def _blank_placeholder(value):
        # Both placeholder strings collapse to NaN.
        if value == 'not applicable' or value == 'not provided':
            return np.nan
        return value

    def _as_float(value):
        # Nulls are passed through unchanged; everything else becomes a float.
        return value if pd.isnull(value) else np.float64(value)

    cleaned = col.apply(_blank_placeholder)
    if type != 'numeric':
        return cleaned
    return cleaned.apply(_as_float)

In [None]:
# Raw value counts for ckd_v2, before the placeholder strings are replaced.
metadata['ckd_v2'].value_counts()

In [None]:
# Same counts after missing_values() turns the placeholders into NaN and the
# rest into floats; value_counts() leaves NaN out by default.
missing_values(metadata['ckd_v2'],'numeric').value_counts()

## Replacing missing values in metadata subset

In [None]:
# Clean each group of columns in place. DataFrame.apply hands every column to
# missing_values and forwards the extra positional argument as its `type`.
sub_metadata.loc[:, numeric_col] = (
    sub_metadata.loc[:, numeric_col].apply(missing_values, args=('numeric',))
)
sub_metadata.loc[:, categorical_col] = (
    sub_metadata.loc[:, categorical_col].apply(missing_values, args=('categorical',))
)

In [None]:
# Check the subset after the placeholder replacement.
sub_metadata.head()

## Dropping Missing Data

In [None]:
# Keep only the rows that have no missing value in any of the selected columns
# (dropna() with its defaults drops a row if any entry is null).
sub_metadata_no_nan = sub_metadata.dropna()
sub_metadata_no_nan.head()

## Converting column dtypes

In [None]:
# Cast every numeric column to float64 in a single astype call.
convert_dict = dict.fromkeys(numeric_col, np.float64)
sub_metadata_no_nan = sub_metadata_no_nan.astype(convert_dict)

## Converting diabetes and ckd into binary variables

In [None]:
# Lookup tables that collapse the multi-level codes into 0.0/1.0 flags.
# diabetes2_v2: codes 1 and 2 -> 0.0, code 3 -> 1.0
diabetes_binary = {1.0: 0.0, 2.0: 0.0, 3.0: 1.0}
# ckd_v2: code 1 -> 0.0, codes 2 through 5 -> 1.0
ckd_binary = {1.0: 0.0, **dict.fromkeys((2.0, 3.0, 4.0, 5.0), 1.0)}

In [None]:
# Recode both columns to 0.0/1.0 using the diabetes_binary / ckd_binary lookup
# tables. The direct dict lookup raises KeyError for any value that is not a
# key of the table, so an unexpected code fails loudly instead of passing
# through unchanged.
sub_metadata_no_nan['diabetes2_v2'] = sub_metadata_no_nan['diabetes2_v2'].apply(lambda x: diabetes_binary[x])
sub_metadata_no_nan['ckd_v2'] = sub_metadata_no_nan['ckd_v2'].apply(lambda x:ckd_binary[x])

## Filtering metadata with feature table samples

In [None]:
# Preview the metadata after the binary recoding.
sub_metadata_no_nan.head()

In [None]:
# Keep only the metadata rows whose index appears among the feature table's
# ids.
# TODO(review): the original author was unsure whether the metadata should be
# filtered down to the feature table's samples at all.
# .copy() makes this an independent frame: later cells add and overwrite
# columns on `final_metadata`, which on a bare .loc selection triggers pandas'
# SettingWithCopyWarning.
final_metadata = sub_metadata_no_nan.loc[sub_metadata_no_nan.index.isin(biom_table.ids())].copy()
final_metadata.head()

## Saving final metadata

In [None]:
# have to convert to T and F for qiime2 models
def binary_to_tf(val):
    """Translate a 0/1 flag into a 'T'/'F' label.

    Returns 'T' for a value equal to 1.0, 'F' for a value equal to 0.0 and
    'missing' for anything else.
    """
    if val == 1.0:
        return 'T'
    return 'F' if val == 0.0 else 'missing'

In [None]:
# Write the filtered metadata to a tab-separated file so it can be read back
# through qiime2's Metadata loader. At this point ckd_v2 still holds the
# numeric 0.0/1.0 coding.
final_metadata.to_csv("data/temp/final_metadata.tsv",sep="\t")
## Loading Metadata as qiime Metadata object
qiime_metadata = Metadata.load("data/temp/final_metadata.tsv")

## EDA on Subset of Metadata

### Disease Counts

In [None]:
def create_bar_col_binary(df, col_name):
    """Draw a horizontal bar chart of the value counts of ``df[col_name]``.

    The counts are sorted by their index in descending order before plotting.
    The x-axis is labelled 'count', the y-axis 'outcome', and the column name
    is used as the title.
    """
    counts = df[col_name].value_counts()
    ordered = counts.sort_index(ascending=False)
    axes = ordered.plot(kind='barh')
    axes.set_title(col_name)
    axes.set_ylabel('outcome')
    axes.set_xlabel('count')

In [None]:
# Outcome counts for the abdominal_obesity_ncep_v2 column.
create_bar_col_binary(final_metadata, 'abdominal_obesity_ncep_v2')

### Gender Counts

In [None]:
# Horizontal bar chart of how often each gender_v2 value occurs.
final_metadata['gender_v2'].value_counts().plot(kind='barh')

### Center Counts

In [None]:
# Horizontal bar chart of how often each center value occurs.
final_metadata['center'].value_counts().plot(kind='barh')

### Count of individual diseases

In [None]:
# Column sums of the disease columns, largest first.
# NOTE(review): a sum equals the number of affected people only if the column
# is coded 0/1; this notebook recodes diabetes2_v2 and ckd_v2 to 0/1 but does
# not show the coding of the other disease columns -- confirm.
final_metadata[disease_cols].sum().sort_values(ascending=False).plot(kind='bar')

### Number of people who have multiple diseases

In [None]:
# Row-wise sum across the disease columns, stored as a new column.
# NOTE: the column is added to `final_metadata` itself, so it is also written
# out by the later to_csv() call on this frame.
final_metadata['total_diseases'] = final_metadata[disease_cols].sum(axis=1)
# Plot how many rows have each total ...
final_metadata['total_diseases'].value_counts().plot(kind = 'barh')
# ... and show the same counts as a table (the cell's displayed value).
final_metadata['total_diseases'].value_counts()

### Co-occurrence between disease types

In [None]:
# Heatmap of X^T X for the disease columns: entry (i, j) is the sum over rows
# of column i times column j. For 0/1-coded columns that is the number of rows
# where both are 1, with the per-column totals on the diagonal.
# NOTE(review): assumes every disease column is coded 0/1 -- confirm.
sns.heatmap(final_metadata[disease_cols].T.dot(final_metadata[disease_cols]))

### Filtering feature table samples (can probably skip this and filter features instead)

In [None]:
# Filter the feature table against the metadata that was loaded from the saved
# TSV and keep the resulting `filtered_table`.
# NOTE(review): presumably this retains only samples that appear in the
# metadata -- confirm in the q2-feature-table filter_samples docs.
updated_feature_table = filter_samples(table, metadata = qiime_metadata).filtered_table
updated_feature_table

In [None]:
# Rebind `biom_table` to the biom.Table view of the filtered feature table
# (replacing the earlier unfiltered view) and print a preview of it.
biom_table = updated_feature_table.view(biom.Table)
print(biom_table.head())

# Model Creation

In [None]:
## Need to switch to T and F for qiime models, need to move this to earlier step
final_metadata['ckd_v2'] = final_metadata['ckd_v2'].apply(lambda x: binary_to_tf(x))
final_metadata.to_csv("data/temp/final_metadata.tsv",sep="\t")
## Loading Metadata as qiime Metadata object
qiime_metadata = Metadata.load("data/temp/final_metadata.tsv")

In [None]:
# Display the reloaded qiime2 Metadata object.
qiime_metadata

### Qiime model

In [None]:
# Run the q2-sample-classifier `classify_samples` pipeline on the filtered
# feature table, with the ckd_v2 metadata column as the target.
# NOTE(review): missing_samples='ignore' presumably lets the pipeline proceed
# when the table's samples and the metadata column do not match exactly --
# confirm the exact semantics in the q2-sample-classifier docs.
# The pipeline's results are not bound to a name here; they are only the
# cell's displayed value.
classify_samples(updated_feature_table, qiime_metadata.get_column('ckd_v2'), missing_samples='ignore')