# Importing Packages

In [None]:
import json
import os

In [None]:
from qiime2.plugins import feature_table
from qiime2 import Artifact
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import biom
import seaborn as sns

## Importing Data

In [None]:
# Read the JSON config file; the later cells look up data file paths in it by key
with open("config/data-params.json") as fh:
    file_paths = json.load(fh)

In [None]:
# Load the QIIME 2 artifact and the tab-separated metadata table
# (the first column of the metadata file becomes the DataFrame index).
# NOTE(review): this assignment rebinds `feature_table`, shadowing the plugin
# imported with `from qiime2.plugins import feature_table`; the plugin is no
# longer reachable under that name after this cell runs.
feature_table = Artifact.load(file_paths["feature_table_path"])
metadata = pd.read_csv(file_paths["metadata_path"], sep='\t', index_col=0)

In [None]:
# View the loaded artifact as a biom.Table and print the result of its head() call
biom_table = feature_table.view(biom.Table)
print(biom_table.head())

In [None]:
metadata.head()

## Missing Data Analysis

In [None]:
def missing_values(col, type='categorical'):
    """Return a new Series in which placeholder strings are replaced by NaN.

    Entries equal to 'not applicable' or 'not provided' become ``np.nan``.
    When ``type`` is 'numeric', every remaining non-null entry is additionally
    converted with ``np.float64``; for any other ``type`` the remaining
    entries are left as they are.
    """
    def _blank_placeholder(value):
        # Both placeholder spellings collapse to a single missing marker.
        if value == 'not applicable' or value == 'not provided':
            return np.nan
        return value

    def _as_float(value):
        # Leave missing entries alone; convert everything else to float64.
        return value if pd.isnull(value) else np.float64(value)

    cleaned = col.apply(_blank_placeholder)
    if type != 'numeric':
        return cleaned
    return cleaned.apply(_as_float)

In [None]:
# Candidate metadata columns for the analysis -- more to be added
# ('host_age' is currently left out).
features = [
    'abdominal_obesity_ncep_v2',
    'ckd_v2',
    'diabetes2_v2',
    'precvd_v2',
    'elevated_bp_selfmeds_v2',
    'high_total_chol2_v2',
    'gender_v2',
    'agegroup_c6_v2',
    'bmigrp_c6_v2',
    'center',
    'us_born_v2',
    'income_c5_v2',
]

In [None]:
metadata['abdominal_obesity_ncep_v2'].value_counts()

In [None]:
metadata['ckd_v2'].value_counts()

In [None]:
missing_values(metadata['ckd_v2'],'numeric').value_counts()

# Subset of Metadata

In [None]:
# Take an explicit copy of the selected columns. Later cells assign into
# sub_metadata, and assigning into a plain selection from `metadata` raises
# pandas' SettingWithCopyWarning because it is ambiguous whether the write
# targets a view or a copy.
sub_metadata = metadata[features].copy()
sub_metadata.head()

In [None]:
# Columns whose values are converted to floats when placeholders are cleaned
numeric_col = [
    'abdominal_obesity_ncep_v2',
    'ckd_v2',
    'diabetes2_v2',
    'precvd_v2',
    'elevated_bp_selfmeds_v2',
    'high_total_chol2_v2',
    'agegroup_c6_v2',
    'bmigrp_c6_v2',
    'us_born_v2',
    'income_c5_v2',
]
# Columns whose values are kept as-is apart from the missing-value placeholders
categorical_col = [
    'center',
    'gender_v2',
]

## Defining missing values

In [None]:
# Replace the missing-value placeholders column by column.
# Plain column assignment is used instead of `.loc[:, cols] = ...`: since
# pandas 2.0 the `.loc` form first tries to write into the existing columns
# in place, so columns read as object dtype would stay object even after
# their values were converted to floats. Assigning by column label replaces
# the columns, letting the numeric ones take the float dtype.
sub_metadata[numeric_col] = sub_metadata[numeric_col].apply(
    lambda column: missing_values(column, 'numeric')
)
sub_metadata[categorical_col] = sub_metadata[categorical_col].apply(
    lambda column: missing_values(column, 'categorical')
)

In [None]:
sub_metadata.isnull().sum() * 100 / len(sub_metadata) # calculates percent of nan in column


## EDA on Subset of Metadata

In [None]:
metadata['ckd_v2'].value_counts()

In [None]:
sub_metadata['ckd_v2'].value_counts()

In [None]:
sub_metadata.describe()

### Disease Counts

In [None]:
def create_bar_col_binary(df, col_name):
    """Plot a horizontal bar chart of the value counts of ``df[col_name]``.

    Missing entries are replaced with -1 before counting so that they are
    counted as their own category. The counts are sorted by value in
    descending index order before plotting. Nothing is returned.
    """
    filled = df[col_name].fillna(-1)
    counts = filled.value_counts().sort_index(ascending=False)
    axes = counts.plot(kind='barh')
    axes.set_title(col_name)
    axes.set_xlabel('count')
    axes.set_ylabel('outcome')

In [None]:
create_bar_col_binary(sub_metadata, 'ckd_v2')

### Gender Counts

In [None]:
sub_metadata['gender_v2'].fillna('missing').value_counts().plot(kind='barh')

### Center Counts

In [None]:
sub_metadata['center'].fillna('missing').value_counts().plot(kind='barh')

### Co-occurrence

In [None]:
sub_metadata

In [None]:
# Keep only the rows without any missing value, then show the fraction of
# rows that remain.
sub_metadata_no_nan = sub_metadata.dropna()
len(sub_metadata_no_nan) / len(sub_metadata)

In [None]:
# Map every numeric column name to the float64 dtype (used with DataFrame.astype)
convert_dict = dict.fromkeys(numeric_col, np.float64)

In [None]:
sub_metadata_no_nan = sub_metadata_no_nan.astype(convert_dict)

In [None]:
# Columns used for the disease co-occurrence matrix
disease_col = [
    'abdominal_obesity_ncep_v2',
    'ckd_v2',
    'diabetes2_v2',
    'precvd_v2',
    'elevated_bp_selfmeds_v2',
    'high_total_chol2_v2',
]

In [None]:
# Product X^T . X of the two selected columns, using only the rows where
# neither column is NaN.
# NOTE(review): if both columns are coded 0/1, entry (i, j) is the number of
# rows where both are 1 -- confirm the coding against the data dictionary.
sub_metadata[['ckd_v2','abdominal_obesity_ncep_v2']].dropna().T.dot(sub_metadata[['ckd_v2','abdominal_obesity_ncep_v2']].dropna())

In [None]:
# Heatmap of X^T . X over all disease columns, computed on the rows that have
# no missing values (sub_metadata_no_nan).
sns.heatmap(sub_metadata_no_nan[disease_col].T.dot(sub_metadata_no_nan[disease_col]))

# Model Creation

In [None]:
def binary_to_tf(val):
    """Translate a binary indicator into a label.

    Returns 'T' when ``val`` equals 1.0, 'F' when it equals 0.0, and
    'missing' for anything else (including NaN).
    """
    if val == 1.0:
        return 'T'
    return 'F' if val == 0.0 else 'missing'

In [None]:
# Recode the abdominal-obesity indicator to the labels 'T' / 'F' / 'missing'
sub_metadata['abdominal_obesity_ncep_v2'] = sub_metadata['abdominal_obesity_ncep_v2'].apply(binary_to_tf)

In [None]:
# Write the cleaned metadata subset as a tab-separated file.
# to_csv does not create missing parent directories, so make sure the
# output directory exists first (no error if it is already there).
os.makedirs("data/temp", exist_ok=True)
sub_metadata.to_csv("data/temp/updated_metadata.tsv", sep="\t")

Convert the DataFrame to QIIME 2 metadata.

In [None]:
import os

### Qiime metadata

The following cells are only needed when using the QIIME 2 Artifact API.

In [None]:
from qiime2 import Metadata

In [None]:
# Load the TSV written to data/temp earlier in this notebook as a QIIME 2
# Metadata object, then save it under a new file name.
# NOTE(review): Metadata.save appears to write a TSV file rather than a .qza
# artifact -- confirm that the 'qiime_metadata.qza' file name is intended.
qiime_metadata = Metadata.load("data/temp/updated_metadata.tsv")
qiime_metadata.save('qiime_metadata.qza')