# Importing Packages

In [None]:
import json
import os

In [None]:
from qiime2.plugins import feature_table
from qiime2 import Artifact
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import biom
from qiime2.plugins.metadata.methods import distance_matrix
import seaborn as sns

## Importing Data

In [None]:
# Read the JSON config that holds the data file paths used below.
# The encoding is given explicitly so the read does not depend on the
# platform's locale default.
with open("config/data-params.json", encoding="utf-8") as fh:
    file_paths = json.load(fh)

In [None]:
# Load the artifact at "feature_table_path" and the tab-separated metadata at
# "metadata_path" (its first column becomes the index).
# NOTE(review): this rebinds `feature_table`, shadowing the
# `qiime2.plugins.feature_table` plugin imported earlier in this file; the
# plugin is no longer reachable under that name once this cell has run.
feature_table = Artifact.load(file_paths["feature_table_path"])
metadata = pd.read_csv(file_paths["metadata_path"], sep='\t', index_col=0)

In [None]:
# View the loaded artifact as a biom.Table and print the result of its head() call.
biom_table = feature_table.view(biom.Table)
print(biom_table.head())

In [None]:
# Preview the first rows of the metadata table.
metadata.head()

## Missing Data Analysis

In [None]:
def missing_values(col, type='categorical'):
    """Return a copy of *col* with missing-value placeholders replaced by NaN.

    The strings 'not applicable' and 'not provided' are both mapped to
    ``np.nan`` so that missing data has a single representation.

    Parameters
    ----------
    col : pandas.Series
        Column to clean. It is not modified.
    type : str, default 'categorical'
        When equal to 'numeric', the cleaned column is additionally cast to
        float64 (NaN cannot be stored in an integer column). Any other value
        leaves the remaining entries as they are.

    Returns
    -------
    pandas.Series
        Cleaned column with the same index as *col*.

    Raises
    ------
    ValueError
        If ``type == 'numeric'`` and a remaining entry cannot be parsed as a
        float.
    """
    # NOTE: `type` shadows the builtin; the name is kept because it is part
    # of the function's signature.
    placeholders = ['not applicable', 'not provided']
    # Vectorized replacement: mask() puts NaN wherever the condition is True.
    temp = col.mask(col.isin(placeholders))
    if type == 'numeric':
        temp = temp.astype(np.float64)
    return temp

In [None]:
# Frequency of each value in 'abdominal_obesity_ncep_v2' after numeric cleaning.
# value_counts() leaves NaN out by default, so missing entries are not listed.
missing_values(metadata['abdominal_obesity_ncep_v2'],'numeric').value_counts()

In [None]:
# Percentage of rows whose 'abdominal_obesity_ncep_v2' value is missing.
# `missing_values` only recognizes the string 'numeric' for numeric cleaning;
# any other `type` argument silently skips the float conversion.
missing_values(metadata['abdominal_obesity_ncep_v2'], 'numeric').isnull().sum() * 100 / len(metadata)

In [None]:
# Candidate metadata columns of interest (list is not final; more may be added):
# ['abdominal_obesity_idf_v2','abdominal_obesity_ncep_v2','anonymized_name', 'age_v2','bmi_v2','center','ckd2',
# 'diabetes2_v2','dm_aware_v2','dyslipidemia_v2','education_c2_v1','elevated_bp_selfmeds_v2','gender','host_age',
# 'host_body_mass_index','hypertension2_v2','placeofbirth_group','precvd','us_born_v2',]

## Subset of Metadata

In [None]:
# Keep only the columns analyzed below. `.copy()` makes the subset an
# independent DataFrame, so modifying it later does not raise
# SettingWithCopyWarning for a selection taken from `metadata`.
sub_metadata = metadata[[
    'abdominal_obesity_idf_v2',
    'age_v2',
    'center',
    'ckd2',
    'diabetes2_v2',
    'gender',
    'host_body_mass_index',
    'dyslipidemia_v2',
    'elevated_bp_selfmeds_v2',
    'hypertension2_v2',
    'precvd',
]].copy()
sub_metadata.head()

In [None]:
# Columns of the metadata subset that are cleaned as numeric values.
numeric_col = [
    'abdominal_obesity_idf_v2',
    'age_v2',
    'ckd2',
    'diabetes2_v2',
    'host_body_mass_index',
    'dyslipidemia_v2',
    'elevated_bp_selfmeds_v2',
    'hypertension2_v2',
    'precvd',
]
# Columns of the metadata subset that are cleaned as categorical values.
categorical_col = [
    'center',
    'gender',
]

## Defining Missing Values

In [None]:
# Normalize the missing-value placeholders column by column.
# `assign` builds a new DataFrame, so every column takes the dtype returned by
# `missing_values` (float64 for the numeric columns). Writing through
# `.loc[:, cols] = ...` instead sets values into the existing columns in
# recent pandas versions, which can leave them as object dtype, and it raises
# SettingWithCopyWarning when the frame is a selection from another one.
sub_metadata = sub_metadata.assign(
    **{col: missing_values(sub_metadata[col], 'numeric') for col in numeric_col},
    **{col: missing_values(sub_metadata[col], 'categorical') for col in categorical_col},
)

In [None]:
# Percentage of missing (NaN) values in each column of the subset.
sub_metadata.isnull().sum() * 100 / len(sub_metadata)


## EDA on Subset of Metadata

In [None]:
# Display the metadata subset.
sub_metadata

In [None]:
def create_bar_col_binary(df, col_name):
    """Draw a horizontal bar chart of the value counts of ``df[col_name]``.

    Missing values are replaced by -1 before counting so they get their own
    bar. Bars are ordered by value, sorted in descending index order. The
    chart is titled with the column name; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column to plot.
    col_name : str
        Name of the column to summarize (judging by the function name, this
        is meant for binary outcome columns).
    """
    filled = df[col_name].fillna(-1)
    counts = filled.value_counts().sort_index(ascending=False)
    axes = counts.plot(kind='barh')
    axes.set(xlabel='count', ylabel='outcome', title=col_name)

In [None]:
# Plot the value counts of the 'dyslipidemia_v2' column.
create_bar_col_binary(sub_metadata, 'dyslipidemia_v2')