In [1]:
import json
import os

from qiime2.plugins import feature_table
from qiime2 import Artifact
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import biom
import seaborn as sns

In [2]:
## Obtaining file paths
with open("config/data-params.json") as fh:
    file_paths = json.load(fh)

feature_table = pd.read_csv(file_paths["feature_table_path"])
metadata = pd.read_csv(file_paths["metadata_path"], sep='\t', index_col=0)

  metadata = pd.read_csv(file_paths["metadata_path"], sep='\t', index_col=0)


In [3]:
feature_table.shape

(1835, 58437)

In [4]:
metadata.shape

(5423, 175)

# Preprocess/clean the feature table

In [5]:
#Rename and embed the column names to save space
feature_table = feature_table.rename(columns={'Unnamed: 0' : 'sample_name'})

In [6]:
num_feats = feature_table.columns[1:].size
feature_columns = feature_table.columns[1:]

#Feature embeddings
feature_table = feature_table.rename(columns={x:y for x,y in zip(feature_table.columns[1:],range(1,num_feats+1))})
feature_table

Unnamed: 0,sample_name,1,2,3,4,5,6,7,8,9,...,58427,58428,58429,58430,58431,58432,58433,58434,58435,58436
0,11666.BLANK7.7B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11666.BLANK5.5B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11666.G0341A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11666.BLANK3.3A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11666.BLANK5.5E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1830,11666.G1518B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1831,11666.G1501A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1832,11666.G1685A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1833,11666.G1536A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# TAKE THIS OUT LATER #
# Sample the dataset so that runtimes are faster
sample_table = feature_table.sample(n=1000).set_index('sample_name')

In [8]:
#Making sure each sequence has more than 10,000 reads associated with it and has been seen in at least 3 different samples (in essence, applying a series of thresholds)

# Make sure the sequence has more than 10,000 reads total
sample_table = sample_table.loc[(sample_table.sum(axis='columns') > 10000)]
sample_table

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,58427,58428,58429,58430,58431,58432,58433,58434,58435,58436
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11666.G0244A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1687A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1260A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0288A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1221A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11666.G1228A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1003A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0617A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1296A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Make sure the sequence has been seen in at least 3 different samples
sample_table = sample_table[sample_table.columns[((sample_table > 0).sum() > 3)]]
sample_table

Unnamed: 0_level_0,10,11,16,24,320,322,331,363,376,400,...,57618,57632,57677,57685,57700,57811,57813,57814,57820,57821
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11666.G0244A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1687A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1260A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0288A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,855.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1221A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,208.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11666.G1228A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1003A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0617A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,114.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1296A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
#Drop the sample_name rows in the metadata corresponding to the feature table
sample_metadata = metadata.loc[sample_table.index]
sample_metadata

Unnamed: 0_level_0,abdominal_obesity_idf_v2,abdominal_obesity_ncep_v2,ac_ratio_gt30,ac_ratio_gt30_v2,age_units,age_v2,agegroup_c6_nhanes_v2,agegroup_c6_v2,anonymized_name,antibiotic,...,taxon_id,title,type_stool,us_born_v2,weight_norm_overall_v2,weight_units,yes_no,yogurt,yrs_btwn_viv2,yrsus_c2_v2
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11666.G0244A,1,1,0,0,years,60,5,5,G0244,2,...,408170.0,GOLD study,3,0,0.442121,not provided,yes,2,5.478439425,
11666.G1687A,1,1,0,0,years,63,5,5,G1687,2,...,408170.0,GOLD study,6,0,0.356924324,not provided,,2,5.681040383,
11666.G1260A,1,1,,0,years,62,5,5,G1260,,...,408170.0,GOLD study,,0,0.395096992,not provided,,,5.234770705,2
11666.G0288A,1,1,0,0,years,53,4,4,G0288,2,...,408170.0,GOLD study,4,1,0.07816889,not provided,yes,2,5.587953457,2
11666.G1221A,1,1,0,0,years,60,5,5,G1221,2,...,408170.0,GOLD study,4,0,0.250034041,not provided,,1,5.270362765,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11666.G1228A,1,1,0,1,years,61,5,5,G1228,2,...,408170.0,GOLD study,,0,0.186084038,not provided,,2,7.099247091,2
11666.G1003A,1,1,0,0,years,60,5,5,G1003,2,...,408170.0,GOLD study,2,0,0.307013663,not provided,,2,5.327857632,2
11666.G0617A,1,0,1,1,years,46,3,4,G0617,2,...,408170.0,GOLD study,2,0,1.75702164,not provided,,2,6.557152635,2
11666.G1296A,1,1,0,0,years,28,1,2,G1296,1,...,408170.0,GOLD study,6,1,4.062257656,not provided,,1,6.976043806,2


# Run UMAP on the feature table

In [11]:
import umap
import seaborn as sns
import matplotlib.pyplot as plt

In [12]:
reducer = umap.UMAP(n_components = 2, n_neighbors = 15, metric = 'jaccard', random_state = 0)
embeddings = reducer.fit_transform(sample_table)

  warn(


In [13]:
plot_df = pd.DataFrame(data = embeddings, columns = ['dim1', 'dim2'], index = sample_table.index)


In [14]:
#add the labels for gender back into the UMAP embedding
plot_df = plot_df.merge(sample_metadata['us_born_v2'], on = 'sample_name')

umap_plot = sns.scatterplot(x = 'dim1', y = 'dim2', data = plot_df, hue='us_born_v2')
plt.show()
plt.savefig("umap1.png")
plt.close()

  plt.show()


In [15]:
#TODO Need to implement a hyperparameter tuning/cross validation method

In [16]:
#TODO figure out the columns we want to cluster, classify and visualize

In [17]:
#TODO research on different UMAP tasks that can be done

In [18]:
#TODO research 3D plots, currently only doing 2D using n_dimension of 2 for UMAP parameter