In [5]:
import json
import os
import sys
import inspect

from src.data import make_dataset
from src.features import build_features
from src.features import metrics_analysis

2023-02-28 21:46:35.827487: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-28 21:46:36.685074: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-28 21:46:36.685141: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-28 21:46:39.133050: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [6]:
## Creating paths to store temp and out data ##
if not os.path.exists("data/temp"):
    os.makedirs("data/temp")
if not os.path.exists("data/out"):
    os.makedirs("data/out")

In [7]:
from qiime2.plugins import feature_table
from qiime2 import Artifact
from qiime2.plugins.sample_classifier.pipelines import classify_samples
from qiime2.plugins.feature_table.methods import filter_samples
from qiime2 import Metadata
from qiime2.plugins.diversity.pipelines import core_metrics
from qiime2.plugins.diversity.pipelines import core_metrics_phylogenetic
from qiime2.plugins.feature_table.visualizers import summarize
from qiime2.plugins.diversity.methods import umap


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import biom
import skbio
import umap

import seaborn as sns
# %matplotlib inline 

## Loading Data

In [8]:
## Obtaining file paths
with open("config/data-params.json") as fh:
    file_paths = json.load(fh)

In [9]:
table = make_dataset.read_feature_table(file_paths["feature_table_path"])
tree = make_dataset.read_tree_table(file_paths["tree_path"])

## Load Metadata

In [10]:
qiime_metadata = Metadata.load("data/temp/final_metadata.tsv") #Cleaned metadata
metadata_df = qiime_metadata.to_dataframe()

## Dividing Metadata based on Hispanic Origin

We are going to divide and group up our data based on the hispanic origin variable to see if there are any patterns we can discover from the groupings

Metadata Description: (1=Dominican, 2=Central American, 3=Cuban, 4=Mexican, 5=Puerto Rican, 6=South American, 7=More than one heritage, 8=other)

In [13]:
#Describe the hispanic_origin column values
metadata_df['hispanic_origin'].value_counts()

4.0    863
5.0    220
3.0    216
2.0    170
1.0    123
6.0    117
7.0     28
8.0     10
Name: hispanic_origin, dtype: int64

### Select hispanic origin samples of interest
For now, look into the top 3 hispanic origin categories: Mexican, Puerto Rican, and Cuban origin

In [14]:
metadata_df_m = metadata_df.loc[(metadata_df['hispanic_origin'] == 4)] #Mexico
metadata_df_pr = metadata_df.loc[(metadata_df['hispanic_origin'] == 5)] #Puerto Rico
metadata_df_c = metadata_df.loc[(metadata_df['hispanic_origin'] == 3)] #Cuba    


In [15]:

metadata_df_m.to_csv('data/temp/final_metadata_m.tsv', sep="\t")
metadata_df_pr.to_csv('data/temp/final_metadata_pr.tsv', sep="\t")
metadata_df_c.to_csv('data/temp/final_metadata_c.tsv', sep="\t")

In [16]:
metadata_m = Metadata.load("data/temp/final_metadata_m.tsv") 
metadata_pr = Metadata.load("data/temp/final_metadata_pr.tsv") 
metadata_c = Metadata.load("data/temp/final_metadata_c.tsv") 

Create the 3 filtered feature tables for the 3 hispanic origins

In [17]:
feature_table_m = filter_samples(table, metadata = metadata_m).filtered_table
df_feature_table_m = feature_table_m.view(pd.DataFrame)

feature_table_pr = filter_samples(table, metadata = metadata_pr).filtered_table
df_feature_table_pr = feature_table_pr.view(pd.DataFrame)

feature_table_c = filter_samples(table, metadata = metadata_c).filtered_table
df_feature_table_c = feature_table_c.view(pd.DataFrame)

In [18]:
print(df_feature_table_m.shape)
print(df_feature_table_pr.shape)
print(df_feature_table_c.shape)

(863, 31600)
(220, 9465)
(216, 9567)


## Feature Table Exploration

In [11]:
#Filter feature table based on cleaned metadata
updated_feature_table = filter_samples(table, metadata = qiime_metadata).filtered_table
biom_table = updated_feature_table.view(biom.Table)
biom_table #around 4000 rows was removed

57181 x 1747 <class 'biom.table.Table'> with 342969 nonzero entries (0% dense)

In [12]:
summary = summarize(updated_feature_table, qiime_metadata)
summary.visualization.save('data/out/ft_summary')

'data/out/ft_summary.qzv'

From the summary visualizations and statistics, we see that most features only appear in less than 3 samples, therefore we are going to drop the features that appear less than 3 times in order to reduce noise.

In [13]:
feat_table = updated_feature_table.view(pd.DataFrame)
feat_table.shape

(1747, 57181)

In [14]:
#Drop the feature columns that appear in less than 3 samples
feat_table_3 = feat_table[feat_table.columns[((feat_table > 0).sum() > 3)]]

#Import DataFrame back into FeatureTable artifact and export the summary
cleaned_feature_table = Artifact.import_data("FeatureTable[Frequency]", feat_table_3)

summary_cleaned = summarize(cleaned_feature_table, qiime_metadata)
summary_cleaned.visualization.save('data/out/ft_summary_3')

'data/out/ft_summary_3.qzv'

In [15]:
feat_table_3.shape

(1747, 3985)

### Perform Feature Table Cleaning on hispanic_origin groups

In [24]:
#Drop the feature columns that appear in less than 3 samples
df_feature_table_m = df_feature_table_m[df_feature_table_m.columns[((df_feature_table_m > 0).sum() > 3)]]
df_feature_table_pr = df_feature_table_pr[df_feature_table_pr.columns[((df_feature_table_pr > 0).sum() > 3)]]
df_feature_table_c = df_feature_table_c[df_feature_table_c.columns[((df_feature_table_c > 0).sum() > 3)]]

#Import DataFrame back into FeatureTable artifact and export the summary
cleaned_feature_table_m = Artifact.import_data("FeatureTable[Frequency]", df_feature_table_m)
cleaned_feature_table_pr = Artifact.import_data("FeatureTable[Frequency]", df_feature_table_pr)
cleaned_feature_table_c = Artifact.import_data("FeatureTable[Frequency]", df_feature_table_c)


In [25]:
#Create the summary visualizations and export the files to view

summary_m = summarize(cleaned_feature_table_m, metadata_m)
summary_m.visualization.save('data/out/ft_summary_m')

summary_pr = summarize(cleaned_feature_table_pr, metadata_pr)
summary_pr.visualization.save('data/out/ft_summary_pr')

summary_c = summarize(cleaned_feature_table_c, metadata_c)
summary_c.visualization.save('data/out/ft_summary_c')

'data/out/ft_summary_c.qzv'

# Feature Table Metrics Analysis

First figure out the feature table rarefication, save the plots generated by Qiime2 core_metrics, then create the following:
1. Distance matrices: Unifrac distance matrix, Jaccard distance matrix, Bray Curtis distance matrix
2. PCOA plots with the different distance matrices
    - save the plots for visualization
    
    
3. UMAP plots with the different distance matrices
    - save the plots for visualization
    
    
4. Finally follow up with a statistical test or regression
    - Ex) PERMANOVA test on PCOA results
    - Ex) Use the reduced dimension embeddings to feed into the regression model

With the information we gained from the summary, we decided to rarefy the table with a sampling depth of _7930_ we retained _12,489,750 (52.48%) features in 1575 (90.00%) samples_ at the specifed sampling depth. We made this decision to maximize the amount of features while preserving the amount of samples in our data.

In [26]:
# # Create the feature table metrics object

# feat_table = cleaned_feature_table
# depth = 7930
# metadata = qiime_metadata
# feature_table_metrics = metrics_analysis.extract_core_metrics(feat_table, depth, metadata)


In [27]:
# Create the feature table metrics object with tree
feat_table = cleaned_feature_table
depth = 7930
metadata = qiime_metadata
feature_table_metrics_phy = core_metrics_phylogenetic(feat_table, tree, depth, metadata)

feature_table_metrics_phy

  warn(
  warn(


Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command:

faithpd -i /tmp/qiime2/renaldy/data/9f8fad42-126e-448a-989d-fba00d06788d/data/feature-table.biom -t /tmp/qiime2/renaldy/data/89eb2cfa-9e8c-462a-a9a7-93e7812fa767/data/tree.nwk -o /tmp/q2-AlphaDiversityFormat-3rk3ylix

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command:

ssu -i /tmp/qiime2/renaldy/data/9f8fad42-126e-448a-989d-fba00d06788d/data/feature-table.biom -t /tmp/qiime2/renaldy/data/89eb2cfa-9e8c-462a-a9a7-93e7812fa767/data/tree.nwk -m unweighted -o /tmp/q2-LSMatFormat-p2y5o83a

Running external command line application. This may print messages to stdout and/or stderr.
The c

  warn(
  warn(


Results (name = value)
----------------------------------------------------------------------------------------------------------------------
rarefied_table                     = <artifact: FeatureTable[Frequency] uuid: 5c541f28-c6fe-4f5e-a9dc-06cba820207c>
faith_pd_vector                    = <artifact: SampleData[AlphaDiversity] uuid: 7d919ef7-af0c-40ba-bba4-0a11e79fa297>
observed_features_vector           = <artifact: SampleData[AlphaDiversity] uuid: a162baa8-57a8-423a-93d2-7f21701f170d>
shannon_vector                     = <artifact: SampleData[AlphaDiversity] uuid: f39f8601-d8d7-4b26-a462-9d5629e1fbb8>
evenness_vector                    = <artifact: SampleData[AlphaDiversity] uuid: 59b1f5ee-331a-456f-9fde-c6a45bce90f4>
unweighted_unifrac_distance_matrix = <artifact: DistanceMatrix uuid: d90b42a0-528e-43a7-a482-b26431a0f6bb>
weighted_unifrac_distance_matrix   = <artifact: DistanceMatrix uuid: 07529560-445d-4e26-9699-097bdd815717>
jaccard_distance_matrix            = <artifact: Dist

In [28]:
# Create the feature table metrics object with tree for Mexico
feat_table = cleaned_feature_table_m
depth = 7000
metadata = metadata_m
feature_table_metrics_m = core_metrics_phylogenetic(feat_table, tree, depth, metadata)

  warn(
  warn(


Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command:

faithpd -i /tmp/qiime2/renaldy/data/715c465a-ac67-446c-a0ff-b2f9f5efcac7/data/feature-table.biom -t /tmp/qiime2/renaldy/data/89eb2cfa-9e8c-462a-a9a7-93e7812fa767/data/tree.nwk -o /tmp/q2-AlphaDiversityFormat-_duzkggq

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command:

ssu -i /tmp/qiime2/renaldy/data/715c465a-ac67-446c-a0ff-b2f9f5efcac7/data/feature-table.biom -t /tmp/qiime2/renaldy/data/89eb2cfa-9e8c-462a-a9a7-93e7812fa767/data/tree.nwk -m unweighted -o /tmp/q2-LSMatFormat-_iwb73w2

Running external command line application. This may print messages to stdout and/or stderr.
The c

  warn(
  warn(


In [29]:
# Create the feature table metrics object with tree for Mexico
feat_table = cleaned_feature_table_pr
depth = 7000
metadata = metadata_pr
feature_table_metrics_pr = core_metrics_phylogenetic(feat_table, tree, depth, metadata)

  warn(


Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command:

faithpd -i /tmp/qiime2/renaldy/data/8557146f-a4a3-4020-b60f-0f56ede5dda3/data/feature-table.biom -t /tmp/qiime2/renaldy/data/89eb2cfa-9e8c-462a-a9a7-93e7812fa767/data/tree.nwk -o /tmp/q2-AlphaDiversityFormat-4k_c_4tu

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command:

ssu -i /tmp/qiime2/renaldy/data/8557146f-a4a3-4020-b60f-0f56ede5dda3/data/feature-table.biom -t /tmp/qiime2/renaldy/data/89eb2cfa-9e8c-462a-a9a7-93e7812fa767/data/tree.nwk -m unweighted -o /tmp/q2-LSMatFormat-ubt78zic

Running external command line application. This may print messages to stdout and/or stderr.
The c

  warn(
  warn(


In [30]:
# Create the feature table metrics object with tree for Mexico
feat_table = cleaned_feature_table_c
depth = 7000
metadata = metadata_c
feature_table_metrics_c = core_metrics_phylogenetic(feat_table, tree, depth, metadata)

  warn(


Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command:

faithpd -i /tmp/qiime2/renaldy/data/bf0b11b6-41f4-4d40-a2a3-6c736c91a55d/data/feature-table.biom -t /tmp/qiime2/renaldy/data/89eb2cfa-9e8c-462a-a9a7-93e7812fa767/data/tree.nwk -o /tmp/q2-AlphaDiversityFormat-odskl5ih

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command:

ssu -i /tmp/qiime2/renaldy/data/bf0b11b6-41f4-4d40-a2a3-6c736c91a55d/data/feature-table.biom -t /tmp/qiime2/renaldy/data/89eb2cfa-9e8c-462a-a9a7-93e7812fa767/data/tree.nwk -m unweighted -o /tmp/q2-LSMatFormat-20qoe4xe

Running external command line application. This may print messages to stdout and/or stderr.
The c

  warn(
  warn(


In [31]:
feature_table_metrics_m.weighted_unifrac_emperor.save('data/out/w_unifrac_pcoa_m')
feature_table_metrics_c.weighted_unifrac_emperor.save('data/out/w_unifrac_pcoa_c')
feature_table_metrics_pr.weighted_unifrac_emperor.save('data/out/w_unifrac_pcoa_pr')

'data/out/w_unifrac_pcoa_pr.qzv'

In [32]:
#Exploration of the dataset
# Print rarefied table
rarefied_table = feature_table_metrics.rarefied_table
rarefied_df = rarefied_table.view(pd.DataFrame)
rarefied_df.shape #Rarefied table dropped some samples that are less than the n=7930 sampling depth

NameError: name 'feature_table_metrics' is not defined

In [None]:
#Calculate Distance matrices
distance_matrices = metrics_analysis.extract_distance_matrices(feature_table_metrics)
jac_dis_matrix = distance_matrices[0]
bc_dis_matrix = distance_matrices[1]

(<artifact: DistanceMatrix uuid: a4110a9a-66d9-4502-a7b4-2b83f069231c>,
 <artifact: DistanceMatrix uuid: 4d96891d-81c6-419e-b18e-e16ff60cf05c>)

In [None]:
#Calculate the PCOA matrices
pcoa_matrices = metrics_analysis.extract_pcoa_results(feature_table_metrics)
pcoa_matrices

(<artifact: PCoAResults uuid: 947398a7-d746-4a29-a56f-d5a053a9952c>,
 <artifact: PCoAResults uuid: e017dac0-fdb3-43c2-a56d-22365d255e76>)

In [None]:
#Calculate the Emperor visualization and output
pcoa_emperor_plots = metrics_analysis.extract_pcoa_emperor_vis(feature_table_metrics)
pcoa_emperor_plots

(<visualization: Visualization uuid: 316ac20e-25fc-4203-af63-ecf93aa9a5a8>,
 <visualization: Visualization uuid: df780e27-f64b-4698-b748-c03af026898f>)

In [None]:
def save_pcoa_outputs(metrics):
    jac_pcoa = metrics.jaccard_pcoa_results
    bc_pcoa = metrics.bray_curtis_pcoa_results
    
    jaccard_emperor = metrics.jaccard_emperor
    bray_curtis_emperor = metrics.bray_curtis_emperor
    
    jac_pcoa.save('data/out/jac_pcoa_matrix')
    bc_pcoa.save('data/out/bc_pcoa_matrix')
    
    jaccard_emperor.save('data/out/jac_pcoa_emp')
    bray_curtis_emperor.save('data/out/bc_pcoa_emp')
    
    return

save_pcoa_outputs(feature_table_metrics)

### UMAP Dimensionality Reduction
Perform some tasks using the UMAP artifact in Qiime2 as well as the Supervised UMAP from UMAP library

In [None]:
#Code for UMAP using qiime2
for n in (2, 5, 10, 15, 20, 50):
    dim = 3
    umap_results = metrics_analysis.extract_umap_results(u_unifrac[0], dim, n)[0]
    umap_vis = metrics_analysis.extract_umap_vis(umap_results, metadata)[0]
    file_path = "data/out/umap_vis_u_unifrac_3_" + str(n)
    umap_vis.save(file_path)

  proportion_explained=pd.Series(None, index=axis_labels),
  proportion_explained=pd.Series(None, index=axis_labels),
  proportion_explained=pd.Series(None, index=axis_labels),
  proportion_explained=pd.Series(None, index=axis_labels),
  proportion_explained=pd.Series(None, index=axis_labels),
  proportion_explained=pd.Series(None, index=axis_labels),


### Supervised UMAP
First i'm going to try to use supervised UMAP with 'disease_type' as the target on samples that only have 1 disease. Then try to apply this to the whole dataset and see if we can still get good results.
Going to attempt to visualize in the 2D space first, then 3D space using emperor. 2D plots code will be taken from UMAP documentation.


In [16]:
#Filtering metadata to only get samples with 1 disease type, to remove disease ambiguity
filtered_metadata_df = metadata_df.drop('hispanic_origin', axis=1)

filtered_metadata_df = filtered_metadata_df[(filtered_metadata_df.sum(axis=1) == 1)] #We have around 300 samples left
filtered_metadata_df

Unnamed: 0_level_0,abdominal_obesity_ncep_v2,ckd_v2,diabetes2_v2,hypertension2_v2,precvd_v2,elevated_bp_selfmeds_v2,dyslipidemia_v2
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
11666.G0003A,1.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0009A,1.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0014A,0.0,1.0,0.0,0.0,0.0,0.0,0.0
11666.G0034A,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11666.G0037A,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
11666.G1696A,0.0,1.0,0.0,0.0,0.0,0.0,0.0
11666.G1716A,1.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1745A,0.0,1.0,0.0,0.0,0.0,0.0,0.0
11666.G1760A,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [34]:
metadata_df.drop('hispanic_origin', axis=1)
filtered_metadata_df[(filtered_metadata_df.sum(axis=1) == 2)] 

  filtered_metadata_df[(filtered_metadata_df.sum(axis=1) == 2)]


Unnamed: 0_level_0,abdominal_obesity_ncep_v2,ckd_v2,diabetes2_v2,hypertension2_v2,precvd_v2,elevated_bp_selfmeds_v2,dyslipidemia_v2,target_disease
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


In [17]:
#Convert binary columns into a single categorical target column for disease type
filtered_metadata_df["target_disease"] = filtered_metadata_df.idxmax(axis=1)

#Get the target disease and sample name,then create the metadata and filterd feature table
target_disease = filtered_metadata_df['target_disease']
target_disease.to_csv('data/temp/target_disease.tsv', sep="\t")
metadata_target_disease = Metadata.load("data/temp/target_disease.tsv") 
feature_table_target_disease = filter_samples(table, metadata = metadata_target_disease).filtered_table

In [18]:
#View as a DF to perform cleaning
feature_df_target_disease = feature_table_target_disease.view(pd.DataFrame)
feature_df_target_disease = feature_df_target_disease[feature_df_target_disease.columns[((feature_df_target_disease > 0).sum() > 3)]] #Dropping features that appear in less than 3 samples
feature_df_target_disease

Unnamed: 0,AACATAGGGGGCAAGCGTTGTCCGGAATCACTGGGCGTAAAGGGCGCGTAGGTGGTCTGTTAAGTCAGATGTGAAATGTAAGGGCTCAACCCTTAACGTGCATCTGATACTGGCAGACTTGAGTGCGGAAGAGGCAAGTGGAATTCCTAG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAAACCATAGGCTCAACCTATGGATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAAACCATGGGCTCAACCCATGAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAAACCATGGGCTCAACCCGTGAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAAACTATGGGCTCAACCCATAAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAAACTATGGGCTCAACTCATAAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAAACTATGGGCTTAACCCATAAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAATCCATGGGCTCAACCCGTGAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGAAGTGAAATCCATGGGCTTAACCCGTGAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,AACGTAGGGTGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGACCGGCAAGTTGGGAGTGAAATCCATGGGCTCAACCCGTGAATTGCTTTCAAAACTGCTGGCCTTGAGTAGTGCAGAGGTAGGTGGAATTCCCGG,...,TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGTATGGCAAGTCAGAAGTGAAAGGCTGGGGCTCAACCCCGGGACTGCTTTTGAAACTGTCAAACTAGAGTACAGGAGAGGAAAGCGGAATTCCTAG,TACGTATGGTGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGTGAGACAAGTCTGAAGTGAAAATCCGGGGCTTAACCCCGGAACTGCTTTGGAAACTGCCTGACTAGAGTACAGGAGAGGTAAGTGGAATTCCTAG,TACGTATGTCCCGAGCGTTATCCGGATTTATTGGGCGTAAAGCGAGCGCAGACGGTTGATTAAGTCTGATGTGAAAGCCCGGAGCTCAACTCCGGAATGGCATTGGAAACTGGTTAACTTGAGTGTTGTAGAGGTAAGTGGAACTCCATG,TACGTATGTCGCAAGCGTTATCCGGATTTATTGGGCGTAAAGCGCGTCTAGGCGGCTTAGTAAGTCTGATGTGAAAATGCGGGGCTCAACCCCGTATTGCGTTGGAAACTGCTAAACTAGAGTACTGGAGAGGTAGGCGGAACTACAAGT,TACGTATGTCGCAAGCGTTATCCGGATTTATTGGGCGTAAAGCGCGTCTAGGCGGTTTAGTAAGTCTGATGTGAAAATGCGGGGCTCAACCCCGTATTGCGTTGGAAACTGCTAAACTAGAGTACTGGAGAGGTAGGCGGAACTACAAGT,TACGTATGTCGCAAGCGTTATCCGGATTTATTGGGCGTAAAGCGCGTCTAGGCGGTTTGGTAAGTCTGATGTGAAAATGCGGGGCTCAACTCCGTATTGCGTTGGAAACTGCCAAACTAGAGTACTGGAGAGGTGGGCGGAACTACAAGT,TACGTATGTCGCGAGCGTTATCCGGAATTATTGGGCATAAAGGGCATCTAGGCGGATATACAAGTCAGGGGTGAAAACTTAGGGCTCAACTCAAAGCTTGCCTTTGAAACTGTATATCTAGAGTGCTGGAGAGGTGGACGGAACTACACG,TACGTATGTTCCAAGCGTTATCCGGATTTATTGGGCGTAAAGCGAGCGCAGACGGTTATTTAAGTCTGAAGTGAAAGCCCTCAGCTCAACTGAGGAATTGCTTTGGAAACTGGATGACTTGAGTGCAGTAGAGGAAAGTGGAACTCCATG,TACGTGGGTCGCAAGCGTTATCCGGAATCATTGGGCGTAAAGAGTGAGCAGGCGGTCCGGGAAGTCCGCGGTGAAATGCGGGGGCTCAACCCCCGCAGGCCGCGGATACTTCCGGTCTGGGGTGCGGGAGAGGCGGACGGAACTCCGCGG,TACGTGGGTCGCAAGCGTTATCCGGAATCATTGGGCGTAAAGAGTGAGCAGGCGGTCCGGGAAGTCCGCGGTGAAATGCGGGGGCTCAACCCCCGCAGGCCGCGGATACTTCCGGTCTGGGGTGCGGGAGAGGCGGACGGAACTCCGCGT
11666.G0394A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0156A,0.0,0.0,0.0,0.0,5.0,0.0,0.0,2.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0343A,0.0,0.0,0.0,0.0,84.0,0.0,0.0,0.0,0.0,0.0,...,465.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G0103A,3.0,0.0,0.0,0.0,44.0,0.0,0.0,2.0,0.0,0.0,...,768.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0
11666.G0366A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11666.G1477A,0.0,0.0,0.0,0.0,45.0,0.0,9.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1494A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1648A,0.0,0.0,0.0,4.0,4.0,0.0,0.0,1.0,0.0,0.0,...,134.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11666.G1650A,2.0,0.0,0.0,4.0,14.0,0.0,0.0,0.0,0.0,0.0,...,2.0,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
#Convert target disease into categorical numbers
target_disease_dict = {}
for i in range(len(target_disease.unique())):
    target_disease_dict[target_disease.unique()[i]] = i

#
target_disease_map = target_disease.map(target_disease_dict)

In [21]:
#Create the UMAP pipeline, first do unsupervised regular UMAP, then compare with supervised UMAP

# embedding = umap.UMAP(n_neighbors=75, metric="jaccard").fit_transform(feature_df_target_disease, y=target_disease_map)
# fig, ax = plt.subplots(1, figsize=(9, 5))
# plt.scatter(*embedding.T, s=5, c=target_disease_map, cmap='Spectral', alpha=1.0)
# plt.setp(ax, xticks=[], yticks=[])
# cbar = plt.colorbar(boundaries=np.arange(6)-0.5)
# cbar.set_ticks(np.arange(5))
# cbar.set_ticklabels(target_disease_dict.keys())
# plt.title('UMAP');
# plt.savefig('umap.png')

  warn(


In [90]:
def umap_plot_unsupervised(feature_table, n_neighbors, n_components, metric):
    embedding = umap.UMAP(n_neighbors=n_neighbors, n_components= n_components,metric=metric, random_state=10).fit_transform(feature_table)
    fig, ax = plt.subplots(1, figsize=(9, 5))
    plt.scatter(*embedding.T, s=5, c=target_disease_map, cmap='viridis', alpha=1.0)
    plt.setp(ax, xticks=[], yticks=[])
    cbar = plt.colorbar(boundaries=np.arange(6)-0.5)
    cbar.set_ticks(np.arange(5))
    cbar.set_ticklabels(target_disease_dict.keys())
    plt.title('UMAP Plot with n_neighbors:{0}, {1} metric'.format(n_neighbors, metric))
    plt.savefig('umap_unsupervised.png')
    return embedding

embedding = umap_plot_unsupervised(feature_df_target_disease, target_disease_map, 75, 2, "jaccard")

In [31]:
def umap_plot_supervised(feature_table, target, n_neighbors, n_components, metric):
    embedding = umap.UMAP(n_neighbors=n_neighbors, n_components= n_components,metric=metric, random_state=10).fit_transform(feature_table, y=target)
    fig, ax = plt.subplots(1, figsize=(9, 5))
    plt.scatter(*embedding.T, s=5, c=target_disease_map, cmap='viridis', alpha=1.0)
    plt.setp(ax, xticks=[], yticks=[])
    cbar = plt.colorbar(boundaries=np.arange(6)-0.5)
    cbar.set_ticks(np.arange(5))
    cbar.set_ticklabels(target_disease_dict.keys())
    plt.title('UMAP Plot with n_neighbors:{0}, {1} metric'.format(n_neighbors, metric))
    plt.savefig('umap_supervised.png')
    return embedding

embedding = umap_plot_supervised(feature_df_target_disease, target_disease_map, 75, 2, "jaccard")

  warn(


In [None]:
# import numpy as np
# from mnist.loader import MNIST
# import matplotlib.pyplot as plt
# %matplotlib inline
# import seaborn as sns
# sns.set(style='white', context='poster')
# # mndata = MNIST('fashion-mnist/data/fashion')
# # train, train_labels = mndata.load_training()
# # test, test_labels = mndata.load_testing()
# # data = np.array(np.vstack([train, test]), dtype=np.float64) / 255.0
# # target = np.hstack([train_labels, test_labels])
# # classes = [
# #     'T-shirt/top',
# #     'Trouser',
# #     'Pullover',
# #     'Dress',
# #     'Coat',
# #     'Sandal',
# #     'Shirt',
# #     'Sneaker',
# #     'Bag',
# #     'Ankle boot']
