In [None]:
# Import libraries
import pandas as pd
from skbio.stats.ordination import pcoa
import plotly.express as px

In [None]:
def bray_beta_diversity_clean(directory):
    """Build the data frame used for the Bray-Curtis PCoA beta diversity plots.

    Reads ``metadata.tsv`` and ``bray-distance-matrix.tsv`` from ``directory``,
    runs a principal coordinates analysis (PCoA) on the distance matrix and
    merges the resulting coordinates with the sample metadata.

    Parameters
    ----------
    directory : str or pathlib.Path
        Folder containing ``metadata.tsv`` and ``bray-distance-matrix.tsv``.

    Returns
    -------
    pandas.DataFrame
        Outer merge of the metadata and the PCoA coordinates on ``sample-id``:
        the metadata columns followed by the coordinate columns produced by
        ``pcoa``. Samples present in only one of the two inputs are kept, with
        NaN in the columns they have no value for.
    """
    # Load the metadata and discard its first data row.
    # NOTE(review): presumably that row is a QIIME 2 ``#q2:types`` directive
    # line rather than a real sample -- confirm against the metadata file.
    metadata = pd.read_csv(f'{directory}/metadata.tsv', sep='\t')
    metadata = metadata.iloc[1:].reset_index(drop=True)

    # Read the distance matrix once. Its first column holds the sample ids and
    # becomes the index, so the ids do not have to be recovered from a second
    # read that depends on pandas' auto-generated 'Unnamed: 0' header.
    distances = pd.read_csv(
        f'{directory}/bray-distance-matrix.tsv', sep='\t', index_col=0
    )

    # PCoA coordinates, renumbered 0..n-1 so they can be paired by position.
    coordinates = pcoa(distances).samples.reset_index(drop=True)

    # Re-attach the sample ids by row position; pcoa is assumed to keep the
    # row order of the matrix it was given.
    sample_ids = pd.DataFrame({'sample-id': distances.index})
    pcoa_results = sample_ids.join(coordinates)

    # Outer merge keeps metadata rows without coordinates (and vice versa).
    return metadata.merge(pcoa_results, on='sample-id', how='outer')

In [None]:
def unifrac_beta_diversity_clean(directory):
    """Build the data frame used for the weighted UniFrac PCoA beta diversity plots.

    Reads ``metadata.tsv`` and ``unifrac-distance-matrix.tsv`` from
    ``directory``, runs a principal coordinates analysis (PCoA) on the distance
    matrix and merges the resulting coordinates with the sample metadata.

    Parameters
    ----------
    directory : str or pathlib.Path
        Folder containing ``metadata.tsv`` and ``unifrac-distance-matrix.tsv``.

    Returns
    -------
    pandas.DataFrame
        Outer merge of the metadata and the PCoA coordinates on ``sample-id``:
        the metadata columns followed by the coordinate columns produced by
        ``pcoa``. Samples present in only one of the two inputs are kept, with
        NaN in the columns they have no value for.
    """
    # Load the metadata and discard its first data row.
    # NOTE(review): presumably that row is a QIIME 2 ``#q2:types`` directive
    # line rather than a real sample -- confirm against the metadata file.
    metadata = pd.read_csv(f'{directory}/metadata.tsv', sep='\t')
    metadata = metadata.iloc[1:].reset_index(drop=True)

    # Read the distance matrix once. Its first column holds the sample ids and
    # becomes the index, so the ids do not have to be recovered from a second
    # read that depends on pandas' auto-generated 'Unnamed: 0' header.
    distances = pd.read_csv(
        f'{directory}/unifrac-distance-matrix.tsv', sep='\t', index_col=0
    )

    # PCoA coordinates, renumbered 0..n-1 so they can be paired by position.
    coordinates = pcoa(distances).samples.reset_index(drop=True)

    # Re-attach the sample ids by row position; pcoa is assumed to keep the
    # row order of the matrix it was given.
    sample_ids = pd.DataFrame({'sample-id': distances.index})
    pcoa_results = sample_ids.join(coordinates)

    # Outer merge keeps metadata rows without coordinates (and vice versa).
    return metadata.merge(pcoa_results, on='sample-id', how='outer')

In [None]:
# The cleaning function requires the data folder; 'data' is the folder the
# tests in this notebook load from.
bray_metadata_to_plot = bray_beta_diversity_clean('data')

In [None]:
# The cleaning function requires the data folder; 'data' is the folder the
# tests in this notebook load from.
unifrac_metadata_to_plot = unifrac_beta_diversity_clean('data')

In [None]:
def test_bray_beta_diversity_clean_1():
    """The Bray-Curtis cleaning function must hand back a pandas DataFrame."""
    output = bray_beta_diversity_clean('data')
    is_frame = isinstance(output, pd.DataFrame)
    assert is_frame, "not returning df"

def test_bray_beta_diversity_clean_2():
    """test function to check that Unnamed: 0 column has been removed from final DF"""
    bray_metadata_to_plot = bray_beta_diversity_clean('data')
    # Check the column labels directly. Looking the column up inside a
    # try/except only asserts when an exception is raised, so a data frame that
    # still contains the column would pass without any check being made.
    assert 'Unnamed: 0' not in bray_metadata_to_plot.columns, "unnamed column still present"
        
def test_bray_beta_diversity_clean_3():
    """test function to check that metadata and PC results were merged/both present in output"""
    bray_metadata_to_plot = bray_beta_diversity_clean('data')
    # The output must contain a column from the metadata ('body-site') AND a
    # column from the PCoA analysis ('PC1'); each one is checked on its own so
    # that a missing metadata column cannot be masked by a present 'PC1'.
    missing = [
        column for column in ('body-site', 'PC1')
        if column not in bray_metadata_to_plot.columns
    ]
    assert not missing, f"dataframe merge issues: missing metadata OR PC1 ({missing})"

In [None]:
def test_unifrac_beta_diversity_clean_1():
    """The weighted UniFrac cleaning function must hand back a pandas DataFrame."""
    output = unifrac_beta_diversity_clean('data')
    is_frame = isinstance(output, pd.DataFrame)
    assert is_frame, "not returning df"

def test_unifrac_beta_diversity_clean_2():
    """test function to check that Unnamed: 0 column has been removed from final DF"""
    unifrac_metadata_to_plot = unifrac_beta_diversity_clean('data')
    # Check the column labels directly. Looking the column up inside a
    # try/except only asserts when an exception is raised, so a data frame that
    # still contains the column would pass without any check being made.
    assert 'Unnamed: 0' not in unifrac_metadata_to_plot.columns, "unnamed column still present"
        
def test_unifrac_beta_diversity_clean_3():
    """test function to check that metadata and PC results were merged/both present in output"""
    unifrac_metadata_to_plot = unifrac_beta_diversity_clean('data')
    # The output must contain a column from the metadata ('body-site') AND a
    # column from the PCoA analysis ('PC1'). Each membership is tested
    # separately: `'body-site' and 'PC1' in cols` would only test 'PC1'.
    missing = [
        column for column in ('body-site', 'PC1')
        if column not in unifrac_metadata_to_plot.columns
    ]
    assert not missing, f"dataframe merge issues: missing metadata OR PC1 ({missing})"
    
def test_unifrac_beta_diversity_clean_4():
    """test function to check that the metadata is the same between both beta diversity functions"""
    bray_metadata_to_plot = bray_beta_diversity_clean('data')
    unifrac_metadata_to_plot = unifrac_beta_diversity_clean('data')
    # Compare the columns element by element. Series.equals also treats NaNs in
    # the same position as equal, whereas reducing each column with .all()
    # first would only compare two truthiness flags.
    assert unifrac_metadata_to_plot['body-site'].equals(
        bray_metadata_to_plot['body-site']
    ), "body-site metadata differs between bray curtis and weighted unifrac outputs"
    
def test_unifrac_beta_diversity_clean_5():
    """test function to check that beta diversity metric PCOA analyses are not identical"""
    bray_metadata_to_plot = bray_beta_diversity_clean('data')
    unifrac_metadata_to_plot = unifrac_beta_diversity_clean('data')
    # Select the coordinate column by name rather than by position, so the
    # check does not depend on how many metadata columns precede it.
    bray_pc1 = bray_metadata_to_plot['PC1']
    unifrac_pc1 = unifrac_metadata_to_plot['PC1']
    # Guard against a vacuous pass: a NaN cell never compares equal to
    # anything, so the columns must actually contain coordinates.
    assert bray_pc1.notna().any() and unifrac_pc1.notna().any(), "PC1 has no values to compare"
    assert not bray_pc1.equals(unifrac_pc1), "bray curtis equals weighted unifrac"

In [None]:
# 3-D scatter of the first three Bray-Curtis principal coordinates, coloured by
# body site. The title distinguishes this figure from the UniFrac one.
fig = px.scatter_3d(
    bray_metadata_to_plot,
    x='PC1',
    y='PC2',
    z='PC3',
    color='body-site',
    title='Bray-Curtis PCoA by body site',
)
fig.show()

In [None]:
# 3-D scatter of the first three weighted UniFrac principal coordinates,
# coloured by body site. The title distinguishes this figure from the
# Bray-Curtis one.
fig = px.scatter_3d(
    unifrac_metadata_to_plot,
    x='PC1',
    y='PC2',
    z='PC3',
    color='body-site',
    title='Weighted UniFrac PCoA by body site',
)
fig.show()