In [1]:
from glob import glob
from os import listdir, makedirs
from os.path import join
import pandas as pd
import numpy as np

In [2]:
group_sig_out_dir = './output/distance'

## Import data

Get a list of all the files in the output group significance directory

In [3]:
taxa = listdir(group_sig_out_dir)

Iterate over each of the output files, and import the group significance results

In [4]:
# grab tsv files
bact_dfs = []
for d in taxa:
    data_fp = join(group_sig_out_dir,
                   d,
                   'permanova',
                   'raw_data.tsv')
    
    d_df = pd.read_csv(data_fp, 
                       header=0,
                       sep='\t',
                       index_col=0)

    d_df['Bacterial_Genus'] = d
    
    bact_dfs.append(d_df)
    

# load group significance tsvs


Combine these results into a single dataframe.

In [5]:
combined_df = pd.concat(bact_dfs, ignore_index=True)

In [6]:
combined_df.head()

Unnamed: 0,SubjectID1,SubjectID2,Group1,Group2,Distance,Bacterial_Genus
0,65716.12173.91,65716.12173.92,chimp_TZA_wild_moeller,chimp_TZA_wild_moeller,1.0,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_...
1,65716.12173.92,80072.12821.G2MB7,chimp_TZA_wild_moeller,howler_CRI_wild_clayton,1.0,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_...
2,65716.12173.91,80072.12821.G2MB7,chimp_TZA_wild_moeller,howler_CRI_wild_clayton,1.0,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_...
3,80072.12821.G2MB7,65716.12173.92,howler_CRI_wild_clayton,chimp_TZA_wild_moeller,1.0,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_...
4,80072.12821.G2MB7,65716.12173.91,howler_CRI_wild_clayton,chimp_TZA_wild_moeller,1.0,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_...


## Aggregate distance values

Using the Pandas pivot table functionality, aggregate the individual dissimilarity comparisons by group using `np.mean`.

In [7]:
combined_pivot = combined_df.pivot_table(values='Distance',
                        index=['Group1', 'Group2'],
                        columns=['Bacterial_Genus'],
                        aggfunc=np.mean)

In [8]:
combined_pivot.reset_index(inplace=True)

In [9]:
combined_pivot.head()


Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f__mb2424_g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p__Verrucomicrobia_c__[Spartobacteria]_o__[Chthoniobacterales]_f__[Chthoniobacteraceae]_g__DA101,k__Bacteria_p__WPS-2_c___o___f___g__,k__Bacteria_p___c___o___f___g__
0,chimp_DRC_wild_campbell,chimp_DRC_wild_campbell,0.952381,,,0.47619,0.222222,0.844795,1.0,,...,1.0,0.626793,1.0,0.4,0.222222,0.684271,,,,0.0
1,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,,,,0.346599,0.507755,0.944939,0.975,,...,0.638889,0.794204,0.779603,0.211348,0.507755,0.714956,,,,0.012658
2,chimp_DRC_wild_campbell,chimp_USA_captive_campbell,1.0,,,0.625661,0.610979,0.966638,,,...,1.0,0.866128,0.878175,1.0,0.610979,1.0,,,,1.0
3,chimp_DRC_wild_campbell,chimp_USA_captive_moeller,,,,0.619048,0.685714,0.985043,,,...,,0.893772,0.95,1.0,0.685714,1.0,,,,
4,chimp_DRC_wild_campbell,douc_SGP_captive_clayton,,,,0.699134,1.0,0.998055,,,...,,0.995097,1.0,1.0,1.0,1.0,,,,


## Load target group comparisons

This file should list just the specific group comparisons we want to look at.

In [10]:
target_grp_df = pd.read_csv('./data/host_specificity_score_populations_american_gut.txt', sep='\t')

In [11]:
target_grp_df

Unnamed: 0,captive_population,human_population,wild_population
0,douc_USA_captive_clayton,human_USA,douc_VNM_wild_clayton
1,douc_SGP_captive_clayton,human_USA,douc_VNM_wild_clayton
2,chimp_USA_captive_moeller,human_USA,chimp_TZA_wild_moeller
3,chimp_USA_captive_campbell,human_USA,chimp_DRC_wild_campbell
4,howler_CRI_captive_clayton,human_USA,howler_CRI_wild_clayton
5,gorilla_USA_captive_campbell,human_USA,gorilla_DRC_wild_campbell
6,douc_VNM_semicaptive_clayton,human_USA,douc_VNM_wild_clayton


## Filter data list to target comparisons

Update the aggregated pivot table to include the comparison number and comparison direction (captive to human or captive to wild) as additional columns

In [12]:
combined_pivot['comparison_num'] = 'None'
combined_pivot['comparison_dir'] = 'None'

for i, row in target_grp_df.iterrows():
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_num'] = i
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_dir'] = 'human'
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['wild_population']),
                       'comparison_num'] = i
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['wild_population']),
                       'comparison_dir'] = 'wild'

Remove non-target comparisons from the table

In [13]:
combined_pivot_filtered = combined_pivot.loc[(combined_pivot['comparison_num'] != 'None'),]

In [14]:
combined_pivot_filtered

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f__mb2424_g__,...,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p__Verrucomicrobia_c__[Spartobacteria]_o__[Chthoniobacterales]_f__[Chthoniobacteraceae]_g__DA101,k__Bacteria_p__WPS-2_c___o___f___g__,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
24,chimp_USA_captive_campbell,chimp_DRC_wild_campbell,1.0,,,0.625661,0.610979,0.966638,,,...,0.878175,1.0,0.610979,1.0,,,,1.0,3,wild
35,chimp_USA_captive_campbell,human_USA,,,,0.018519,1.0,0.998805,,,...,0.980938,,1.0,,0.298246,,,1.0,3,human
37,chimp_USA_captive_moeller,chimp_TZA_wild_moeller,,,,0.928571,0.513061,0.940518,,0.210526,...,0.921071,1.0,0.513061,0.926907,0.666667,,0.210526,,2,wild
47,chimp_USA_captive_moeller,human_USA,,,,0.0,1.0,0.998232,,,...,0.965,,1.0,,0.376218,,,,2,human
54,douc_SGP_captive_clayton,douc_VNM_wild_clayton,,,,,1.0,0.981906,,,...,0.831372,,1.0,,0.877683,,,,1,wild
59,douc_SGP_captive_clayton,human_USA,,,1.0,0.212121,1.0,0.992915,,,...,1.0,,1.0,,0.385526,,,,1,human
66,douc_VNM_semicaptive_clayton,douc_VNM_wild_clayton,,,,,1.0,0.905775,,,...,0.730686,,1.0,,0.321829,,,,6,wild
71,douc_VNM_semicaptive_clayton,human_USA,,,,0.875,1.0,1.0,,,...,1.0,,1.0,,0.961014,,,,6,human
103,gorilla_USA_captive_campbell,gorilla_DRC_wild_campbell,,,,0.833333,0.840678,0.992406,,,...,1.0,,0.840678,1.0,,,,,5,wild
107,gorilla_USA_captive_campbell,human_USA,,,,0.0,1.0,0.994426,,,...,0.984683,,1.0,,0.532164,,,,5,human


## Calculate HSS

First, transpose and set column indexed on the combined filtered dataframe so that taxon is the row index and comparison number / comparison direction are a column MultiIndex

In [15]:
combined_pivot_filtered_t = combined_pivot_filtered.drop(columns=['Group1', 'Group2']).set_index(['comparison_num',
                                                                            'comparison_dir']).transpose()

In [16]:
combined_pivot_filtered_t.head()

comparison_num,3,3,2,2,1,1,6,6,5,5,4,4
comparison_dir,wild,human,wild,human,wild,human,wild,human,wild,human,wild,human
Bacterial_Genus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Unassigned_____,1.0,,,,,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,,,,1.0,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,0.625661,0.018519,0.928571,0.0,,0.212121,,0.875,0.833333,0.0,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,0.610979,1.0,0.513061,1.0,1.0,1.0,1.0,1.0,0.840678,1.0,,


In [17]:
target_grps = target_grp_df.copy()

In [18]:
target_grps.drop(columns=['human_population', 'wild_population'], 
                inplace=True)

In [19]:
HSS_df = pd.DataFrame(index=combined_pivot_filtered_t.index, 
                      columns=target_grps.index)

for i, row in target_grps.iterrows():
    HSS = combined_pivot_filtered_t.loc[:,(i,
                     'human')]/combined_pivot_filtered_t.loc[:,(i,
                                          'wild')]
    HSS_df[i] = HSS

KeyError: (0, 'human')

In [None]:
HSS_df.head()

Make this pretty; add back in the comparison names

In [None]:
a = HSS_df.transpose()
a

In [None]:
target_grp_df.columns

In [None]:
final_table = a.merge(target_grp_df,
                      left_index=True,
                      right_index=True).set_index(list(target_grp_df.columns)).transpose()

In [None]:
final_table.head()

Write to file

In [None]:
outdir = join('output', 'results')
makedirs(outdir, exist_ok=True)
final_table.to_csv(join(outdir, 'hss_american_gut.txt'), sep='\t')