In [1]:
from glob import glob
from os import listdir, makedirs
from os.path import join
import pandas as pd
import numpy as np

In [2]:
group_sig_out_dir = './output/distance'

## Import data

Get a list of all the files in the output group significance directory

In [3]:
taxa = listdir(group_sig_out_dir)

Iterate over each of the output files, and import the group significance results

In [4]:
# grab tsv files
bact_dfs = []
for d in taxa:
    data_fp = join(group_sig_out_dir,
                   d,
                   'permanova',
                   'raw_data.tsv')
    
    d_df = pd.read_csv(data_fp, 
                       header=0,
                       sep='\t',
                       index_col=0)

    d_df['Bacterial_Genus'] = d
    
    bact_dfs.append(d_df)
    

# load group significance tsvs


Combine these results into a single dataframe.

In [5]:
combined_df = pd.concat(bact_dfs, ignore_index=True)

In [6]:
combined_df.head()

Unnamed: 0,SubjectID1,SubjectID2,Group1,Group2,Distance,Bacterial_Genus
0,100030.13393.SRR8978344,100030.13393.SRR8978339,chimp_DRC_wild_campbell,chimp_DRC_wild_campbell,1.0,k__Bacteria_p___c___o___f___g__
1,100030.13393.SRR8978339,65716.12173.133,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,1.0,k__Bacteria_p___c___o___f___g__
2,100030.13393.SRR8978339,65716.12173.104,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,0.5,k__Bacteria_p___c___o___f___g__
3,100030.13393.SRR8978339,65716.12173.144,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,1.0,k__Bacteria_p___c___o___f___g__
4,100030.13393.SRR8978339,65716.12173.150,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,1.0,k__Bacteria_p___c___o___f___g__


## Aggregate distance values

Using the Pandas pivot table functionality, aggregate the individual dissimilarity comparisons by group using `np.mean`.

In [7]:
combined_pivot = combined_df.pivot_table(values='Distance',
                        index=['Group1', 'Group2'],
                        columns=['Bacterial_Genus'],
                        aggfunc=np.mean)

In [8]:
combined_pivot.reset_index(inplace=True)

In [9]:
combined_pivot.head()


Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteriia_o__Acidobacteriales_f__Acidobacteriaceae_g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Anaeroplasmatales_f__Anaeroplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Anaeroplasmatales_f__Anaeroplasmataceae_g__Anaeroplasma,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p___c___o___f___g__
0,chimp_DRC_wild_campbell,chimp_DRC_wild_campbell,0.920635,,,,0.2,0.846254,0.333333,1.0,...,,,,0.632296,1.0,,0.135761,0.646717,0.553968,1.0
1,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,1.0,,,0.197619,0.349583,0.962346,0.852381,,...,,,,0.795701,0.78466,,0.430928,0.705881,0.405233,0.821429
2,chimp_DRC_wild_campbell,chimp_USA_captive_campbell,0.928571,,,0.285185,0.562698,0.969989,,,...,,,,0.873083,0.855688,,1.0,1.0,0.847937,
3,chimp_DRC_wild_campbell,chimp_USA_captive_moeller,,,,0.217949,0.561645,0.983341,,0.510417,...,,,,0.895034,0.896667,,1.0,1.0,0.839608,
4,chimp_DRC_wild_campbell,douc_SGP_captive_clayton,,,,,,1.0,,0.5,...,,,,0.991665,1.0,,,1.0,,


## Load target group comparisons

This file should list just the specific group comparisons we want to look at.

In [10]:
target_grp_df = pd.read_csv('./data/host_specificity_score_populations.txt', sep='\t')

In [11]:
target_grp_df

Unnamed: 0,captive_population,human_population,wild_population
0,douc_USA_captive_clayton,human_USA,douc_VNM_wild_clayton
1,douc_SGP_captive_clayton,human_USA,douc_VNM_wild_clayton
2,chimp_USA_captive_moeller,human_USA,chimp_TZA_wild_moeller
3,chimp_USA_captive_campbell,human_USA,chimp_DRC_wild_campbell
4,howler_CRI_captive_clayton,human_USA,howler_CRI_wild_clayton
5,gorilla_USA_captive_campbell,human_USA,gorilla_DRC_wild_campbell
6,douc_VNM_semicaptive_clayton,human_USA,douc_VNM_wild_clayton


## Filter data list to target comparisons

Update the aggregated pivot table to include the comparison number and comparison direction (captive to human or captive to wild) as additional columns

In [12]:
combined_pivot['comparison_num'] = 'None'
combined_pivot['comparison_dir'] = 'None'

for i, row in target_grp_df.iterrows():
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_num'] = i
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_dir'] = 'human'
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['wild_population']),
                       'comparison_num'] = i
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['wild_population']),
                       'comparison_dir'] = 'wild'

Remove non-target comparisons from the table

In [13]:
combined_pivot_filtered = combined_pivot.loc[(combined_pivot['comparison_num'] != 'None'),]

In [14]:
combined_pivot_filtered

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteriia_o__Acidobacteriales_f__Acidobacteriaceae_g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
26,chimp_USA_captive_campbell,chimp_DRC_wild_campbell,0.928571,,,0.285185,0.562698,0.969989,,,...,,0.873083,0.855688,,1.0,1.0,0.847937,,3,wild
38,chimp_USA_captive_campbell,human_USA,,,1.0,,0.852998,0.999667,,,...,,0.980116,1.0,,0.929211,,0.759744,,3,human
40,chimp_USA_captive_moeller,chimp_TZA_wild_moeller,,0.619048,1.0,0.315201,0.477241,0.952806,,,...,,0.883538,0.871956,0.947386,1.0,0.917548,0.960811,,2,wild
51,chimp_USA_captive_moeller,human_USA,,,1.0,,0.826263,0.998939,,0.11875,...,,0.980034,0.996107,,0.929211,,0.894608,,2,human
59,douc_SGP_captive_clayton,douc_VNM_wild_clayton,,,,,,0.975372,,,...,,0.963201,0.675744,,,,,,1,wild
64,douc_SGP_captive_clayton,human_USA,,,1.0,,,0.993318,,0.1,...,,0.993883,1.0,,,,,,1,human
72,douc_USA_captive_clayton,douc_VNM_wild_clayton,,,,,1.0,1.0,,,...,,0.998979,1.0,,,,,,0,wild
77,douc_USA_captive_clayton,human_USA,,,,,1.0,0.981047,,,...,,1.0,1.0,,,,0.552083,,0,human
85,douc_VNM_semicaptive_clayton,douc_VNM_wild_clayton,,,,,1.0,0.873591,,,...,,0.802844,0.679183,,0.0,,,,6,wild
90,douc_VNM_semicaptive_clayton,human_USA,,,1.0,,1.0,1.0,,,...,,0.995633,1.0,,1.0,,,,6,human


## Calculate HSS

First, transpose and set column indexed on the combined filtered dataframe so that taxon is the row index and comparison number / comparison direction are a column MultiIndex

In [15]:
combined_pivot_filtered_t = combined_pivot_filtered.drop(columns=['Group1', 'Group2']).set_index(['comparison_num',
                                                                            'comparison_dir']).transpose()

In [16]:
combined_pivot_filtered_t.head()

comparison_num,3,3,2,2,1,1,0,0,6,6,5,5,4,4
comparison_dir,wild,human,wild,human,wild,human,wild,human,wild,human,wild,human,wild,human
Bacterial_Genus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
Unassigned_____,0.928571,,,,,,,,,,0.907407,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,0.619048,,,,,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,1.0,1.0,1.0,,1.0,,,,1.0,,1.0,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,0.285185,,0.315201,,,,,,,,0.358222,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,0.562698,0.852998,0.477241,0.826263,,,1.0,1.0,1.0,1.0,0.796349,0.89037,,


In [17]:
target_grps = target_grp_df.copy()

In [18]:
target_grps.drop(columns=['human_population', 'wild_population'], 
                inplace=True)

In [19]:
HSS_df = pd.DataFrame(index=combined_pivot_filtered_t.index, 
                      columns=target_grps.index)

for i, row in target_grps.iterrows():
    HSS = combined_pivot_filtered_t.loc[:,(i,
                     'human')]/combined_pivot_filtered_t.loc[:,(i,
                                          'wild')]
    HSS_df[i] = HSS

In [20]:
HSS_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6
Bacterial_Genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Unassigned_____,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,1.0,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,1.0,,1.731331,1.515907,,1.118065,1.0


Make this pretty; add back in the comparison names

In [21]:
a = HSS_df.transpose()
a

Bacterial_Genus,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteriia_o__Acidobacteriales_f__Acidobacteriaceae_g__,k__Bacteria_p__Acidobacteria_c__[Chloracidobacteria]_o__RB41_f__Ellin6075_g__,k__Bacteria_p__Actinobacteria____,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Anaeroplasmatales_f__Anaeroplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Anaeroplasmatales_f__Anaeroplasmataceae_g__Anaeroplasma,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p___c___o___f___g__
0,,,,,1.0,0.981047,,,,,...,,,,1.001022,1.0,,,,,
1,,,,,,1.018399,,,,,...,,,,1.031855,1.47985,,,,,
2,,,1.0,,1.731331,1.048418,,,,0.208661,...,0.847907,,,1.109215,1.142383,,0.929211,,0.931096,
3,,,,,1.515907,1.030596,,,,inf,...,,,,1.122592,1.16865,,0.929211,,0.895992,
4,,,,,,1.181017,,inf,,0.04,...,,,,1.084983,,,5.142857,,,
5,,,,,1.118065,1.009731,,,,0.960584,...,,,,1.158052,1.0,,1.011626,,0.729062,
6,,,,,1.0,1.144701,,,,,...,2.139109,,,1.240132,1.472357,,inf,,,


In [22]:
target_grp_df.columns

Index(['captive_population', 'human_population', 'wild_population'], dtype='object')

In [23]:
final_table = a.merge(target_grp_df,
                      left_index=True,
                      right_index=True).set_index(list(target_grp_df.columns)).transpose()

In [24]:
final_table.head()

captive_population,douc_USA_captive_clayton,douc_SGP_captive_clayton,chimp_USA_captive_moeller,chimp_USA_captive_campbell,howler_CRI_captive_clayton,gorilla_USA_captive_campbell,douc_VNM_semicaptive_clayton
human_population,human_USA,human_USA,human_USA,human_USA,human_USA,human_USA,human_USA
wild_population,douc_VNM_wild_clayton,douc_VNM_wild_clayton,chimp_TZA_wild_moeller,chimp_DRC_wild_campbell,howler_CRI_wild_clayton,gorilla_DRC_wild_campbell,douc_VNM_wild_clayton
Unassigned_____,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,1.0,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,1.0,,1.731331,1.515907,,1.118065,1.0


Write to file

In [25]:
outdir = join('output', 'results')
makedirs(outdir, exist_ok=True)
final_table.to_csv(join(outdir, 'hss.txt'), sep='\t')