In [1]:
from glob import glob
from os import listdir, makedirs
from os.path import join
import pandas as pd
import numpy as np

# Data set 1: Yatsunenko et al

In [2]:
group_sig_out_dir = './output/distance'

## Import data

Get a list of all the files in the output group significance directory

In [3]:
taxa = listdir(group_sig_out_dir)

Iterate over each of the output files, and import the group significance results

In [4]:
# grab tsv files
bact_dfs = []
for d in taxa:
    data_fp = join(group_sig_out_dir,
                   d,
                   'permanova',
                   'raw_data.tsv')
    
    d_df = pd.read_csv(data_fp, 
                       header=0,
                       sep='\t',
                       index_col=0)

    d_df['Bacterial_Genus'] = d
    
    bact_dfs.append(d_df)
    

# load group significance tsvs


Combine these results into a single dataframe.

In [5]:
combined_df = pd.concat(bact_dfs, ignore_index=True)

In [6]:
combined_df.head()

Unnamed: 0,SubjectID1,SubjectID2,Group1,Group2,Distance,Bacterial_Genus
0,100030.13393.SRR8978344,100030.13393.SRR8978339,chimp_DRC_wild_campbell,chimp_DRC_wild_campbell,1.0,k__Bacteria_p___c___o___f___g__
1,100030.13393.SRR8978339,65716.12173.133,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,1.0,k__Bacteria_p___c___o___f___g__
2,100030.13393.SRR8978339,65716.12173.104,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,0.5,k__Bacteria_p___c___o___f___g__
3,100030.13393.SRR8978339,65716.12173.144,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,1.0,k__Bacteria_p___c___o___f___g__
4,100030.13393.SRR8978339,65716.12173.150,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,1.0,k__Bacteria_p___c___o___f___g__


## Aggregate distance values

Using the Pandas pivot table functionality, aggregate the individual dissimilarity comparisons by group using `np.mean`.

In [7]:
combined_pivot = combined_df.pivot_table(values='Distance',
                        index=['Group1', 'Group2'],
                        columns=['Bacterial_Genus'],
                        aggfunc=np.mean)

In [8]:
combined_pivot.reset_index(inplace=True)

In [9]:
combined_pivot.head()


Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteriia_o__Acidobacteriales_f__Acidobacteriaceae_g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Anaeroplasmatales_f__Anaeroplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Anaeroplasmatales_f__Anaeroplasmataceae_g__Anaeroplasma,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p___c___o___f___g__
0,chimp_DRC_wild_campbell,chimp_DRC_wild_campbell,0.920635,,,,0.2,0.846254,0.333333,1.0,...,,,,0.632296,1.0,,0.135761,0.646717,0.553968,1.0
1,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,1.0,,,0.197619,0.349583,0.962346,0.852381,,...,,,,0.795701,0.78466,,0.430928,0.705881,0.405233,0.821429
2,chimp_DRC_wild_campbell,chimp_USA_captive_campbell,0.928571,,,0.285185,0.562698,0.969989,,,...,,,,0.873083,0.855688,,1.0,1.0,0.847937,
3,chimp_DRC_wild_campbell,chimp_USA_captive_moeller,,,,0.217949,0.561645,0.983341,,0.510417,...,,,,0.895034,0.896667,,1.0,1.0,0.839608,
4,chimp_DRC_wild_campbell,douc_SGP_captive_clayton,,,,,,1.0,,0.5,...,,,,0.991665,1.0,,,1.0,,


## Load target group comparisons

This file should list just the specific group comparisons we want to look at.

In [10]:
target_grp_df = pd.read_csv('./data/host_specificity_score_populations.txt', sep='\t')

In [11]:
target_grp_df

Unnamed: 0,captive_population,human_population,wild_population
0,douc_USA_captive_clayton,human_USA,douc_VNM_wild_clayton
1,douc_SGP_captive_clayton,human_USA,douc_VNM_wild_clayton
2,chimp_USA_captive_moeller,human_USA,chimp_TZA_wild_moeller
3,chimp_USA_captive_campbell,human_USA,chimp_DRC_wild_campbell
4,howler_CRI_captive_clayton,human_USA,howler_CRI_wild_clayton
5,gorilla_USA_captive_campbell,human_USA,gorilla_DRC_wild_campbell
6,douc_VNM_semicaptive_clayton,human_USA,douc_VNM_wild_clayton


## Filter data list to target comparisons

Update the aggregated pivot table to include the comparison number and comparison direction (captive to human or captive to wild) as additional columns

In [12]:
combined_pivot['comparison_num'] = 'None'
combined_pivot['comparison_dir'] = 'None'

for i, row in target_grp_df.iterrows():
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_num'] = i
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_dir'] = 'human'
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['wild_population']),
                       'comparison_num'] = i
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['wild_population']),
                       'comparison_dir'] = 'wild'

Remove non-target comparisons from the table

In [13]:
combined_pivot_filtered = combined_pivot.loc[(combined_pivot['comparison_num'] != 'None'),]

In [14]:
combined_pivot_filtered

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteriia_o__Acidobacteriales_f__Acidobacteriaceae_g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
26,chimp_USA_captive_campbell,chimp_DRC_wild_campbell,0.928571,,,0.285185,0.562698,0.969989,,,...,,0.873083,0.855688,,1.0,1.0,0.847937,,3,wild
38,chimp_USA_captive_campbell,human_USA,,,1.0,,0.852998,0.999667,,,...,,0.980116,1.0,,0.929211,,0.759744,,3,human
40,chimp_USA_captive_moeller,chimp_TZA_wild_moeller,,0.619048,1.0,0.315201,0.477241,0.952806,,,...,,0.883538,0.871956,0.947386,1.0,0.917548,0.960811,,2,wild
51,chimp_USA_captive_moeller,human_USA,,,1.0,,0.826263,0.998939,,0.11875,...,,0.980034,0.996107,,0.929211,,0.894608,,2,human
59,douc_SGP_captive_clayton,douc_VNM_wild_clayton,,,,,,0.975372,,,...,,0.963201,0.675744,,,,,,1,wild
64,douc_SGP_captive_clayton,human_USA,,,1.0,,,0.993318,,0.1,...,,0.993883,1.0,,,,,,1,human
72,douc_USA_captive_clayton,douc_VNM_wild_clayton,,,,,1.0,1.0,,,...,,0.998979,1.0,,,,,,0,wild
77,douc_USA_captive_clayton,human_USA,,,,,1.0,0.981047,,,...,,1.0,1.0,,,,0.552083,,0,human
85,douc_VNM_semicaptive_clayton,douc_VNM_wild_clayton,,,,,1.0,0.873591,,,...,,0.802844,0.679183,,0.0,,,,6,wild
90,douc_VNM_semicaptive_clayton,human_USA,,,1.0,,1.0,1.0,,,...,,0.995633,1.0,,1.0,,,,6,human


## Calculate HSS

First, transpose and set column indexed on the combined filtered dataframe so that taxon is the row index and comparison number / comparison direction are a column MultiIndex

In [15]:
combined_pivot_filtered_t = combined_pivot_filtered.drop(columns=['Group1', 'Group2']).set_index(['comparison_num',
                                                                            'comparison_dir']).transpose()

In [16]:
combined_pivot_filtered_t.head()

comparison_num,3,3,2,2,1,1,0,0,6,6,5,5,4,4
comparison_dir,wild,human,wild,human,wild,human,wild,human,wild,human,wild,human,wild,human
Bacterial_Genus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
Unassigned_____,0.928571,,,,,,,,,,0.907407,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,0.619048,,,,,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,1.0,1.0,1.0,,1.0,,,,1.0,,1.0,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,0.285185,,0.315201,,,,,,,,0.358222,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,0.562698,0.852998,0.477241,0.826263,,,1.0,1.0,1.0,1.0,0.796349,0.89037,,


In [17]:
target_grps = target_grp_df.copy()

In [18]:
target_grps.drop(columns=['human_population', 'wild_population'], 
                inplace=True)

In [19]:
HSS_df = pd.DataFrame(index=combined_pivot_filtered_t.index, 
                      columns=target_grps.index)

for i, row in target_grps.iterrows():
    HSS = combined_pivot_filtered_t.loc[:,(i,
                     'human')]/combined_pivot_filtered_t.loc[:,(i,
                                          'wild')]
    HSS_df[i] = HSS

In [20]:
HSS_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6
Bacterial_Genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Unassigned_____,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,1.0,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,1.0,,1.731331,1.515907,,1.118065,1.0


Make this pretty; add back in the comparison names

In [21]:
HSS_df_t = HSS_df.transpose()
HSS_df_t

Bacterial_Genus,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteriia_o__Acidobacteriales_f__Acidobacteriaceae_g__,k__Bacteria_p__Acidobacteria_c__[Chloracidobacteria]_o__RB41_f__Ellin6075_g__,k__Bacteria_p__Actinobacteria____,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Anaeroplasmatales_f__Anaeroplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Anaeroplasmatales_f__Anaeroplasmataceae_g__Anaeroplasma,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p___c___o___f___g__
0,,,,,1.0,0.981047,,,,,...,,,,1.001022,1.0,,,,,
1,,,,,,1.018399,,,,,...,,,,1.031855,1.47985,,,,,
2,,,1.0,,1.731331,1.048418,,,,0.208661,...,0.847907,,,1.109215,1.142383,,0.929211,,0.931096,
3,,,,,1.515907,1.030596,,,,inf,...,,,,1.122592,1.16865,,0.929211,,0.895992,
4,,,,,,1.181017,,inf,,0.04,...,,,,1.084983,,,5.142857,,,
5,,,,,1.118065,1.009731,,,,0.960584,...,,,,1.158052,1.0,,1.011626,,0.729062,
6,,,,,1.0,1.144701,,,,,...,2.139109,,,1.240132,1.472357,,inf,,,


In [22]:
target_grp_df.columns

Index(['captive_population', 'human_population', 'wild_population'], dtype='object')

In [23]:
final_HSS_table = HSS_df_t.merge(target_grp_df,
                      left_index=True,
                      right_index=True).set_index(list(target_grp_df.columns)).transpose()

In [24]:
final_HSS_table.head()

captive_population,douc_USA_captive_clayton,douc_SGP_captive_clayton,chimp_USA_captive_moeller,chimp_USA_captive_campbell,howler_CRI_captive_clayton,gorilla_USA_captive_campbell,douc_VNM_semicaptive_clayton
human_population,human_USA,human_USA,human_USA,human_USA,human_USA,human_USA,human_USA
wild_population,douc_VNM_wild_clayton,douc_VNM_wild_clayton,chimp_TZA_wild_moeller,chimp_DRC_wild_campbell,howler_CRI_wild_clayton,gorilla_DRC_wild_campbell,douc_VNM_wild_clayton
Unassigned_____,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,1.0,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,1.0,,1.731331,1.515907,,1.118065,1.0


Write to file

In [25]:
outdir = join('output', 'results')
makedirs(outdir, exist_ok=True)
final_HSS_table.to_csv(join(outdir, 'hss-yatsunenko.txt'), sep='\t')

## Calculate Microbiota Convergence Score

The MCS is the ratio of [captive vs human] / [wild vs human]

## Filter data list to target comparisons

Update the aggregated pivot table to include the comparison number and comparison direction (captive to human or human to wild) as additional columns

In [26]:
combined_pivot['comparison_num'] = 'None'
combined_pivot['comparison_dir'] = 'None'

In [27]:
combined_pivot.head()

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteriia_o__Acidobacteriales_f__Acidobacteriaceae_g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
0,chimp_DRC_wild_campbell,chimp_DRC_wild_campbell,0.920635,,,,0.2,0.846254,0.333333,1.0,...,,0.632296,1.0,,0.135761,0.646717,0.553968,1.0,,
1,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,1.0,,,0.197619,0.349583,0.962346,0.852381,,...,,0.795701,0.78466,,0.430928,0.705881,0.405233,0.821429,,
2,chimp_DRC_wild_campbell,chimp_USA_captive_campbell,0.928571,,,0.285185,0.562698,0.969989,,,...,,0.873083,0.855688,,1.0,1.0,0.847937,,,
3,chimp_DRC_wild_campbell,chimp_USA_captive_moeller,,,,0.217949,0.561645,0.983341,,0.510417,...,,0.895034,0.896667,,1.0,1.0,0.839608,,,
4,chimp_DRC_wild_campbell,douc_SGP_captive_clayton,,,,,,1.0,,0.5,...,,0.991665,1.0,,,1.0,,,,


In [28]:
target_grp_df

Unnamed: 0,captive_population,human_population,wild_population
0,douc_USA_captive_clayton,human_USA,douc_VNM_wild_clayton
1,douc_SGP_captive_clayton,human_USA,douc_VNM_wild_clayton
2,chimp_USA_captive_moeller,human_USA,chimp_TZA_wild_moeller
3,chimp_USA_captive_campbell,human_USA,chimp_DRC_wild_campbell
4,howler_CRI_captive_clayton,human_USA,howler_CRI_wild_clayton
5,gorilla_USA_captive_campbell,human_USA,gorilla_DRC_wild_campbell
6,douc_VNM_semicaptive_clayton,human_USA,douc_VNM_wild_clayton


In [29]:
for i, row in target_grp_df.iterrows():
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_num'] = i
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_dir'] = 'captive-human'
    combined_pivot.loc[(combined_pivot['Group1'] == row['wild_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_num'] = i
    combined_pivot.loc[(combined_pivot['Group1'] == row['wild_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_dir'] = 'wild-human'

In [30]:
combined_pivot.loc[(combined_pivot['comparison_num'] != 'None'),]

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteriia_o__Acidobacteriales_f__Acidobacteriaceae_g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
12,chimp_DRC_wild_campbell,human_USA,,,,,0.711111,0.997188,,0.55,...,,0.977664,1.0,,1.0,,0.948148,,3,wild-human
25,chimp_TZA_wild_moeller,human_USA,,,1.0,,0.821065,0.996487,,,...,,0.983282,0.99087,,1.0,,1.0,,2,wild-human
38,chimp_USA_captive_campbell,human_USA,,,1.0,,0.852998,0.999667,,,...,,0.980116,1.0,,0.929211,,0.759744,,3,captive-human
51,chimp_USA_captive_moeller,human_USA,,,1.0,,0.826263,0.998939,,0.11875,...,,0.980034,0.996107,,0.929211,,0.894608,,2,captive-human
64,douc_SGP_captive_clayton,human_USA,,,1.0,,,0.993318,,0.1,...,,0.993883,1.0,,,,,,1,captive-human
77,douc_USA_captive_clayton,human_USA,,,,,1.0,0.981047,,,...,,1.0,1.0,,,,0.552083,,0,captive-human
90,douc_VNM_semicaptive_clayton,human_USA,,,1.0,,1.0,1.0,,,...,,0.995633,1.0,,1.0,,,,6,captive-human
103,douc_VNM_wild_clayton,human_USA,,,,,1.0,1.0,,,...,,0.993552,1.0,,1.0,,,,6,wild-human
116,gorilla_DRC_wild_campbell,human_USA,,,,,1.0,0.999783,,1.0,...,,0.985904,1.0,,0.997094,,0.968137,,5,wild-human
129,gorilla_USA_captive_campbell,human_USA,,,1.0,,0.89037,0.99933,,,...,,0.985929,1.0,,0.929331,,0.628571,,5,captive-human


Remove non-target comparisons from the table

In [31]:
combined_pivot_filtered_MCS = combined_pivot.loc[(combined_pivot['comparison_num'] != 'None'),]

In [32]:
combined_pivot_filtered_MCS

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteriia_o__Acidobacteriales_f__Acidobacteriaceae_g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
12,chimp_DRC_wild_campbell,human_USA,,,,,0.711111,0.997188,,0.55,...,,0.977664,1.0,,1.0,,0.948148,,3,wild-human
25,chimp_TZA_wild_moeller,human_USA,,,1.0,,0.821065,0.996487,,,...,,0.983282,0.99087,,1.0,,1.0,,2,wild-human
38,chimp_USA_captive_campbell,human_USA,,,1.0,,0.852998,0.999667,,,...,,0.980116,1.0,,0.929211,,0.759744,,3,captive-human
51,chimp_USA_captive_moeller,human_USA,,,1.0,,0.826263,0.998939,,0.11875,...,,0.980034,0.996107,,0.929211,,0.894608,,2,captive-human
64,douc_SGP_captive_clayton,human_USA,,,1.0,,,0.993318,,0.1,...,,0.993883,1.0,,,,,,1,captive-human
77,douc_USA_captive_clayton,human_USA,,,,,1.0,0.981047,,,...,,1.0,1.0,,,,0.552083,,0,captive-human
90,douc_VNM_semicaptive_clayton,human_USA,,,1.0,,1.0,1.0,,,...,,0.995633,1.0,,1.0,,,,6,captive-human
103,douc_VNM_wild_clayton,human_USA,,,,,1.0,1.0,,,...,,0.993552,1.0,,1.0,,,,6,wild-human
116,gorilla_DRC_wild_campbell,human_USA,,,,,1.0,0.999783,,1.0,...,,0.985904,1.0,,0.997094,,0.968137,,5,wild-human
129,gorilla_USA_captive_campbell,human_USA,,,1.0,,0.89037,0.99933,,,...,,0.985929,1.0,,0.929331,,0.628571,,5,captive-human


## Calculate MCS

First, transpose and set column indexed on the combined filtered dataframe so that taxon is the row index and comparison number / comparison direction are a column MultiIndex

In [33]:
combined_pivot_filtered_MCS

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteriia_o__Acidobacteriales_f__Acidobacteriaceae_g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
12,chimp_DRC_wild_campbell,human_USA,,,,,0.711111,0.997188,,0.55,...,,0.977664,1.0,,1.0,,0.948148,,3,wild-human
25,chimp_TZA_wild_moeller,human_USA,,,1.0,,0.821065,0.996487,,,...,,0.983282,0.99087,,1.0,,1.0,,2,wild-human
38,chimp_USA_captive_campbell,human_USA,,,1.0,,0.852998,0.999667,,,...,,0.980116,1.0,,0.929211,,0.759744,,3,captive-human
51,chimp_USA_captive_moeller,human_USA,,,1.0,,0.826263,0.998939,,0.11875,...,,0.980034,0.996107,,0.929211,,0.894608,,2,captive-human
64,douc_SGP_captive_clayton,human_USA,,,1.0,,,0.993318,,0.1,...,,0.993883,1.0,,,,,,1,captive-human
77,douc_USA_captive_clayton,human_USA,,,,,1.0,0.981047,,,...,,1.0,1.0,,,,0.552083,,0,captive-human
90,douc_VNM_semicaptive_clayton,human_USA,,,1.0,,1.0,1.0,,,...,,0.995633,1.0,,1.0,,,,6,captive-human
103,douc_VNM_wild_clayton,human_USA,,,,,1.0,1.0,,,...,,0.993552,1.0,,1.0,,,,6,wild-human
116,gorilla_DRC_wild_campbell,human_USA,,,,,1.0,0.999783,,1.0,...,,0.985904,1.0,,0.997094,,0.968137,,5,wild-human
129,gorilla_USA_captive_campbell,human_USA,,,1.0,,0.89037,0.99933,,,...,,0.985929,1.0,,0.929331,,0.628571,,5,captive-human


Because we are re-using the Douc-Wild comparisons, we need to manually replicate those rows in the data frame.

In [38]:
douc_0_wild = combined_pivot_filtered_MCS.loc[
        (combined_pivot_filtered_MCS['comparison_num'] == 6) &
        (combined_pivot_filtered_MCS['comparison_dir'] == 'wild-human')].reset_index(drop=True)

In [39]:
douc_0_wild

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteriia_o__Acidobacteriales_f__Acidobacteriaceae_g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
0,douc_VNM_wild_clayton,human_USA,,,,,1.0,1.0,,,...,,0.993552,1.0,,1.0,,,,6,wild-human


In [40]:
douc_0_wild.loc[douc_0_wild.index[0],
                'comparison_num'] = 0

In [41]:
douc_1_wild = combined_pivot_filtered_MCS.loc[
        (combined_pivot_filtered_MCS['comparison_num'] == 6) &
        (combined_pivot_filtered_MCS['comparison_dir'] == 'wild-human')].reset_index(drop=True)

In [42]:
douc_1_wild.loc[douc_0_wild.index[0],
                'comparison_num'] = 1

In [43]:
combined_pivot_filtered_MCS = combined_pivot_filtered_MCS.append([douc_0_wild, douc_1_wild]).reset_index(drop=True)

In [44]:
combined_pivot_filtered_MCS_t = combined_pivot_filtered_MCS.drop(columns=['Group1', 'Group2']).set_index(['comparison_num',
                                                                            'comparison_dir']).transpose().sort_index(axis=1)

In [45]:
combined_pivot_filtered_MCS_t.head()

comparison_num,0,0,1,1,2,2,3,3,4,4,5,5,6,6
comparison_dir,captive-human,wild-human,captive-human,wild-human,captive-human,wild-human,captive-human,wild-human,captive-human,wild-human,captive-human,wild-human,captive-human,wild-human
Bacterial_Genus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
Unassigned_____,,,,,,,,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,1.0,,1.0,1.0,1.0,,,,1.0,,1.0,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,,,,,,,,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,1.0,1.0,,1.0,0.826263,0.821065,0.852998,0.711111,,,0.89037,1.0,1.0,1.0


In [46]:
target_grps = target_grp_df.copy()

In [47]:
target_grps.drop(columns=['human_population', 'wild_population'], 
                inplace=True)

In [48]:
target_grps

Unnamed: 0,captive_population
0,douc_USA_captive_clayton
1,douc_SGP_captive_clayton
2,chimp_USA_captive_moeller
3,chimp_USA_captive_campbell
4,howler_CRI_captive_clayton
5,gorilla_USA_captive_campbell
6,douc_VNM_semicaptive_clayton


In [50]:
MCS_df = pd.DataFrame(index=combined_pivot_filtered_MCS_t.index, 
                      columns=target_grps.index)

for i, row in target_grps.iterrows():
    print(row)
    print(i)
    MCS = combined_pivot_filtered_MCS_t.loc[:,(i,
                     'captive-human')]/combined_pivot_filtered_MCS_t.loc[:,(i,
                                          'wild-human')]
    MCS_df[i] = MCS

captive_population    douc_USA_captive_clayton
Name: 0, dtype: object
0
captive_population    douc_SGP_captive_clayton
Name: 1, dtype: object
1
captive_population    chimp_USA_captive_moeller
Name: 2, dtype: object
2
captive_population    chimp_USA_captive_campbell
Name: 3, dtype: object
3
captive_population    howler_CRI_captive_clayton
Name: 4, dtype: object
4
captive_population    gorilla_USA_captive_campbell
Name: 5, dtype: object
5
captive_population    douc_VNM_semicaptive_clayton
Name: 6, dtype: object
6


In [51]:
MCS_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6
Bacterial_Genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Unassigned_____,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,1.0,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,1.0,,1.006331,1.199529,,0.89037,1.0


Make this pretty; add back in the comparison names

In [52]:
MCS_df_t = MCS_df.transpose()
MCS_df_t

Bacterial_Genus,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteriia_o__Acidobacteriales_f__Acidobacteriaceae_g__,k__Bacteria_p__Acidobacteria_c__[Chloracidobacteria]_o__RB41_f__Ellin6075_g__,k__Bacteria_p__Actinobacteria____,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Anaeroplasmatales_f__Anaeroplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Anaeroplasmatales_f__Anaeroplasmataceae_g__Anaeroplasma,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p___c___o___f___g__
0,,,,,1.0,0.981047,,,,,...,,,,1.006489,1.0,,,,,
1,,,,,,0.993318,,,,,...,,,,1.000333,1.0,,,,,
2,,,1.0,,1.006331,1.002461,,,,0.201355,...,0.744186,,,0.996696,1.005285,,0.929211,,0.894608,
3,,,,,1.199529,1.002486,,,,1.0,...,,,,1.002508,1.0,,0.929211,,0.801293,
4,,,,,,1.0,,1.0,,0.039735,...,,,,1.003621,,,1.0,,,
5,,,,,0.89037,0.999547,,,,0.831858,...,,,,1.000026,1.0,,0.932039,,0.649259,
6,,,,,1.0,1.0,,,,,...,1.0,,,1.002094,1.0,,1.0,,,


In [53]:
target_grp_df.columns

Index(['captive_population', 'human_population', 'wild_population'], dtype='object')

In [54]:
final_MCS_table = MCS_df_t.merge(target_grp_df,
                      left_index=True,
                      right_index=True).set_index(list(target_grp_df.columns)).transpose()

In [55]:
final_MCS_table.head()

captive_population,douc_USA_captive_clayton,douc_SGP_captive_clayton,chimp_USA_captive_moeller,chimp_USA_captive_campbell,howler_CRI_captive_clayton,gorilla_USA_captive_campbell,douc_VNM_semicaptive_clayton
human_population,human_USA,human_USA,human_USA,human_USA,human_USA,human_USA,human_USA
wild_population,douc_VNM_wild_clayton,douc_VNM_wild_clayton,chimp_TZA_wild_moeller,chimp_DRC_wild_campbell,howler_CRI_wild_clayton,gorilla_DRC_wild_campbell,douc_VNM_wild_clayton
Unassigned_____,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,1.0,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,1.0,,1.006331,1.199529,,0.89037,1.0


Write to file

In [56]:
outdir = join('output', 'results')
makedirs(outdir, exist_ok=True)
final_MCS_table.to_csv(join(outdir, 'mcs-yatsunenko.txt'), sep='\t')

# Data set 2: American Gut

In [83]:
group_sig_out_dir = './output/distance-amgut'

## Import data

Get a list of all the files in the output group significance directory

In [84]:
taxa = listdir(group_sig_out_dir)

Iterate over each of the output files, and import the group significance results

In [85]:
# grab tsv files
bact_dfs = []
for d in taxa:
    data_fp = join(group_sig_out_dir,
                   d,
                   'permanova',
                   'raw_data.tsv')
    
    d_df = pd.read_csv(data_fp, 
                       header=0,
                       sep='\t',
                       index_col=0)

    d_df['Bacterial_Genus'] = d
    
    bact_dfs.append(d_df)
    

# load group significance tsvs


Combine these results into a single dataframe.

In [86]:
combined_df = pd.concat(bact_dfs, ignore_index=True)

In [87]:
combined_df.head()

Unnamed: 0,SubjectID1,SubjectID2,Group1,Group2,Distance,Bacterial_Genus
0,100030.13393.SRR8978339,100030.13393.SRR8978285,chimp_DRC_wild_campbell,chimp_DRC_wild_campbell,0.0,k__Bacteria_p___c___o___f___g__
1,100030.13393.SRR8978285,65716.12173.14,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,0.0,k__Bacteria_p___c___o___f___g__
2,100030.13393.SRR8978285,65716.12173.35,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,0.0,k__Bacteria_p___c___o___f___g__
3,100030.13393.SRR8978285,65716.12173.135,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,0.0,k__Bacteria_p___c___o___f___g__
4,100030.13393.SRR8978285,65716.12173.26,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,0.0,k__Bacteria_p___c___o___f___g__


## Aggregate distance values

Using the Pandas pivot table functionality, aggregate the individual dissimilarity comparisons by group using `np.mean`.

In [88]:
combined_pivot = combined_df.pivot_table(values='Distance',
                        index=['Group1', 'Group2'],
                        columns=['Bacterial_Genus'],
                        aggfunc=np.mean)

In [89]:
combined_pivot.reset_index(inplace=True)

In [90]:
combined_pivot.head()


Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f__mb2424_g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p__Verrucomicrobia_c__[Spartobacteria]_o__[Chthoniobacterales]_f__[Chthoniobacteraceae]_g__DA101,k__Bacteria_p__WPS-2_c___o___f___g__,k__Bacteria_p___c___o___f___g__
0,chimp_DRC_wild_campbell,chimp_DRC_wild_campbell,0.952381,,0.4,0.952381,0.222222,0.844795,1.0,,...,1.0,0.626793,1.0,,0.222222,0.684271,,,,0.0
1,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,,,0.211348,,0.507755,0.944939,0.975,,...,0.638889,0.794204,0.779603,,0.507755,0.714956,,,,0.012658
2,chimp_DRC_wild_campbell,chimp_USA_captive_campbell,1.0,,1.0,1.0,0.610979,0.966638,,,...,1.0,0.866128,0.878175,,0.610979,1.0,,,,1.0
3,chimp_DRC_wild_campbell,chimp_USA_captive_moeller,,,1.0,,0.685714,0.985043,,,...,,0.893772,0.95,,0.685714,1.0,,,,
4,chimp_DRC_wild_campbell,douc_SGP_captive_clayton,,,1.0,,,1.0,,,...,,0.991501,1.0,,,1.0,,,,


In [91]:
set(combined_pivot['Group1'])

{'chimp_DRC_wild_campbell',
 'chimp_TZA_wild_moeller',
 'chimp_USA_captive_campbell',
 'chimp_USA_captive_moeller',
 'douc_SGP_captive_clayton',
 'douc_USA_captive_clayton',
 'douc_VNM_semicaptive_clayton',
 'douc_VNM_wild_clayton',
 'gorilla_DRC_wild_campbell',
 'gorilla_USA_captive_campbell',
 'howler_CRI_captive_clayton',
 'howler_CRI_wild_clayton',
 'human_USA'}

In [92]:
set(combined_pivot['Group2'])

{'chimp_DRC_wild_campbell',
 'chimp_TZA_wild_moeller',
 'chimp_USA_captive_campbell',
 'chimp_USA_captive_moeller',
 'douc_SGP_captive_clayton',
 'douc_USA_captive_clayton',
 'douc_VNM_semicaptive_clayton',
 'douc_VNM_wild_clayton',
 'gorilla_DRC_wild_campbell',
 'gorilla_USA_captive_campbell',
 'howler_CRI_captive_clayton',
 'howler_CRI_wild_clayton',
 'human_USA'}

In [93]:
combined_pivot

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f__mb2424_g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p__Verrucomicrobia_c__[Spartobacteria]_o__[Chthoniobacterales]_f__[Chthoniobacteraceae]_g__DA101,k__Bacteria_p__WPS-2_c___o___f___g__,k__Bacteria_p___c___o___f___g__
0,chimp_DRC_wild_campbell,chimp_DRC_wild_campbell,0.952381,,0.400000,0.952381,0.222222,0.844795,1.000,,...,1.000000,0.626793,1.000000,,0.222222,0.684271,,,,0.000000
1,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,,,0.211348,,0.507755,0.944939,0.975,,...,0.638889,0.794204,0.779603,,0.507755,0.714956,,,,0.012658
2,chimp_DRC_wild_campbell,chimp_USA_captive_campbell,1.000000,,1.000000,1.000000,0.610979,0.966638,,,...,1.000000,0.866128,0.878175,,0.610979,1.000000,,,,1.000000
3,chimp_DRC_wild_campbell,chimp_USA_captive_moeller,,,1.000000,,0.685714,0.985043,,,...,,0.893772,0.950000,,0.685714,1.000000,,,,
4,chimp_DRC_wild_campbell,douc_SGP_captive_clayton,,,1.000000,,,1.000000,,,...,,0.991501,1.000000,,,1.000000,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,human_USA,gorilla_DRC_wild_campbell,,,,,1.000000,0.999695,,,...,,0.980359,1.000000,,1.000000,,,,,1.000000
165,human_USA,gorilla_USA_captive_campbell,,,,,1.000000,0.994426,,,...,,0.979166,0.984683,,1.000000,,0.532164,,,
166,human_USA,howler_CRI_captive_clayton,,,,,,1.000000,,,...,,0.999442,,,,,0.532164,,,
167,human_USA,howler_CRI_wild_clayton,,,,,1.000000,1.000000,,,...,,0.994831,,,1.000000,,0.970760,0.666667,,


## Load target group comparisons

This file should list just the specific group comparisons we want to look at.

In [94]:
target_grp_df = pd.read_csv('./data/host_specificity_score_populations_american_gut.txt', sep='\t')

In [95]:
target_grp_df

Unnamed: 0,captive_population,human_population,wild_population
0,douc_USA_captive_clayton,human_USA,douc_VNM_wild_clayton
1,douc_SGP_captive_clayton,human_USA,douc_VNM_wild_clayton
2,chimp_USA_captive_moeller,human_USA,chimp_TZA_wild_moeller
3,chimp_USA_captive_campbell,human_USA,chimp_DRC_wild_campbell
4,howler_CRI_captive_clayton,human_USA,howler_CRI_wild_clayton
5,gorilla_USA_captive_campbell,human_USA,gorilla_DRC_wild_campbell
6,douc_VNM_semicaptive_clayton,human_USA,douc_VNM_wild_clayton


## Filter data list to target comparisons

Update the aggregated pivot table to include the comparison number and comparison direction (captive to human or captive to wild) as additional columns

In [96]:
combined_pivot['comparison_num'] = 'None'
combined_pivot['comparison_dir'] = 'None'

for i, row in target_grp_df.iterrows():
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_num'] = i
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_dir'] = 'human'
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['wild_population']),
                       'comparison_num'] = i
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['wild_population']),
                       'comparison_dir'] = 'wild'

Remove non-target comparisons from the table

In [97]:
combined_pivot_filtered = combined_pivot.loc[(combined_pivot['comparison_num'] != 'None'),]

In [98]:
combined_pivot_filtered

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f__mb2424_g__,...,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p__Verrucomicrobia_c__[Spartobacteria]_o__[Chthoniobacterales]_f__[Chthoniobacteraceae]_g__DA101,k__Bacteria_p__WPS-2_c___o___f___g__,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
26,chimp_USA_captive_campbell,chimp_DRC_wild_campbell,1.0,,1.0,1.0,0.610979,0.966638,,,...,0.878175,,0.610979,1.0,,,,1.0,3,wild
38,chimp_USA_captive_campbell,human_USA,,,,,1.0,0.998805,,,...,0.980938,,1.0,,0.298246,,,1.0,3,human
40,chimp_USA_captive_moeller,chimp_TZA_wild_moeller,,,1.0,,0.513061,0.940518,,,...,0.921071,,0.513061,0.926907,0.666667,,0.210526,,2,wild
51,chimp_USA_captive_moeller,human_USA,,,,,1.0,0.998232,,,...,0.965,,1.0,,0.376218,,,,2,human
59,douc_SGP_captive_clayton,douc_VNM_wild_clayton,,,,,,0.966596,,,...,0.690848,,,,0.862233,,,,1,wild
64,douc_SGP_captive_clayton,human_USA,,,,,,0.998618,,,...,1.0,1.0,,,0.401949,,,,1,human
72,douc_USA_captive_clayton,douc_VNM_wild_clayton,,,,,1.0,1.0,,,...,1.0,,1.0,,0.897547,,,,0,wild
77,douc_USA_captive_clayton,human_USA,,,,,1.0,0.986176,,,...,1.0,,1.0,,0.364411,,,,0,human
85,douc_VNM_semicaptive_clayton,douc_VNM_wild_clayton,,,,,1.0,0.905775,,,...,0.730686,,1.0,,0.321829,,,,6,wild
90,douc_VNM_semicaptive_clayton,human_USA,,,,,1.0,1.0,,,...,1.0,,1.0,,0.961014,,,,6,human


## Calculate HSS

First, transpose and set column indexed on the combined filtered dataframe so that taxon is the row index and comparison number / comparison direction are a column MultiIndex

In [99]:
combined_pivot_filtered_t = combined_pivot_filtered.drop(columns=['Group1', 'Group2']).set_index(['comparison_num',
                                                                            'comparison_dir']).transpose()

In [100]:
combined_pivot_filtered_t.head()

comparison_num,3,3,2,2,1,1,0,0,6,6,5,5,4,4
comparison_dir,wild,human,wild,human,wild,human,wild,human,wild,human,wild,human,wild,human
Bacterial_Genus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
Unassigned_____,1.0,,,,,,,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,1.0,,1.0,,,,,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,1.0,,,,,,,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,0.610979,1.0,0.513061,1.0,,,1.0,1.0,1.0,1.0,0.840678,1.0,,


In [101]:
target_grps = target_grp_df.copy()

In [102]:
target_grps.drop(columns=['human_population', 'wild_population'], 
                inplace=True)

In [103]:
HSS_df = pd.DataFrame(index=combined_pivot_filtered_t.index, 
                      columns=target_grps.index)

for i, row in target_grps.iterrows():
    HSS = combined_pivot_filtered_t.loc[:,(i,
                     'human')]/combined_pivot_filtered_t.loc[:,(i,
                                          'wild')]
    HSS_df[i] = HSS

In [104]:
HSS_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6
Bacterial_Genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Unassigned_____,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,1.0,,1.949084,1.636718,,1.189516,1.0


Make this pretty; add back in the comparison names

In [105]:
HSS_df_t = HSS_df.transpose()
HSS_df_t

Bacterial_Genus,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f__mb2424_g__,k__Bacteria_p__Actinobacteria____,k__Bacteria_p__Actinobacteria_c__Acidimicrobiia_o__Acidimicrobiales_f___g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p__Verrucomicrobia_c__[Spartobacteria]_o__[Chthoniobacterales]_f__[Chthoniobacteraceae]_g__DA101,k__Bacteria_p__WPS-2_c___o___f___g__,k__Bacteria_p___c___o___f___g__
0,,,,,1.0,0.986176,,,,,...,,1.001087,1.0,,1.0,,0.406008,,,
1,,,,,,1.033128,,,,,...,,1.034226,1.447496,,,,0.466172,,,
2,,,,,1.949084,1.061364,,,,,...,,1.098157,1.047693,,1.949084,,0.564327,,,
3,,,,,1.636718,1.033277,,,,,...,,1.126103,1.11702,,1.636718,,,,,1.0
4,,,,,,1.197175,,,,,...,,1.072728,,,,,0.746411,,,
5,,,,,1.189516,1.002035,,,,,...,,1.152631,0.984683,,1.189516,,,,,
6,,,,,1.0,1.104027,,,,,...,,1.250603,1.368577,,1.0,,2.986096,,,


In [106]:
target_grp_df.columns

Index(['captive_population', 'human_population', 'wild_population'], dtype='object')

In [107]:
final_HSS_table = HSS_df_t.merge(target_grp_df,
                      left_index=True,
                      right_index=True).set_index(list(target_grp_df.columns)).transpose()

In [108]:
final_HSS_table.head()

captive_population,douc_USA_captive_clayton,douc_SGP_captive_clayton,chimp_USA_captive_moeller,chimp_USA_captive_campbell,howler_CRI_captive_clayton,gorilla_USA_captive_campbell,douc_VNM_semicaptive_clayton
human_population,human_USA,human_USA,human_USA,human_USA,human_USA,human_USA,human_USA
wild_population,douc_VNM_wild_clayton,douc_VNM_wild_clayton,chimp_TZA_wild_moeller,chimp_DRC_wild_campbell,howler_CRI_wild_clayton,gorilla_DRC_wild_campbell,douc_VNM_wild_clayton
Unassigned_____,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,1.0,,1.949084,1.636718,,1.189516,1.0


Write to file

In [109]:
outdir = join('output', 'results')
makedirs(outdir, exist_ok=True)
final_HSS_table.to_csv(join(outdir, 'hss-amgut.txt'), sep='\t')

## Calculate Microbiota Convergence Score

The MCS is the ratio of [captive vs human] / [wild vs human]

## Filter data list to target comparisons

Update the aggregated pivot table to include the comparison number and comparison direction (captive to human or human to wild) as additional columns

In [110]:
combined_pivot['comparison_num'] = 'None'
combined_pivot['comparison_dir'] = 'None'

In [111]:
combined_pivot.head()

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f__mb2424_g__,...,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p__Verrucomicrobia_c__[Spartobacteria]_o__[Chthoniobacterales]_f__[Chthoniobacteraceae]_g__DA101,k__Bacteria_p__WPS-2_c___o___f___g__,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
0,chimp_DRC_wild_campbell,chimp_DRC_wild_campbell,0.952381,,0.4,0.952381,0.222222,0.844795,1.0,,...,1.0,,0.222222,0.684271,,,,0.0,,
1,chimp_DRC_wild_campbell,chimp_TZA_wild_moeller,,,0.211348,,0.507755,0.944939,0.975,,...,0.779603,,0.507755,0.714956,,,,0.012658,,
2,chimp_DRC_wild_campbell,chimp_USA_captive_campbell,1.0,,1.0,1.0,0.610979,0.966638,,,...,0.878175,,0.610979,1.0,,,,1.0,,
3,chimp_DRC_wild_campbell,chimp_USA_captive_moeller,,,1.0,,0.685714,0.985043,,,...,0.95,,0.685714,1.0,,,,,,
4,chimp_DRC_wild_campbell,douc_SGP_captive_clayton,,,1.0,,,1.0,,,...,1.0,,,1.0,,,,,,


In [112]:
target_grp_df

Unnamed: 0,captive_population,human_population,wild_population
0,douc_USA_captive_clayton,human_USA,douc_VNM_wild_clayton
1,douc_SGP_captive_clayton,human_USA,douc_VNM_wild_clayton
2,chimp_USA_captive_moeller,human_USA,chimp_TZA_wild_moeller
3,chimp_USA_captive_campbell,human_USA,chimp_DRC_wild_campbell
4,howler_CRI_captive_clayton,human_USA,howler_CRI_wild_clayton
5,gorilla_USA_captive_campbell,human_USA,gorilla_DRC_wild_campbell
6,douc_VNM_semicaptive_clayton,human_USA,douc_VNM_wild_clayton


In [113]:
for i, row in target_grp_df.iterrows():
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_num'] = i
    combined_pivot.loc[(combined_pivot['Group1'] == row['captive_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_dir'] = 'captive-human'
    combined_pivot.loc[(combined_pivot['Group1'] == row['wild_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_num'] = i
    combined_pivot.loc[(combined_pivot['Group1'] == row['wild_population']) &
                       (combined_pivot['Group2'] == row['human_population']),
                       'comparison_dir'] = 'wild-human'

In [114]:
combined_pivot.loc[(combined_pivot['comparison_num'] != 'None'),]

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f__mb2424_g__,...,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p__Verrucomicrobia_c__[Spartobacteria]_o__[Chthoniobacterales]_f__[Chthoniobacteraceae]_g__DA101,k__Bacteria_p__WPS-2_c___o___f___g__,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
12,chimp_DRC_wild_campbell,human_USA,,,,,1.0,1.0,,,...,1.0,,1.0,,,,,1.0,3,wild-human
25,chimp_TZA_wild_moeller,human_USA,,,,,1.0,0.996629,,1.0,...,0.979537,,1.0,,0.589912,1.0,,1.0,2,wild-human
38,chimp_USA_captive_campbell,human_USA,,,,,1.0,0.998805,,,...,0.980938,,1.0,,0.298246,,,1.0,3,captive-human
51,chimp_USA_captive_moeller,human_USA,,,,,1.0,0.998232,,,...,0.965,,1.0,,0.376218,,,,2,captive-human
64,douc_SGP_captive_clayton,human_USA,,,,,,0.998618,,,...,1.0,1.0,,,0.401949,,,,1,captive-human
77,douc_USA_captive_clayton,human_USA,,,,,1.0,0.986176,,,...,1.0,,1.0,,0.364411,,,,0,captive-human
90,douc_VNM_semicaptive_clayton,human_USA,,,,,1.0,1.0,,,...,1.0,,1.0,,0.961014,,,,6,captive-human
103,douc_VNM_wild_clayton,human_USA,,,,,1.0,1.0,,1.0,...,0.992835,,1.0,,1.0,,,,6,wild-human
116,gorilla_DRC_wild_campbell,human_USA,,,,,1.0,0.999695,,,...,1.0,,1.0,,,,,1.0,5,wild-human
129,gorilla_USA_captive_campbell,human_USA,,,,,1.0,0.994426,,,...,0.984683,,1.0,,0.532164,,,,5,captive-human


Remove non-target comparisons from the table

In [115]:
combined_pivot_filtered_MCS = combined_pivot.loc[(combined_pivot['comparison_num'] != 'None'),]

In [116]:
combined_pivot_filtered_MCS

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f__mb2424_g__,...,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p__Verrucomicrobia_c__[Spartobacteria]_o__[Chthoniobacterales]_f__[Chthoniobacteraceae]_g__DA101,k__Bacteria_p__WPS-2_c___o___f___g__,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
12,chimp_DRC_wild_campbell,human_USA,,,,,1.0,1.0,,,...,1.0,,1.0,,,,,1.0,3,wild-human
25,chimp_TZA_wild_moeller,human_USA,,,,,1.0,0.996629,,1.0,...,0.979537,,1.0,,0.589912,1.0,,1.0,2,wild-human
38,chimp_USA_captive_campbell,human_USA,,,,,1.0,0.998805,,,...,0.980938,,1.0,,0.298246,,,1.0,3,captive-human
51,chimp_USA_captive_moeller,human_USA,,,,,1.0,0.998232,,,...,0.965,,1.0,,0.376218,,,,2,captive-human
64,douc_SGP_captive_clayton,human_USA,,,,,,0.998618,,,...,1.0,1.0,,,0.401949,,,,1,captive-human
77,douc_USA_captive_clayton,human_USA,,,,,1.0,0.986176,,,...,1.0,,1.0,,0.364411,,,,0,captive-human
90,douc_VNM_semicaptive_clayton,human_USA,,,,,1.0,1.0,,,...,1.0,,1.0,,0.961014,,,,6,captive-human
103,douc_VNM_wild_clayton,human_USA,,,,,1.0,1.0,,1.0,...,0.992835,,1.0,,1.0,,,,6,wild-human
116,gorilla_DRC_wild_campbell,human_USA,,,,,1.0,0.999695,,,...,1.0,,1.0,,,,,1.0,5,wild-human
129,gorilla_USA_captive_campbell,human_USA,,,,,1.0,0.994426,,,...,0.984683,,1.0,,0.532164,,,,5,captive-human


## Calculate MCS

First, transpose and set column indexed on the combined filtered dataframe so that taxon is the row index and comparison number / comparison direction are a column MultiIndex

In [117]:
combined_pivot_filtered_MCS

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f__mb2424_g__,...,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p__Verrucomicrobia_c__[Spartobacteria]_o__[Chthoniobacterales]_f__[Chthoniobacteraceae]_g__DA101,k__Bacteria_p__WPS-2_c___o___f___g__,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
12,chimp_DRC_wild_campbell,human_USA,,,,,1.0,1.0,,,...,1.0,,1.0,,,,,1.0,3,wild-human
25,chimp_TZA_wild_moeller,human_USA,,,,,1.0,0.996629,,1.0,...,0.979537,,1.0,,0.589912,1.0,,1.0,2,wild-human
38,chimp_USA_captive_campbell,human_USA,,,,,1.0,0.998805,,,...,0.980938,,1.0,,0.298246,,,1.0,3,captive-human
51,chimp_USA_captive_moeller,human_USA,,,,,1.0,0.998232,,,...,0.965,,1.0,,0.376218,,,,2,captive-human
64,douc_SGP_captive_clayton,human_USA,,,,,,0.998618,,,...,1.0,1.0,,,0.401949,,,,1,captive-human
77,douc_USA_captive_clayton,human_USA,,,,,1.0,0.986176,,,...,1.0,,1.0,,0.364411,,,,0,captive-human
90,douc_VNM_semicaptive_clayton,human_USA,,,,,1.0,1.0,,,...,1.0,,1.0,,0.961014,,,,6,captive-human
103,douc_VNM_wild_clayton,human_USA,,,,,1.0,1.0,,1.0,...,0.992835,,1.0,,1.0,,,,6,wild-human
116,gorilla_DRC_wild_campbell,human_USA,,,,,1.0,0.999695,,,...,1.0,,1.0,,,,,1.0,5,wild-human
129,gorilla_USA_captive_campbell,human_USA,,,,,1.0,0.994426,,,...,0.984683,,1.0,,0.532164,,,,5,captive-human


Because we are re-using the Douc-Wild comparisons, we need to manually replicate those rows in the data frame.

In [118]:
douc_0_wild = combined_pivot_filtered_MCS.loc[
        (combined_pivot_filtered_MCS['comparison_num'] == 6) &
        (combined_pivot_filtered_MCS['comparison_dir'] == 'wild-human')].reset_index(drop=True)

In [119]:
douc_0_wild

Bacterial_Genus,Group1,Group2,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f__mb2424_g__,...,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p__Verrucomicrobia_c__[Spartobacteria]_o__[Chthoniobacterales]_f__[Chthoniobacteraceae]_g__DA101,k__Bacteria_p__WPS-2_c___o___f___g__,k__Bacteria_p___c___o___f___g__,comparison_num,comparison_dir
0,douc_VNM_wild_clayton,human_USA,,,,,1.0,1.0,,1.0,...,0.992835,,1.0,,1.0,,,,6,wild-human


In [120]:
douc_0_wild.loc[douc_0_wild.index[0],
                'comparison_num'] = 0

In [121]:
douc_1_wild = combined_pivot_filtered_MCS.loc[
        (combined_pivot_filtered_MCS['comparison_num'] == 6) &
        (combined_pivot_filtered_MCS['comparison_dir'] == 'wild-human')].reset_index(drop=True)

In [122]:
douc_1_wild.loc[douc_0_wild.index[0],
                'comparison_num'] = 1

In [123]:
combined_pivot_filtered_MCS = combined_pivot_filtered_MCS.append([douc_0_wild, douc_1_wild]).reset_index(drop=True)

In [124]:
combined_pivot_filtered_MCS_t = combined_pivot_filtered_MCS.drop(columns=['Group1', 'Group2']).set_index(['comparison_num',
                                                                            'comparison_dir']).transpose().sort_index(axis=1)

In [125]:
combined_pivot_filtered_MCS_t.head()

comparison_num,0,0,1,1,2,2,3,3,4,4,5,5,6,6
comparison_dir,captive-human,wild-human,captive-human,wild-human,captive-human,wild-human,captive-human,wild-human,captive-human,wild-human,captive-human,wild-human,captive-human,wild-human
Bacterial_Genus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
Unassigned_____,,,,,,,,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,,,,,,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,,,,,,,,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0


In [126]:
target_grps = target_grp_df.copy()

In [127]:
target_grps.drop(columns=['human_population', 'wild_population'], 
                inplace=True)

In [128]:
target_grps

Unnamed: 0,captive_population
0,douc_USA_captive_clayton
1,douc_SGP_captive_clayton
2,chimp_USA_captive_moeller
3,chimp_USA_captive_campbell
4,howler_CRI_captive_clayton
5,gorilla_USA_captive_campbell
6,douc_VNM_semicaptive_clayton


In [129]:
MCS_df = pd.DataFrame(index=combined_pivot_filtered_MCS_t.index, 
                      columns=target_grps.index)

for i, row in target_grps.iterrows():
    print(row)
    print(i)
    MCS = combined_pivot_filtered_MCS_t.loc[:,(i,
                     'captive-human')]/combined_pivot_filtered_MCS_t.loc[:,(i,
                                          'wild-human')]
    MCS_df[i] = MCS

captive_population    douc_USA_captive_clayton
Name: 0, dtype: object
0
captive_population    douc_SGP_captive_clayton
Name: 1, dtype: object
1
captive_population    chimp_USA_captive_moeller
Name: 2, dtype: object
2
captive_population    chimp_USA_captive_campbell
Name: 3, dtype: object
3
captive_population    howler_CRI_captive_clayton
Name: 4, dtype: object
4
captive_population    gorilla_USA_captive_campbell
Name: 5, dtype: object
5
captive_population    douc_VNM_semicaptive_clayton
Name: 6, dtype: object
6


In [130]:
MCS_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6
Bacterial_Genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Unassigned_____,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,1.0,,1.0,1.0,,1.0,1.0


Make this pretty; add back in the comparison names

In [131]:
MCS_df_t = MCS_df.transpose()
MCS_df_t

Bacterial_Genus,Unassigned_____,k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,k__Bacteria_____,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f___g__,k__Bacteria_p__Acidobacteria_c__Acidobacteria-6_o__iii1-15_f__mb2424_g__,k__Bacteria_p__Actinobacteria____,k__Bacteria_p__Actinobacteria_c__Acidimicrobiia_o__Acidimicrobiales_f___g__,...,k__Bacteria_p__Tenericutes_c__Mollicutes_o__Mycoplasmatales_f__Mycoplasmataceae_g__,k__Bacteria_p__Tenericutes_c__Mollicutes_o__RF39_f___g__,k__Bacteria_p__Tenericutes_c__RF3_o__ML615J-28_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__HA64_f___g__,k__Bacteria_p__Verrucomicrobia_c__Opitutae_o__[Cerasicoccales]_f__[Cerasicoccaceae]_g__,k__Bacteria_p__Verrucomicrobia_c__Verruco-5_o__WCHB1-41_f__RFP12_g__,k__Bacteria_p__Verrucomicrobia_c__Verrucomicrobiae_o__Verrucomicrobiales_f__Verrucomicrobiaceae_g__Akkermansia,k__Bacteria_p__Verrucomicrobia_c__[Spartobacteria]_o__[Chthoniobacterales]_f__[Chthoniobacteraceae]_g__DA101,k__Bacteria_p__WPS-2_c___o___f___g__,k__Bacteria_p___c___o___f___g__
0,,,,,1.0,0.986176,,,,,...,,1.006445,1.007217,,1.0,,0.364411,,,
1,,,,,,0.998618,,,,,...,,1.002555,1.007217,,,,0.401949,,,
2,,,,,1.0,1.001608,,,,,...,,1.006581,0.98516,,1.0,,0.637753,,,
3,,,,,1.0,0.998805,,,,,...,,1.013626,0.980938,,1.0,,,,,1.0
4,,,,,,1.0,,,,,...,,1.004635,,,,,0.548193,,,
5,,,,,1.0,0.994729,,,,,...,,0.998782,0.984683,,1.0,,,,,
6,,,,,1.0,1.0,,,,,...,,1.002335,1.007217,,1.0,,0.961014,,,


In [132]:
target_grp_df.columns

Index(['captive_population', 'human_population', 'wild_population'], dtype='object')

In [133]:
final_MCS_table = MCS_df_t.merge(target_grp_df,
                      left_index=True,
                      right_index=True).set_index(list(target_grp_df.columns)).transpose()

In [134]:
final_MCS_table.head()

captive_population,douc_USA_captive_clayton,douc_SGP_captive_clayton,chimp_USA_captive_moeller,chimp_USA_captive_campbell,howler_CRI_captive_clayton,gorilla_USA_captive_campbell,douc_VNM_semicaptive_clayton
human_population,human_USA,human_USA,human_USA,human_USA,human_USA,human_USA,human_USA
wild_population,douc_VNM_wild_clayton,douc_VNM_wild_clayton,chimp_TZA_wild_moeller,chimp_DRC_wild_campbell,howler_CRI_wild_clayton,gorilla_DRC_wild_campbell,douc_VNM_wild_clayton
Unassigned_____,,,,,,,
k__Archaea_p__Crenarchaeota_c__Thaumarchaeota_o__Nitrososphaerales_f__Nitrososphaeraceae_g__CandidatusNitrososphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanobrevibacter,,,,,,,
k__Archaea_p__Euryarchaeota_c__Methanobacteria_o__Methanobacteriales_f__Methanobacteriaceae_g__Methanosphaera,,,,,,,
k__Archaea_p__Euryarchaeota_c__Thermoplasmata_o__E2_f__[Methanomassiliicoccaceae]_g__vadinCA11,1.0,,1.0,1.0,,1.0,1.0


Write to file

In [135]:
outdir = join('output', 'results')
makedirs(outdir, exist_ok=True)
final_MCS_table.to_csv(join(outdir, 'mcs-amgut.txt'), sep='\t')