In [1]:
import pandas as pd
import numpy as np
from src.util import clr

In [2]:
# Load the metabolite table first: its column labels are the reference
# ordering that the microbe table is aligned to below.
metabolites = pd.read_csv(
    './data/metabolites.from_biom.tsv', sep='\t', index_col=0
)

# Load the microbe table, discard the "9hr_late" column, and arrange the
# remaining columns in the same order as the metabolite columns.
# NOTE(review): reindex fills any metabolite column that is absent from the
# microbe table with NaN -- confirm both tables share the same column labels.
microbes = (
    pd.read_csv('./data/microbes.from_biom.tsv', sep='\t', index_col=0)
    .drop(columns="9hr_late")
    .reindex(columns=metabolites.columns)
)

In [3]:
# Report the smallest non-zero entry of each table
# (microbes first, then metabolites).
for table in (microbes, metabolites):
    entries = table.values
    print(entries[entries != 0].min())

9.0
1.0


In [4]:
# Rescale each table by its grand mean, i.e. the mean taken over every
# cell of the table rather than per row or per column.
microbes_mean = microbes.values.mean()
metabolites_mean = metabolites.values.mean()

microbes_sc = microbes.div(microbes_mean)
metabolites_sc = metabolites.div(metabolites_mean)

In [5]:
def _zeros_to_min(table):
    """Return a new frame in which every 0 is replaced by the smallest
    non-zero entry found anywhere in ``table``."""
    entries = table.values
    return table.replace(0, entries[entries != 0].min())


# Minimum-value zero replacement for the raw tables ...
microbes_min = _zeros_to_min(microbes)
metabolites_min = _zeros_to_min(metabolites)

# ... and for the mean-scaled tables.
microbes_sc_min = _zeros_to_min(microbes_sc)
metabolites_sc_min = _zeros_to_min(metabolites_sc)

In [6]:
# Replace all zeros with an epsilon value: np.nextafter(0, 1), the next
# representable float after 0 in the direction of 1.
# The four frames are modified in place, so this must run only after the
# minimum-based variants have been built: once the zeros are overwritten
# here, the smallest non-zero value of these frames becomes epsilon itself.
epsilon = np.nextafter(0, 1)
for table in (microbes, metabolites, microbes_sc, metabolites_sc):
    table.replace(0, epsilon, inplace=True)

In [7]:
# Stack each microbe table on top of its matching metabolite table
# (row-wise concatenation), one stacked frame per zero-handling variant.
concat, concat_min, concat_sc, concat_sc_min = (
    pd.concat([microbe_table, metabolite_table])
    for microbe_table, metabolite_table in (
        (microbes, metabolites),
        (microbes_min, metabolites_min),
        (microbes_sc, metabolites_sc),
        (microbes_sc_min, metabolites_sc_min),
    )
)

# Apply the clr transformation (imported from src.util) to every stacked
# frame, in the same order as above.
clr_concat, clr_min, clr_sc, clr_sc_min = map(
    clr, (concat, concat_min, concat_sc, concat_sc_min)
)

In [8]:
# Split every transformed frame back into its microbe and metabolite rows
# by selecting on the row labels of the tables that were stacked.
# NOTE(review): this assumes microbe and metabolite row labels do not
# overlap; a shared label would be selected into both halves -- confirm.
clr_microbes, clr_metabolites = (
    clr_concat.loc[part.index] for part in (microbes, metabolites)
)
clr_microbes_min, clr_metabolites_min = (
    clr_min.loc[part.index] for part in (microbes_min, metabolites_min)
)
clr_microbes_sc, clr_metabolites_sc = (
    clr_sc.loc[part.index] for part in (microbes_sc, metabolites_sc)
)
clr_microbes_sc_min, clr_metabolites_sc_min = (
    clr_sc_min.loc[part.index] for part in (microbes_sc_min, metabolites_sc_min)
)

In [9]:
# Write every transformed table to its own tab-separated file, keeping the
# row index as the first column. Files are written in the order listed.
outputs = [
    ('./data/clr_microbes.tsv', clr_microbes),
    ('./data/clr_metabolites.tsv', clr_metabolites),
    ('./data/clr_microbes_min.tsv', clr_microbes_min),
    ('./data/clr_metabolites_min.tsv', clr_metabolites_min),
    ('./data/clr_microbes_sc.tsv', clr_microbes_sc),
    ('./data/clr_metabolites_sc.tsv', clr_metabolites_sc),
    ('./data/clr_microbes_sc_min.tsv', clr_microbes_sc_min),
    ('./data/clr_metabolites_sc_min.tsv', clr_metabolites_sc_min),
]
for path, frame in outputs:
    frame.to_csv(path, sep='\t', index=True)