# Run RPCA with Gemelli

-  Run datasets individually (no shared ASVs between datasets)

## Import Libraries

In [3]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import glob
import os

#current directory
print('current directory:',os.getcwd())

#For illustrator import:
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

results_directory = '../RPCA/'

current directory: /Users/kpitz/github/GLOMICON/intercomparison/Merged_Datasets/scripts


### File Locations

In [4]:
prefix = 'GLOMICON'

#Data Directory
directory = '../data/'

#Directory for saving Figures
plot_dir = '../figures/RPCA/'
print(plot_dir)

../figures/RPCA/


### Functions

In [17]:
# Dada2 Banzai Output Functions
levels = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

def make_metadata(infile):
    df = pd.read_csv(infile,index_col=0)
    df.index.names = ['sample_name']
    #df['date'] = pd.to_datetime(df['date'])
    print('Number samples:', len(df.index))
    return df

def make_taxa_asv(infile):
    df = pd.read_csv(infile)
    #df = df.drop('Unnamed: 0', axis=0)
    #df = df.rename(columns= {'Unnamed: 0':'ASV'})
    #df.set_index('ASV', inplace=True)
    print('Number ASVs:', len(df.index))
    return df

#from metadata file, limit OTU table and taxa table to those present in those samples
def from_metadata_to_taxareads(meta_data, otu_table, taxa_table):
    #standard M6 output; sample_names as index; OTUs as index
    cols = list(meta_data)
    otu_lim = pd.concat([meta_data, otu_table.T],join='inner', axis=1)
    otu_lim.drop(cols, inplace=True, axis=1)
    otu_lim=otu_lim.T
    otu_lim['Total']=otu_lim.sum(axis=1)
    otu_lim = otu_lim.loc[otu_lim['Total']>0]
    otu_lim.drop('Total', axis=1, inplace=True)
    cols=list(otu_lim)
    taxa_lim=pd.concat([otu_lim, taxa_table], axis=1, join='inner')
    taxa_lim.drop(cols, inplace=True, axis=1)
    return otu_lim, taxa_lim

# AWI

In [65]:
Analizing_Institute = 'AWI'

## Load Data

In [66]:
marker = '18S'

print('#####' + marker + '#####')

# otu table
file = prefix +'_asv_merged.csv'
print(directory+file)
df = make_taxa_asv(directory+file)
df.set_index('ASV', inplace=True)
otu_all = df.copy()

# taxa table
file = prefix +'_taxa_merged_updated.csv'
print(directory+file)
df = make_taxa_asv(directory+file)
df = df.drop('Unnamed: 0', axis=1)
df.set_index('ASV', inplace=True)
taxa_all = df.copy()

# metadata
file = prefix +'_meta_merged.csv'
print(directory+file)
df = make_metadata(directory+file)
meta_all = df.copy()

# sequence table
file = prefix +'_seq_merged.csv'
print(directory+file)
df = make_taxa_asv(directory+file)
df.set_index('ASV', inplace=True)
seq_all = df.copy()

seq_all.head()

#####18S#####
../data/GLOMICON_asv_merged.csv
Number ASVs: 11009
../data/GLOMICON_taxa_merged_updated.csv
Number ASVs: 11009
../data/GLOMICON_meta_merged.csv
Number samples: 117
../data/GLOMICON_seq_merged.csv
Number ASVs: 11009


Unnamed: 0_level_0,sequence,Analyzing_Institute
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1
ASV_1,TAGCGTATATTTAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGGATT...,AWI
ASV_2,TAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTCGGATT...,AWI
ASV_3,TAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGGATT...,AWI
ASV_4,GCACCTACCGATTGAATGGTCCGGTGAAGACTCGGGATTGTGGTCT...,MBARI
ASV_5,TAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGAATT...,AWI


In [67]:
print(list(taxa_all))
taxa_all.head()

['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']


Unnamed: 0_level_0,Kingdom,Phylum,Class,Order,Family,Genus,Species
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ASV_1,Viridiplantae,Chlorophyta,Mamiellophyceae,Mamiellales,Mamiellaceae,Micromonas,Micromonas polaris
ASV_1000,,,Bigyra,Thraustochytrida,Thraustochytriaceae,Aplanochytrium,
ASV_10000,,Haptophyta,,Phaeocystales,Phaeocystaceae,Phaeocystis,Phaeocystis pouchetii
ASV_10001,,Haptophyta,,Isochrysidales,Isochrysidaceae,Isochrysis,
ASV_10002,,Bacillariophyta,Coscinodiscophyceae,Thalassiosirales,Thalassiosiraceae,Thalassiosira,


In [68]:
meta_all.head()

Unnamed: 0_level_0,Analyzing_Institute,Collecting_Institute
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Davenport_02_0008,AWI,MBARI
Davenport_06_0008,AWI,MBARI
Davenport_11_0008,AWI,MBARI
Davenport_15_0008,AWI,MBARI
Davenport_19_0008,AWI,MBARI


In [69]:
otu_all.head()

Unnamed: 0_level_0,Davenport_02_0008,Davenport_06_0008,Davenport_11_0008,Davenport_15_0008,Davenport_19_0008,Framstrait_01_0008,Framstrait_05_0008,Framstrait_09_0008,Framstrait_13_0008,Framstrait_17_0008,...,UDalhousie2_AO,UDalhousie8_AO,UDalhousie14_AO,UDalhousie20_AO,UDalhousie26_AO,AWIMOCKEVEN1_AO,AWIMOCKEVEN2_AO,AWIMOCKEVEN3_AO,AWIMOCKEVEN4_AO,AWIMOCKEVEN5_AO
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ASV_1,0.0,0.0,5.0,7.0,16.0,8611.0,7354.0,2066.0,4845.0,4061.0,...,,,,,,,,,,
ASV_2,9.0,4.0,5.0,12.0,13.0,1277.0,1243.0,182.0,327.0,397.0,...,,,,,,,,,,
ASV_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
ASV_4,,,,,,,,,,,...,12.0,16.0,4.0,1.0,10.0,8401.0,9365.0,8999.0,8516.0,9445.0
ASV_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


## Limit data by metadata parameters

- Run each Analyzing Institute separately

In [70]:
df = meta_all.copy()
#print(df['depth'].max())
df = df.loc[df['Analyzing_Institute'] == Analizing_Institute]
meta_lim = df.copy()
df.head()

Unnamed: 0_level_0,Analyzing_Institute,Collecting_Institute
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Davenport_02_0008,AWI,MBARI
Davenport_06_0008,AWI,MBARI
Davenport_11_0008,AWI,MBARI
Davenport_15_0008,AWI,MBARI
Davenport_19_0008,AWI,MBARI


In [71]:
otu_lim, taxa_lim = from_metadata_to_taxareads(meta_lim, otu_all, taxa_all)

## Create Biom File

In [72]:
# create limited biom file

# asv table
filename = results_directory + "Qiime2_asv.tsv"
#check filename
print(filename)

df = otu_lim.copy()
df.index.names = ['#OTUID']
df.to_csv(filename,sep='\t')

# taxa table
filename = results_directory + "Qiime2_taxa.tsv"
#check filename
print(filename)

df = taxa_lim.copy()
df.index.names = ['#OTUID']
df.to_csv(filename,sep='\t')

# metadata table
filename = results_directory + "Qiime2_meta.tsv"
#check filename
print(filename)
df = meta_lim.copy()
df.index.names = ['#SampleID']
df.to_csv(filename,sep='\t')


../RPCA/Qiime2_asv.tsv
../RPCA/Qiime2_taxa.tsv
../RPCA/Qiime2_meta.tsv


## Make BIOM file

 - biom commands in python seem buggy - run in bash for now.
 - EDIT FILE PATH BELOW TO BE CORRECT DIRECTORY

In [73]:
%%bash
cd /Users/kpitz/github/GLOMICON/intercomparison/Merged_Datasets/RPCA/
pwd
# run in correct conda environment (gemelli)
#Make biom file
conda run -n gemelli biom convert -i Qiime2_asv.tsv -o table.from_txt_json.biom --table-type="OTU table" --to-json
#add metadata files to biom file - change the merged_tax_table_for_biomm.txt and _merged_for_biom.txt files
conda run -n gemelli biom add-metadata -i table.from_txt_json.biom -o table.w_md.biom --observation-metadata-fp Qiime2_taxa.tsv --sample-metadata-fp Qiime2_meta.tsv

/Users/kpitz/github/GLOMICON/intercomparison/Merged_Datasets/RPCA


## Run RPCA

In [74]:
from biom import load_table
from gemelli.rpca import rpca
# import the data table
table = load_table('/Users/kpitz/github/GLOMICON/intercomparison/Merged_Datasets/RPCA/table.w_md.biom')
# perform RPCA
ordination, distance = rpca(table, min_sample_count=500)


  mat = np.log(matrix_closure(mat))


In [75]:
ordination.proportion_explained

PC1    0.407619
PC2    0.348749
PC3    0.243632
dtype: float64

In [76]:
# save whole ordination object

file = results_directory + Analizing_Institute + '_ordination'
ordination.write(file = file, format = "ordination")

# export proportion explained
file = results_directory + Analizing_Institute + '_prop_explained.csv'
print(file)
ordination.proportion_explained.to_csv(file)

# export scores
file = results_directory + Analizing_Institute + '_scores.csv'
print(file)
scores = pd.concat([ordination.samples, meta_lim], axis=1)
scores.to_csv(file)

# export loadings
file = results_directory + Analizing_Institute + '_loadings.csv'
print(file)
loadings = pd.concat([ordination.features, taxa_lim], axis=1)
loadings.to_csv(file)

# distance matrix:
file = results_directory + Analizing_Institute + '_distance.csv'
print(file)
df_distance = pd.DataFrame(distance.data, distance.ids, distance.ids)
df_distance.to_csv(file)
df_distance.head()

../RPCA/AWI_prop_explained.csv
../RPCA/AWI_scores.csv
../RPCA/AWI_loadings.csv
../RPCA/AWI_distance.csv


Unnamed: 0,Davenport_02_0008,Davenport_06_0008,Davenport_11_0008,Davenport_15_0008,Davenport_19_0008,Framstrait_01_0008,Framstrait_05_0008,Framstrait_09_0008,Framstrait_13_0008,Framstrait_17_0008,...,LaJolla_03_0049,LaJolla_09_0049,LaJolla_15_0049,LaJolla_21_0049,LaJolla_27_0049,Plymouth_06_0049,Plymouth_13_0049,Plymouth_19_0049,Plymouth_29_0049,Plymouth_30_0049
Davenport_02_0008,0.0,0.074066,0.201756,0.233738,0.319628,2.587661,2.521566,2.17735,2.513072,2.399786,...,2.547661,2.623944,2.316016,2.486269,2.659401,2.671696,2.713881,2.770385,2.629678,2.602916
Davenport_06_0008,0.074066,0.0,0.135293,0.223964,0.28524,2.614077,2.547499,2.200057,2.540587,2.424866,...,2.481113,2.555778,2.250354,2.419492,2.590863,2.638788,2.680074,2.734441,2.59391,2.566154
Davenport_11_0008,0.201756,0.135293,0.0,0.222149,0.197246,2.720731,2.652735,2.301294,2.647615,2.528942,...,2.409063,2.478066,2.182042,2.347202,2.512873,2.656434,2.694724,2.745427,2.605656,2.57702
Davenport_15_0008,0.233738,0.223964,0.222149,0.0,0.187724,2.795007,2.732682,2.391028,2.724007,2.612131,...,2.58903,2.653457,2.363129,2.525466,2.683308,2.806367,2.853837,2.900625,2.759628,2.719775
Davenport_19_0008,0.319628,0.28524,0.197246,0.187724,0.0,2.89853,2.831054,2.482574,2.824106,2.708051,...,2.518505,2.578424,2.297522,2.456195,2.611521,2.841079,2.879706,2.927288,2.787793,2.755372


### Make New Folder and move new files there:

- be careful that naming structure works because files are being deleted from one place and moved to another

In [77]:
#current directory where files are located:
print(results_directory)

'''# datetime object containing current date and time
now = datetime.now()
print("now =", now)
dt_string = now.strftime("%Y%m%d")
print("date and time =", dt_string)'''

new_dir = results_directory + Analizing_Institute + '/'
#New directory files will be moved to:
print(new_dir)

../RPCA/
../RPCA/AWI/


In [78]:
if not os.path.exists(new_dir):
    os.makedirs(new_dir)
search = results_directory+'*.csv'
print(search)
# Get list of files present in current directory (to move):
files = glob.glob(search)
files = glob.glob(results_directory+'*.csv') + glob.glob(results_directory+'*.tsv') + glob.glob(results_directory+'*.biom')+ glob.glob(results_directory+'*ordination')
print(files)
print('Moving files to subdirectory:')
for i in files:
    file = i
    new_file = i.replace(results_directory, new_dir)
    print(file)
    print(new_file)
    os.rename(file, new_file)

../RPCA/*.csv
['../RPCA/AWI_scores.csv', '../RPCA/AWI_prop_explained.csv', '../RPCA/AWI_distance.csv', '../RPCA/AWI_loadings.csv', '../RPCA/Qiime2_meta.tsv', '../RPCA/Qiime2_asv.tsv', '../RPCA/Qiime2_taxa.tsv', '../RPCA/table.from_txt_json.biom', '../RPCA/table.w_md.biom', '../RPCA/AWI_ordination']
Moving files to subdirectory:
../RPCA/AWI_scores.csv
../RPCA/AWI/AWI_scores.csv
../RPCA/AWI_prop_explained.csv
../RPCA/AWI/AWI_prop_explained.csv
../RPCA/AWI_distance.csv
../RPCA/AWI/AWI_distance.csv
../RPCA/AWI_loadings.csv
../RPCA/AWI/AWI_loadings.csv
../RPCA/Qiime2_meta.tsv
../RPCA/AWI/Qiime2_meta.tsv
../RPCA/Qiime2_asv.tsv
../RPCA/AWI/Qiime2_asv.tsv
../RPCA/Qiime2_taxa.tsv
../RPCA/AWI/Qiime2_taxa.tsv
../RPCA/table.from_txt_json.biom
../RPCA/AWI/table.from_txt_json.biom
../RPCA/table.w_md.biom
../RPCA/AWI/table.w_md.biom
../RPCA/AWI_ordination
../RPCA/AWI/AWI_ordination


# SBR

In [54]:
Analizing_Institute = 'SBR'

## Limit data by metadata parameters

- Run each Analyzing Institute separately

In [55]:
df = meta_all.copy()
#print(df['depth'].max())
df = df.loc[df['Analyzing_Institute'] == Analizing_Institute]
meta_lim = df.copy()
df.head()

Unnamed: 0_level_0,Analyzing_Institute,Collecting_Institute
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1
cj-BMk13,SBR,BLOOMMOCK
cj-BMk15,SBR,BLOOMMOCK
cj-BMk17,SBR,BLOOMMOCK
cj-BMk21,SBR,BLOOMMOCK
cj-BMk25,SBR,BLOOMMOCK


In [56]:
otu_lim, taxa_lim = from_metadata_to_taxareads(meta_lim, otu_all, taxa_all)

## Create Biom File

In [57]:
# create limited biom file

# asv table
filename = results_directory + "Qiime2_asv.tsv"
#check filename
print(filename)

df = otu_lim.copy()
df.index.names = ['#OTUID']
df.to_csv(filename,sep='\t')

# taxa table
filename = results_directory + "Qiime2_taxa.tsv"
#check filename
print(filename)

df = taxa_lim.copy()
df.index.names = ['#OTUID']
df.to_csv(filename,sep='\t')

# metadata table
filename = results_directory + "Qiime2_meta.tsv"
#check filename
print(filename)
df = meta_lim.copy()
df.index.names = ['#SampleID']
df.to_csv(filename,sep='\t')


../RPCA/Qiime2_asv.tsv
../RPCA/Qiime2_taxa.tsv
../RPCA/Qiime2_meta.tsv


## Make BIOM file

 - biom commands in python seem buggy - run in bash for now.
 - EDIT FILE PATH BELOW TO BE CORRECT DIRECTORY

In [58]:
%%bash
cd /Users/kpitz/github/GLOMICON/intercomparison/Merged_Datasets/RPCA/
pwd
# run in correct conda environment (gemelli)
#Make biom file
conda run -n gemelli biom convert -i Qiime2_asv.tsv -o table.from_txt_json.biom --table-type="OTU table" --to-json
#add metadata files to biom file - change the merged_tax_table_for_biomm.txt and _merged_for_biom.txt files
conda run -n gemelli biom add-metadata -i table.from_txt_json.biom -o table.w_md.biom --observation-metadata-fp Qiime2_taxa.tsv --sample-metadata-fp Qiime2_meta.tsv

/Users/kpitz/github/GLOMICON/intercomparison/Merged_Datasets/RPCA


## Run RPCA

In [59]:
from biom import load_table
from gemelli.rpca import rpca
# import the data table
table = load_table('/Users/kpitz/github/GLOMICON/intercomparison/Merged_Datasets/RPCA/table.w_md.biom')
# perform RPCA
ordination, distance = rpca(table, min_sample_count=500)


  mat = np.log(matrix_closure(mat))


In [60]:
ordination.proportion_explained

PC1    0.518699
PC2    0.459227
PC3    0.022074
dtype: float64

In [61]:
# save whole ordination object

file = results_directory + Analizing_Institute + '_ordination'
ordination.write(file = file, format = "ordination")

# export proportion explained
file = results_directory + Analizing_Institute + '_prop_explained.csv'
print(file)
ordination.proportion_explained.to_csv(file)

# export scores
file = results_directory + Analizing_Institute + '_scores.csv'
print(file)
scores = pd.concat([ordination.samples, meta_lim], axis=1)
scores.to_csv(file)

# export loadings
file = results_directory + Analizing_Institute + '_loadings.csv'
print(file)
loadings = pd.concat([ordination.features, taxa_lim], axis=1)
loadings.to_csv(file)

# distance matrix:
file = results_directory + Analizing_Institute + '_distance.csv'
print(file)
df_distance = pd.DataFrame(distance.data, distance.ids, distance.ids)
df_distance.to_csv(file)
df_distance.head()

../RPCA/SBR_prop_explained.csv
../RPCA/SBR_scores.csv
../RPCA/SBR_loadings.csv
../RPCA/SBR_distance.csv


Unnamed: 0,cj-BMk13,cj-BMk15,cj-BMk17,cj-BMk21,cj-BMk25,cj-DAL03,cj-DAL09,cj-DAL15,cj-DAL21,cj-DAL27,...,cj-ROS01,cj-ROS05,cj-ROS09,cj-ROS13,cj-ROS17,cj-SOC05,cj-SOC12,cj-SOC18,cj-SOC24,cj-SOC28
cj-BMk13,0.0,0.038055,0.057564,0.14212,0.533954,2.572433,2.769625,2.721308,3.106252,2.75292,...,2.855087,2.814455,2.753138,2.67316,2.554035,2.859283,2.845543,2.787885,2.801117,2.801869
cj-BMk15,0.038055,0.0,0.027852,0.136638,0.539617,2.573147,2.767094,2.719168,3.101028,2.756141,...,2.875993,2.836208,2.773903,2.691821,2.573674,2.871172,2.854315,2.800723,2.812195,2.815037
cj-BMk17,0.057564,0.027852,0.0,0.116332,0.533802,2.552465,2.743255,2.695829,3.075962,2.737642,...,2.874417,2.835595,2.772832,2.689835,2.571933,2.858228,2.838595,2.789944,2.797595,2.804405
cj-BMk21,0.14212,0.136638,0.116332,0.0,0.448756,2.436689,2.633093,2.583909,2.969835,2.621461,...,2.785563,2.749418,2.685553,2.600622,2.483117,2.745958,2.723538,2.68088,2.683734,2.695544
cj-BMk25,0.533954,0.539617,0.533802,0.448756,0.0,2.188155,2.485443,2.412796,2.854621,2.327964,...,2.391991,2.35806,2.287692,2.190344,2.080076,2.410075,2.415828,2.314385,2.377916,2.329939


### Make New Folder and move new files there:

- be careful that naming structure works because files are being deleted from one place and moved to another

In [62]:
#current directory where files are located:
print(results_directory)

'''# datetime object containing current date and time
now = datetime.now()
print("now =", now)
dt_string = now.strftime("%Y%m%d")
print("date and time =", dt_string)'''

new_dir = results_directory + Analizing_Institute + '/'
#New directory files will be moved to:
print(new_dir)

../RPCA/
../RPCA/SBR/


In [63]:
if not os.path.exists(new_dir):
    os.makedirs(new_dir)
search = results_directory+'*.csv'
print(search)
# Get list of files present in current directory (to move):
files = glob.glob(search)
files = glob.glob(results_directory+'*.csv') + glob.glob(results_directory+'*.tsv') + glob.glob(results_directory+'*.biom')+ glob.glob(results_directory+'*ordination')
print(files)
print('Moving files to subdirectory:')
for i in files:
    file = i
    new_file = i.replace(results_directory, new_dir)
    print(file)
    print(new_file)
    os.rename(file, new_file)

../RPCA/*.csv
['../RPCA/18S_distance.csv', '../RPCA/18S_scores.csv', '../RPCA/SBR_scores.csv', '../RPCA/18S_loadings.csv', '../RPCA/SBR_prop_explained.csv', '../RPCA/SBR_distance.csv', '../RPCA/18S_prop_explained.csv', '../RPCA/SBR_loadings.csv', '../RPCA/Qiime2_meta.tsv', '../RPCA/Qiime2_asv.tsv', '../RPCA/Qiime2_taxa.tsv', '../RPCA/table.from_txt_json.biom', '../RPCA/table.w_md.biom', '../RPCA/18S_ordination', '../RPCA/SBR_ordination']
Moving files to subdirectory:
../RPCA/18S_distance.csv
../RPCA/SBR/18S_distance.csv
../RPCA/18S_scores.csv
../RPCA/SBR/18S_scores.csv
../RPCA/SBR_scores.csv
../RPCA/SBR/SBR_scores.csv
../RPCA/18S_loadings.csv
../RPCA/SBR/18S_loadings.csv
../RPCA/SBR_prop_explained.csv
../RPCA/SBR/SBR_prop_explained.csv
../RPCA/SBR_distance.csv
../RPCA/SBR/SBR_distance.csv
../RPCA/18S_prop_explained.csv
../RPCA/SBR/18S_prop_explained.csv
../RPCA/SBR_loadings.csv
../RPCA/SBR/SBR_loadings.csv
../RPCA/Qiime2_meta.tsv
../RPCA/SBR/Qiime2_meta.tsv
../RPCA/Qiime2_asv.tsv
../RP

# MBARI

In [79]:
Analizing_Institute = 'MBARI'

## Limit data by metadata parameters

- Run each Analyzing Institute separately

In [80]:
df = meta_all.copy()
#print(df['depth'].max())
df = df.loc[df['Analyzing_Institute'] == Analizing_Institute]
meta_lim = df.copy()
df.head()

Unnamed: 0_level_0,Analyzing_Institute,Collecting_Institute
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1
CN18Sc37_12_Rep_Stdy5_AO,MBARI,MBARI
CN18Sc37_12_Rep_Stdy10_AO,MBARI,MBARI
CN18Sc37_12_Rep_Stdy14_AO,MBARI,MBARI
CN18Sc37_12_Rep_Stdy18_AO,MBARI,MBARI
CN18Sc37_12_Rep_Stdy22_AO,MBARI,MBARI


In [81]:
otu_lim, taxa_lim = from_metadata_to_taxareads(meta_lim, otu_all, taxa_all)

## Create Biom File

In [82]:
# create limited biom file

# asv table
filename = results_directory + "Qiime2_asv.tsv"
#check filename
print(filename)

df = otu_lim.copy()
df.index.names = ['#OTUID']
df.to_csv(filename,sep='\t')

# taxa table
filename = results_directory + "Qiime2_taxa.tsv"
#check filename
print(filename)

df = taxa_lim.copy()
df.index.names = ['#OTUID']
df.to_csv(filename,sep='\t')

# metadata table
filename = results_directory + "Qiime2_meta.tsv"
#check filename
print(filename)
df = meta_lim.copy()
df.index.names = ['#SampleID']
df.to_csv(filename,sep='\t')


../RPCA/Qiime2_asv.tsv
../RPCA/Qiime2_taxa.tsv
../RPCA/Qiime2_meta.tsv


## Make BIOM file

 - biom commands in python seem buggy - run in bash for now.
 - EDIT FILE PATH BELOW TO BE CORRECT DIRECTORY

In [83]:
%%bash
cd /Users/kpitz/github/GLOMICON/intercomparison/Merged_Datasets/RPCA/
pwd
# run in correct conda environment (gemelli)
#Make biom file
conda run -n gemelli biom convert -i Qiime2_asv.tsv -o table.from_txt_json.biom --table-type="OTU table" --to-json
#add metadata files to biom file - change the merged_tax_table_for_biomm.txt and _merged_for_biom.txt files
conda run -n gemelli biom add-metadata -i table.from_txt_json.biom -o table.w_md.biom --observation-metadata-fp Qiime2_taxa.tsv --sample-metadata-fp Qiime2_meta.tsv

/Users/kpitz/github/GLOMICON/intercomparison/Merged_Datasets/RPCA


## Run RPCA

In [84]:
from biom import load_table
from gemelli.rpca import rpca
# import the data table
table = load_table('/Users/kpitz/github/GLOMICON/intercomparison/Merged_Datasets/RPCA/table.w_md.biom')
# perform RPCA
ordination, distance = rpca(table, min_sample_count=500)


  mat = np.log(matrix_closure(mat))


In [85]:
ordination.proportion_explained

PC1    0.589025
PC2    0.400284
PC3    0.010691
dtype: float64

In [86]:
# save whole ordination object

file = results_directory + Analizing_Institute + '_ordination'
ordination.write(file = file, format = "ordination")

# export proportion explained
file = results_directory + Analizing_Institute + '_prop_explained.csv'
print(file)
ordination.proportion_explained.to_csv(file)

# export scores
file = results_directory + Analizing_Institute + '_scores.csv'
print(file)
scores = pd.concat([ordination.samples, meta_lim], axis=1)
scores.to_csv(file)

# export loadings
file = results_directory + Analizing_Institute + '_loadings.csv'
print(file)
loadings = pd.concat([ordination.features, taxa_lim], axis=1)
loadings.to_csv(file)

# distance matrix:
file = results_directory + Analizing_Institute + '_distance.csv'
print(file)
df_distance = pd.DataFrame(distance.data, distance.ids, distance.ids)
df_distance.to_csv(file)
df_distance.head()

../RPCA/MBARI_prop_explained.csv
../RPCA/MBARI_scores.csv
../RPCA/MBARI_loadings.csv
../RPCA/MBARI_distance.csv


Unnamed: 0,CN18Sc37_12_Rep_Stdy5_AO,CN18Sc37_12_Rep_Stdy10_AO,CN18Sc37_12_Rep_Stdy14_AO,CN18Sc37_12_Rep_Stdy18_AO,CN18Sc37_12_Rep_Stdy22_AO,NOAA11_AO,NOAA14_AO,NOAA20_AO,AWIMOCKBLOOM7_AO,AWIMOCKBLOOM8_AO,...,UDalhousie2_AO,UDalhousie8_AO,UDalhousie14_AO,UDalhousie20_AO,UDalhousie26_AO,AWIMOCKEVEN1_AO,AWIMOCKEVEN2_AO,AWIMOCKEVEN3_AO,AWIMOCKEVEN4_AO,AWIMOCKEVEN5_AO
CN18Sc37_12_Rep_Stdy5_AO,0.0,0.695501,0.659638,0.8016,0.259222,0.569901,0.56665,0.72656,1.870212,1.7564,...,2.381681,2.409692,2.376783,2.319352,2.377438,1.823708,1.850391,1.849511,1.871097,1.921426
CN18Sc37_12_Rep_Stdy10_AO,0.695501,0.0,0.151639,0.181519,0.558328,1.040631,1.033904,1.13686,1.276452,1.176599,...,2.35921,2.393699,2.376435,2.325814,2.341247,1.224101,1.257966,1.25364,1.26945,1.292919
CN18Sc37_12_Rep_Stdy14_AO,0.659638,0.151639,0.0,0.143919,0.475305,1.066843,1.058261,1.172112,1.235679,1.129225,...,2.395469,2.431163,2.411459,2.357435,2.380867,1.186261,1.216929,1.213747,1.232636,1.272334
CN18Sc37_12_Rep_Stdy18_AO,0.8016,0.181519,0.143919,0.0,0.616642,1.183796,1.17461,1.278786,1.109928,1.009888,...,2.435025,2.471758,2.455217,2.402858,2.417958,1.060731,1.093489,1.087858,1.105401,1.140646
CN18Sc37_12_Rep_Stdy22_AO,0.259222,0.558328,0.475305,0.616642,0.0,0.801385,0.793877,0.943288,1.636041,1.518841,...,2.392696,2.42481,2.394057,2.334154,2.38819,1.590666,1.615066,1.615789,1.638699,1.696887


### Make New Folder and move new files there:

- be careful that naming structure works because files are being deleted from one place and moved to another

In [87]:
#current directory where files are located:
print(results_directory)

'''# datetime object containing current date and time
now = datetime.now()
print("now =", now)
dt_string = now.strftime("%Y%m%d")
print("date and time =", dt_string)'''

new_dir = results_directory + Analizing_Institute + '/'
#New directory files will be moved to:
print(new_dir)

../RPCA/
../RPCA/MBARI/


In [88]:
if not os.path.exists(new_dir):
    os.makedirs(new_dir)
search = results_directory+'*.csv'
print(search)
# Get list of files present in current directory (to move):
files = glob.glob(search)
files = glob.glob(results_directory+'*.csv') + glob.glob(results_directory+'*.tsv') + glob.glob(results_directory+'*.biom')+ glob.glob(results_directory+'*ordination')
print(files)
print('Moving files to subdirectory:')
for i in files:
    file = i
    new_file = i.replace(results_directory, new_dir)
    print(file)
    print(new_file)
    os.rename(file, new_file)

../RPCA/*.csv
['../RPCA/MBARI_scores.csv', '../RPCA/MBARI_distance.csv', '../RPCA/MBARI_prop_explained.csv', '../RPCA/MBARI_loadings.csv', '../RPCA/Qiime2_meta.tsv', '../RPCA/Qiime2_asv.tsv', '../RPCA/Qiime2_taxa.tsv', '../RPCA/table.from_txt_json.biom', '../RPCA/table.w_md.biom', '../RPCA/MBARI_ordination']
Moving files to subdirectory:
../RPCA/MBARI_scores.csv
../RPCA/MBARI/MBARI_scores.csv
../RPCA/MBARI_distance.csv
../RPCA/MBARI/MBARI_distance.csv
../RPCA/MBARI_prop_explained.csv
../RPCA/MBARI/MBARI_prop_explained.csv
../RPCA/MBARI_loadings.csv
../RPCA/MBARI/MBARI_loadings.csv
../RPCA/Qiime2_meta.tsv
../RPCA/MBARI/Qiime2_meta.tsv
../RPCA/Qiime2_asv.tsv
../RPCA/MBARI/Qiime2_asv.tsv
../RPCA/Qiime2_taxa.tsv
../RPCA/MBARI/Qiime2_taxa.tsv
../RPCA/table.from_txt_json.biom
../RPCA/MBARI/table.from_txt_json.biom
../RPCA/table.w_md.biom
../RPCA/MBARI/table.w_md.biom
../RPCA/MBARI_ordination
../RPCA/MBARI/MBARI_ordination
