## Set up environment and fetch Files

In [2]:
#set up environment
import pandas as pd
import qiime2 as q2
import seaborn as sns
import matplotlib.pyplot as plt
import os

%matplotlib inline
# ! rm -r data ###unappend this line if you want to wipe clean the data folder
data_dir = 'data'
safe = 'permanent_dir'
! mkdir -p data
! mkdir -p permanent

In [26]:
# fetch files from polybox
! wget -r -np -nH --cut-dirs=3 -R "index.html*" "https://polybox.ethz.ch/index.php/s/56JaAiKdGwioBKN/download" -O polybox_files.tar.gz
! tar -xzf polybox_files.tar.gz

#unzip polybox files
!mv polybox_files.tar.gz polybox_files.zip
!unzip polybox_files.zip
!rm polybox_files.zip

#untar MAGs files and store everything in data_dir
!tar -xzf applied_bioinformatics/Illumina_MAGs.tar.gz -C ./data
!tar -xzf applied_bioinformatics/PacBio_MAGs.tar.gz -C ./data
!mv applied_bioinformatics/merged_metadata_filtered.tsv ./data
!rm -r applied_bioinformatics

will be placed in the single file you specified.

--2025-10-12 11:27:10--  https://polybox.ethz.ch/index.php/s/56JaAiKdGwioBKN/download
Resolving polybox.ethz.ch (polybox.ethz.ch)... 129.132.71.243
Connecting to polybox.ethz.ch (polybox.ethz.ch)|129.132.71.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘polybox_files.tar.gz’

polybox_files.tar.g     [       <=>          ] 492.28M   404MB/s    in 1.2s    

2025-10-12 11:27:11 (404 MB/s) - ‘polybox_files.tar.gz’ saved [516188455]

FINISHED --2025-10-12 11:27:12--
Total wall clock time: 2.2s
Downloaded: 1 files, 492M in 1.2s (404 MB/s)
gzip: stdin has more than one entry--rest ignored
tar: Child returned status 2
tar: Error is not recoverable: exiting now
Archive:  polybox_files.zip
   creating: applied_bioinformatics/
 extracting: applied_bioinformatics/.DS_Store  
 extracting: applied_bioinformatics/Illumina_MAGs.tar.gz  
 extracting: applied_bioinformatics/PacBio_M

## Create metadata df for exploration

In [3]:
from uuid import uuid4
# load metadata
metadata_df = pd.read_csv(f"{data_dir}/merged_metadata_filtered.tsv", sep="\t", index_col=0)

# rename all fasta files with UUIDs
for technique in os.listdir(data_dir):
    tech_path = os.path.join(data_dir, technique)
    if not os.path.isdir(tech_path):
        continue
    for sample_id in os.listdir(tech_path):
        sample_path = os.path.join(tech_path, sample_id)
        if not os.path.isdir(sample_path):
            continue
        for file in os.listdir(sample_path):
            if file.endswith((".fa", ".fasta")):
                old_path = os.path.join(sample_path, file)
                new_path = os.path.join(sample_path, f"{uuid4()}.fa")
                os.rename(old_path, new_path)

records = []

for technique in os.listdir(data_dir):
    tech_path = os.path.join(data_dir, technique)
    if not os.path.isdir(tech_path):
        continue
    for sample_id in os.listdir(tech_path):
        sample_path = os.path.join(tech_path, sample_id)
        if not os.path.isdir(sample_path):
            continue
        for f in os.listdir(sample_path):
            if f.endswith((".fa", ".fasta")):
                abs_path = os.path.abspath(os.path.join(sample_path, f))
                if sample_id in metadata_df.index:
                    mag_id = os.path.splitext(f)[0]  # filename without extension
                    records.append((sample_id, mag_id, abs_path))

# build dataframe
manifest_df = pd.DataFrame.from_records(records, columns=["sample-id", "mag-id", "filename"])

# save as MANIFEST (tab-separated, no index)
manifest_path = os.path.join(data_dir, "MANIFEST")
manifest_df.to_csv(manifest_path, sep=",", index=False)

print(f"MANIFEST saved to: {manifest_path}")
print(manifest_df.head())
print(manifest_df.columns)

MANIFEST saved to: data/MANIFEST
  sample-id                                mag-id  \
0      M004  e5e2e0bf-6f73-4261-9470-eea81396e53c   
1      M004  1bcb3a74-3432-4c82-b70c-d817640c31b2   
2      M004  42f6086f-58ab-4c2b-b47a-ed95b824638a   
3      M004  8b161c5a-bfae-4894-8d81-014ee3e96557   
4      M004  ad963cb1-f546-4cb9-8f61-d66070816ba6   

                                            filename  
0  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
1  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
2  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
3  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
4  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
Index(['sample-id', 'mag-id', 'filename'], dtype='object')


## EXPLORATION