## Set up environment and fetch Files

In [4]:
#set up environment
import pandas as pd
import qiime2 as q2
import seaborn as sns
import matplotlib.pyplot as plt
import os

%matplotlib inline
!rm -r data ###unappend this line if you want to wipe clean the data folder
data_dir = 'data'
safe = 'permanent_dir'
! mkdir -p data
! mkdir -p permanent

In [5]:
# fetch files from polybox
! wget -r -np -nH --cut-dirs=3 -R "index.html*" "https://polybox.ethz.ch/index.php/s/56JaAiKdGwioBKN/download" -O polybox_files.tar.gz
! tar -xzf polybox_files.tar.gz

#unzip polybox files
!mv polybox_files.tar.gz polybox_files.zip
!unzip polybox_files.zip
!rm polybox_files.zip

#untar MAGs files and store everything in data_dir
!tar -xzf applied_bioinformatics/Illumina_MAGs.tar.gz -C ./data
!tar -xzf applied_bioinformatics/PacBio_MAGs.tar.gz -C ./data
!mv applied_bioinformatics/merged_metadata_filtered.tsv ./data
!rm -r applied_bioinformatics

will be placed in the single file you specified.

--2025-10-18 09:52:09--  https://polybox.ethz.ch/index.php/s/56JaAiKdGwioBKN/download
Resolving polybox.ethz.ch (polybox.ethz.ch)... 129.132.71.243
Connecting to polybox.ethz.ch (polybox.ethz.ch)|129.132.71.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘polybox_files.tar.gz’

polybox_files.tar.g     [      <=>           ] 492.28M   418MB/s    in 1.2s    

2025-10-18 09:52:11 (418 MB/s) - ‘polybox_files.tar.gz’ saved [516188455]

FINISHED --2025-10-18 09:52:11--
Total wall clock time: 2.1s
Downloaded: 1 files, 492M in 1.2s (418 MB/s)
gzip: stdin has more than one entry--rest ignored
tar: Child returned status 2
tar: Error is not recoverable: exiting now
Archive:  polybox_files.zip
   creating: applied_bioinformatics/
 extracting: applied_bioinformatics/.DS_Store  
 extracting: applied_bioinformatics/Illumina_MAGs.tar.gz  
 extracting: applied_bioinformatics/PacBio_M

## Create metadata df for exploration

In [6]:
from uuid import uuid4
# load metadata
metadata_df = pd.read_csv(f"{data_dir}/merged_metadata_filtered.tsv", sep="\t", index_col=0)

# rename all fasta files with UUIDs
for technique in os.listdir(data_dir):
    tech_path = os.path.join(data_dir, technique)
    if not os.path.isdir(tech_path):
        continue
    for sample_id in os.listdir(tech_path):
        sample_path = os.path.join(tech_path, sample_id)
        if not os.path.isdir(sample_path):
            continue
        for file in os.listdir(sample_path):
            if file.endswith((".fa", ".fasta")):
                old_path = os.path.join(sample_path, file)
                new_path = os.path.join(sample_path, f"{uuid4()}.fa")
                os.rename(old_path, new_path)

records = []

for technique in os.listdir(data_dir):
    tech_path = os.path.join(data_dir, technique)
    if not os.path.isdir(tech_path):
        continue
    for sample_id in os.listdir(tech_path):
        sample_path = os.path.join(tech_path, sample_id)
        if not os.path.isdir(sample_path):
            continue
        for f in os.listdir(sample_path):
            if f.endswith((".fa", ".fasta")):
                abs_path = os.path.abspath(os.path.join(sample_path, f))
                if sample_id in metadata_df.index:
                    mag_id = os.path.splitext(f)[0]  # filename without extension
                    records.append((sample_id, mag_id, abs_path))

# build dataframe
manifest_df = pd.DataFrame.from_records(records, columns=["sample-id", "mag-id", "filename"])

# save as MANIFEST (tab-separated, no index)
manifest_path = os.path.join(data_dir, "MANIFEST")
manifest_df.to_csv(manifest_path, sep=",", index=False)

print(f"MANIFEST saved to: {manifest_path}")
print(manifest_df.head())
print(manifest_df.columns)

MANIFEST saved to: data/MANIFEST
  sample-id                                mag-id  \
0      M004  d3ebee05-4bf2-438c-ac89-74384041925d   
1      M004  b9450a31-032c-471b-8f65-d8851a00fbb7   
2      M004  4713a771-395c-43a2-8fec-2e576a211ceb   
3      M004  e06f5ed7-6f01-4b84-b95a-a8087e8189a7   
4      M004  91c2ce95-deaf-433a-a69b-1fa2c08d8c16   

                                            filename  
0  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
1  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
2  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
3  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
4  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
Index(['sample-id', 'mag-id', 'filename'], dtype='object')


## EXPLORATION

In [8]:
metadata_df.columns

Index(['samp_country', 'category', 'fermented_food_type'], dtype='object')

In [9]:
manifest_df.columns

Index(['sample-id', 'mag-id', 'filename'], dtype='object')

In [10]:
merged_df = metadata_df.merge(manifest_df, left_index=True, right_on='sample-id', how='right')

In [11]:
merged_df

Unnamed: 0,samp_country,category,fermented_food_type,sample-id,mag-id,filename
0,Thailand,fermented fish,Shrimp_paste_(Ka-pi)_from_yellow_shrimp,M004,d3ebee05-4bf2-438c-ac89-74384041925d,/home/jovyan/Interplanetary_Microbiome/data/Pa...
1,Thailand,fermented fish,Shrimp_paste_(Ka-pi)_from_yellow_shrimp,M004,b9450a31-032c-471b-8f65-d8851a00fbb7,/home/jovyan/Interplanetary_Microbiome/data/Pa...
2,Thailand,fermented fish,Shrimp_paste_(Ka-pi)_from_yellow_shrimp,M004,4713a771-395c-43a2-8fec-2e576a211ceb,/home/jovyan/Interplanetary_Microbiome/data/Pa...
3,Thailand,fermented fish,Shrimp_paste_(Ka-pi)_from_yellow_shrimp,M004,e06f5ed7-6f01-4b84-b95a-a8087e8189a7,/home/jovyan/Interplanetary_Microbiome/data/Pa...
4,Thailand,fermented fish,Shrimp_paste_(Ka-pi)_from_yellow_shrimp,M004,91c2ce95-deaf-433a-a69b-1fa2c08d8c16,/home/jovyan/Interplanetary_Microbiome/data/Pa...
...,...,...,...,...,...,...
1421,Thailand,fermented legumes,Fermented_soybean_curd_(Tao-huu-yee),P003,d4e2ed8b-3142-4a6e-816c-22186da93ebf,/home/jovyan/Interplanetary_Microbiome/data/Il...
1422,Thailand,fermented legumes,Fermented_soybean_curd_(Tao-huu-yee),P003,7e81080e-2789-4544-933f-d702f0336e3b,/home/jovyan/Interplanetary_Microbiome/data/Il...
1423,Thailand,fermented legumes,Fermented_soybean_curd_(Tao-huu-yee),P003,e032d84f-6ad3-47d2-a166-83bcf04b15eb,/home/jovyan/Interplanetary_Microbiome/data/Il...
1424,Thailand,fermented legumes,Fermented_soybean_curd_(Tao-huu-yee),P003,c8afd79f-1bf1-40e9-9331-3fbd8ccd830d,/home/jovyan/Interplanetary_Microbiome/data/Il...
