## Set up environment and fetch Files

In [3]:
#set up environment
import pandas as pd
import qiime2 as q2
import seaborn as sns
import matplotlib.pyplot as plt
import os

%matplotlib inline
#!rm -r data ###unappend this line if you want to wipe clean the data folder
data_dir = 'data'
safe = 'permanent_dir'
! mkdir -p data
! mkdir -p permanent

In [4]:
# fetch files from polybox
! wget -r -np -nH --cut-dirs=3 -R "index.html*" "https://polybox.ethz.ch/index.php/s/56JaAiKdGwioBKN/download" -O polybox_files.tar.gz
! tar -xzf polybox_files.tar.gz

#unzip polybox files
!mv polybox_files.tar.gz polybox_files.zip
!unzip polybox_files.zip
!rm polybox_files.zip

#untar MAGs files and store everything in data_dir
!tar -xzf applied_bioinformatics/Illumina_MAGs.tar.gz -C ./data
!tar -xzf applied_bioinformatics/PacBio_MAGs.tar.gz -C ./data
!mv applied_bioinformatics/merged_metadata_filtered.tsv ./data
!rm -r applied_bioinformatics

will be placed in the single file you specified.

--2025-12-02 22:02:21--  https://polybox.ethz.ch/index.php/s/56JaAiKdGwioBKN/download
Resolving polybox.ethz.ch (polybox.ethz.ch)... 129.132.71.243
Connecting to polybox.ethz.ch (polybox.ethz.ch)|129.132.71.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘polybox_files.tar.gz’

polybox_files.tar.g     [              <=>   ] 492.28M   121MB/s    in 4.1s    

2025-12-02 22:02:26 (121 MB/s) - ‘polybox_files.tar.gz’ saved [516188455]

FINISHED --2025-12-02 22:02:27--
Total wall clock time: 5.1s
Downloaded: 1 files, 492M in 4.1s (121 MB/s)
gzip: stdin has more than one entry--rest ignored
tar: Child returned status 2
tar: Error is not recoverable: exiting now
Archive:  polybox_files.zip
   creating: applied_bioinformatics/
 extracting: applied_bioinformatics/.DS_Store  
 extracting: applied_bioinformatics/Illumina_MAGs.tar.gz  
 extracting: applied_bioinformatics/PacBio_M

## Create metadata df for exploration

In [5]:
from uuid import uuid4
# load metadata
metadata_df = pd.read_csv(f"{data_dir}/merged_metadata_filtered.tsv", sep="\t", index_col=0)

# rename all fasta files with UUIDs
for technique in os.listdir(data_dir):
    tech_path = os.path.join(data_dir, technique)
    if not os.path.isdir(tech_path):
        continue
    for sample_id in os.listdir(tech_path):
        sample_path = os.path.join(tech_path, sample_id)
        if not os.path.isdir(sample_path):
            continue
        for file in os.listdir(sample_path):
            if file.endswith((".fa", ".fasta")):
                old_path = os.path.join(sample_path, file)
                new_path = os.path.join(sample_path, f"{uuid4()}.fa")
                os.rename(old_path, new_path)

records = []

for technique in os.listdir(data_dir):
    tech_path = os.path.join(data_dir, technique)
    if not os.path.isdir(tech_path):
        continue
    for sample_id in os.listdir(tech_path):
        sample_path = os.path.join(tech_path, sample_id)
        if not os.path.isdir(sample_path):
            continue
        for f in os.listdir(sample_path):
            if f.endswith((".fa", ".fasta")):
                abs_path = os.path.abspath(os.path.join(sample_path, f))
                if sample_id in metadata_df.index:
                    mag_id = os.path.splitext(f)[0]  # filename without extension
                    records.append((sample_id, mag_id, abs_path))

# build dataframe
manifest_df = pd.DataFrame.from_records(records, columns=["sample-id", "mag-id", "filename"])

# save as MANIFEST (tab-separated, no index)
manifest_path = os.path.join(data_dir, "MANIFEST")
manifest_df.to_csv(manifest_path, sep=",", index=False)

print(f"MANIFEST saved to: {manifest_path}")
print(manifest_df.head())
print(manifest_df.columns)

MANIFEST saved to: data/MANIFEST
  sample-id                                mag-id  \
0      M004  17c50552-2e7b-4e25-b667-d2772874e86d   
1      M004  d1f9d550-67d2-4bb5-9e52-fbb071abc436   
2      M004  95547d8f-e2d2-4c3a-995e-4f3cec1e30fc   
3      M004  d132a633-c59f-4f5a-b049-144a59926482   
4      M004  c55dd5e6-48bb-48eb-aa0a-b8aaa6005d11   

                                            filename  
0  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
1  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
2  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
3  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
4  /home/jovyan/Interplanetary_Microbiome/data/Pa...  
Index(['sample-id', 'mag-id', 'filename'], dtype='object')


## EXPLORATION

In [6]:
metadata_df.columns

Index(['samp_country', 'category', 'fermented_food_type'], dtype='object')

In [7]:
manifest_df.columns

Index(['sample-id', 'mag-id', 'filename'], dtype='object')

In [10]:
merged_df = metadata_df.merge(manifest_df, left_index=True, right_on='sample-id', how='right')
merged_df.to_csv("merged_df.tsv", sep="\t", index=False)

In [9]:
merged_df

Unnamed: 0,samp_country,category,fermented_food_type,sample-id,mag-id,filename
0,Thailand,fermented fish,Shrimp_paste_(Ka-pi)_from_yellow_shrimp,M004,17c50552-2e7b-4e25-b667-d2772874e86d,/home/jovyan/Interplanetary_Microbiome/data/Pa...
1,Thailand,fermented fish,Shrimp_paste_(Ka-pi)_from_yellow_shrimp,M004,d1f9d550-67d2-4bb5-9e52-fbb071abc436,/home/jovyan/Interplanetary_Microbiome/data/Pa...
2,Thailand,fermented fish,Shrimp_paste_(Ka-pi)_from_yellow_shrimp,M004,95547d8f-e2d2-4c3a-995e-4f3cec1e30fc,/home/jovyan/Interplanetary_Microbiome/data/Pa...
3,Thailand,fermented fish,Shrimp_paste_(Ka-pi)_from_yellow_shrimp,M004,d132a633-c59f-4f5a-b049-144a59926482,/home/jovyan/Interplanetary_Microbiome/data/Pa...
4,Thailand,fermented fish,Shrimp_paste_(Ka-pi)_from_yellow_shrimp,M004,c55dd5e6-48bb-48eb-aa0a-b8aaa6005d11,/home/jovyan/Interplanetary_Microbiome/data/Pa...
...,...,...,...,...,...,...
2847,Thailand,fermented legumes,Fermented_soybean_curd_(Tao-huu-yee),P003,e9bbfe14-0e21-4b11-9574-79ec05e264dd,/home/jovyan/Interplanetary_Microbiome/data/Il...
2848,Thailand,fermented legumes,Fermented_soybean_curd_(Tao-huu-yee),P003,f88751d2-c7af-4248-bdb6-67adbc8a5c78,/home/jovyan/Interplanetary_Microbiome/data/Il...
2849,Thailand,fermented legumes,Fermented_soybean_curd_(Tao-huu-yee),P003,3a6fd3e9-d90c-4028-a24b-c96a83581958,/home/jovyan/Interplanetary_Microbiome/data/Il...
2850,Thailand,fermented legumes,Fermented_soybean_curd_(Tao-huu-yee),P003,88572f98-8cba-45ff-af14-34eb13cdaf32,/home/jovyan/Interplanetary_Microbiome/data/Il...


In [11]:
merged_df.groupby('samp_country').size()


samp_country
Benin        57
Germany       4
Laos        100
Slovenia      8
Thailand    229
dtype: int64

## qiime Import

In [1]:
!rm -r cache
!qiime tools cache-create --cache ./cache
#!find ./data -type d -name ".*" -exec rm -rf {} +


[32mCreated cache at './cache'[0m


In [7]:
!qiime tools cache-import \
    --cache ./cache \
    --key mags \
    --type "SampleData[MAGs]" \
    --input-path ./data

  import pkg_resources
[32mImported ./data as MultiMAGSequencesDirFmt to ./cache:mags[0m
[0m[?25h

## Busco evaluation

In [10]:
!qiime annotate fetch-busco-db \
    --p-lineages bacteria_odb12 \
    --o-db ./cache:busco_db \
    --verbose

  import pkg_resources
[1;32mFetching lineages: bacteria_odb12.[0m
Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: busco --download_path /tmp/qiime2/jovyan/processes/1689-1761324460.08@jovyan/tmp/q2-OutPath-1d5vg6rn --download bacteria_odb12

2025-10-24 18:48:07 INFO:	Downloading information on latest versions of BUSCO data...
2025-10-24 18:48:09 INFO:	Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb12.2025-05-14.tar.gz'
2025-10-24 18:48:10 INFO:	Decompressing file '/tmp/qiime2/jovyan/processes/1689-1761324460.08@jovyan/tmp/q2-OutPath-1d5vg6rn/lineages/bacteria_odb12.tar.gz'
[1;32mDownload completed. 
Copying files from temporary directory to the final location...[0m
[32mAdded ReferenceDB[BUSCO] to cache: ./cache as: busco_db[0m
[0m[?25h

In [11]:
!qiime annotate evaluate-busco \
    --i-mags ./cache:mags \
    --i-db ./cache:busco_db \
    --p-lineage-dataset bacteria_odb12 \                                    
    --o-visualization ./results/mags.qzv \
    --o-results ./cache:busco_results \
    --verbose

IndentationError: unexpected indent (2888558882.py, line 2)