In [None]:
import merger_class as Merger
from pathlib import Path
import pandas as pd

import tarfile
import os

## Part 1 - decompress files
.fasta files can be packaged into a single .tar.gz archive. This part of the code is intended to extract the archive and retrieve the original .fasta files.


In [None]:
# 📁 Path of the .tar.gz archive
archive_path = r"-----------------------------------"

# 📂 Destination folder where to extract the files
extraction_path = r"-----------------------------------"

# 🛠️ Create the folder if it does not exist
os.makedirs(extraction_path, exist_ok=True)

# Resolve the destination once so every member path can be checked against it.
extraction_root = Path(extraction_path).resolve()
extracted_count = 0

# Use the "data" extraction filter when this Python version provides it
# (3.12+, and security backports of older versions); older interpreters
# do not accept the `filter` keyword at all.
extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

# 🔓 Extract only the .fasta files
# (drop the `.endswith(".fasta")` test to extract every regular file)
with tarfile.open(archive_path, "r:gz") as tar:
    for member in tar.getmembers():
        # Regular files only: a link or device entry that merely carries a
        # ".fasta" name must not be materialised on disk.
        if not (member.isfile() and member.name.endswith(".fasta")):
            continue

        # Guard against path traversal: an absolute member name or one with
        # "../" components would otherwise be written outside the destination.
        target_path = (extraction_root / member.name).resolve()
        if extraction_root not in target_path.parents:
            print(f"⚠️ Skipped unsafe member: {member.name}")
            continue

        tar.extract(member, path=extraction_path, **extract_kwargs)
        extracted_count += 1

# Report success only if something was actually extracted.
if extracted_count:
    print(f"✅ Extraction completed in: {extraction_path}")
else:
    print(f"⚠️ No .fasta files found in: {archive_path}")

## Part 2 - merge .fasta files into a single file
After decompression, you may obtain multiple .fasta files. This code merges them into a single file to facilitate downstream analysis.

In [None]:
# 📁 Path of the folder containing the .fasta files
input_folder = Path(r"--------------------------------------")

# 📤 Path of the unified file to create
output_file = Path(r"--------------------------------------")

# Defined before the `with` so the final summary can always read it.
merged_count = 0

with open(output_file, 'w', encoding='utf-8') as outfile:
    # The output file exists from this point on; remember its resolved path so
    # it can be excluded if it lives inside `input_folder` with a .fasta suffix.
    output_resolved = output_file.resolve()

    # sorted() makes the merge order reproducible and snapshots the directory
    # listing before anything is written.
    for fasta_file in sorted(input_folder.glob("*.fasta")):
        if fasta_file.resolve() == output_resolved:
            # Never merge the output file into itself.
            continue

        try:
            # Read the whole file first so a decoding error leaves the output
            # untouched for this file instead of half-written.
            with open(fasta_file, 'r', encoding='utf-8') as infile:
                lines = infile.readlines()

            # A file without a trailing newline would glue the next file's
            # ">" header onto this file's last sequence line.
            if lines and not lines[-1].endswith("\n"):
                lines[-1] += "\n"

            outfile.writelines(lines)
            merged_count += 1
            print(f"✅ Aggiunto: {fasta_file.name}")
        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            print(f"⚠️ Errore con {fasta_file.name}: {e}")

print(f"\n 📦 Combinati {merged_count} file in: {output_file.name}")

I created a class that merges each .fasta file with its corresponding metadata and removes duplicate Phage_ID entries.
If you want to use this class with a different dataset, you may need to change the column names it expects.
The class lives in a separate Python file, `merger_class.py`, which is imported here as `Merger`.
The class itself is also named `Merger`, so it is instantiated as `Merger.Merger(...)`.

## Part 3 - use the above class to create a raw dataset

In [None]:
# Define the paths for the .fasta and .tsv files
file_path = r"-----------------------------------"
meta_path = r"-----------------------------------"

# Initialize the object (module `Merger`, class `Merger`)
merger = Merger.Merger(file_path, meta_path)

# Create the DataFrames by calling the method on the instance.
# A bare `create_database()` is an undefined name and raises NameError.
# NOTE(review): assumes create_database() takes no arguments and returns the
# (sequence table, metadata table) pair — confirm against merger_class.
database, meta_file = merger.create_database()

# This is the final dataset of interest
# NOTE(review): assumes create_final_database() is an instance method taking
# the two tables above — confirm against merger_class.
final_data = merger.create_final_database(database, meta_file)

## Part 4 – Check the output raw dataset

Metadata files (.tsv) and sequence files (.fasta) often do not have the same number of entries.  
For this reason, it is important to check their lengths:

1. **If len(file.fasta) > len(file.tsv)** → the .fasta file contains duplicated Phage_ID/sequences, or the .tsv file is missing some entries.  
2. **If len(file.fasta) < len(file.tsv)** → the .tsv file contains duplicated Phage_ID/entries, or it has more entries than the .fasta file.  
3. **If len(file.fasta) = len(file.tsv)** → this is a good sign; the files are most likely consistent.  


In [None]:
# Print the lengths in order: sequence table, metadata table, merged dataset.
for table in (database, meta_file, final_data):
    print(len(table))

# Last expression of the cell: display the columns of the merged dataset.
final_data.columns

## Part 5 - save the raw dataset in .csv format
Keep in mind that the produced dataset is still **raw**: it can contain protein sequences with illegal characters, missing data, and poorly determined or undetermined sequences. These factors must be taken into account before computing embeddings with the ESM-2 model.

In [None]:
# Define where to save the file
csv_output_path = r"------------------------------------"

# Save the dataset without the index column
final_data.to_csv(csv_output_path, index=False)

# Print a confirmation, as the other cells do.
print(f"✅ Saved {len(final_data)} rows to: {csv_output_path}")