In [None]:
import pandas as pd
import numpy as np
import gc
import os

from glob import glob

## Part 1 - Retain inoviridae and microviridae only
Caudovirales is the most abundant phage class across all datasets available on PhagesScope. For this reason, I developed this code specifically to collect the Inoviridae and Microviridae entries present in all datasets downloadable from PhagesDB. The ultimate goal is to create a dataset that contains only the classes Microviridae, Inoviridae and Caudovirales, with a reasonable amount of data for each class.

In [None]:
# Change the folder path to the one where your datasets are stored
folder_path = r"-------- FOLDER WITH ALL DATASETS FROM WHICH COLLECT DATA -------------------"
files = glob(os.path.join(folder_path, "*.csv"))

# Taxonomy labels to keep; values are compared after stripping whitespace and lower-casing
categories = ['inoviridae', 'microviridae']
# One pandas Series per retained row, accumulated across all files
data_collected = []

for f in files:
    data = pd.read_csv(f)
    print(f'Working with file {f}')

    # Nothing to scan in a table without rows
    if len(data) == 0:
        continue

    # Normalise the whole column in one pass instead of testing row by row
    taxonomy = data['Taxonomy'].astype(str).str.strip().str.lower()
    matches = taxonomy.isin(categories).to_numpy()

    # Pick the matching rows by position; each one is kept as its own Series
    data_collected.extend(data.iloc[pos] for pos in np.flatnonzero(matches))

In [None]:
# Show a preview of the collected rows.
# drop=True discards the per-file row labels instead of keeping them as an
# extra "index" column, which would otherwise end up in the merged dataset.
file = pd.DataFrame(data_collected).reset_index(drop=True)
file

## Part 2 - Merge PhagesDB with other samples
Once the Microviridae and Inoviridae classes have been collected, they are merged with the PhagesDB dataset. This was a purely personal choice.

In [None]:
# Change the file path to the location of the PhagesDB CSV file
file_path = r"----------------PHAGEDB_FILE_PATH------------------------"
# Loaded so it can be concatenated with the collected rows in the next cell
file_2 = pd.read_csv(file_path)

In [None]:
# Stack the PhagesDB table (file_2) and the collected rows (file) into one frame;
# ignore_index=True renumbers the rows of the combined frame from 0.
dataset = pd.concat([file_2, file], ignore_index=True)
dataset

In [None]:
# Change file_path to the location where the merged dataset should be saved
file_path = r'-----------------------------------------------------------'
# index=False keeps the row numbers out of the CSV, so reading the file back
# does not produce a spurious "Unnamed: 0" column.
dataset.to_csv(file_path, index=False)

## Part 3 - illegal sequences
Use this part of the code to remove illegal sequences (sequences containing characters that do not belong to the standard amino acid alphabet). This step was already performed at earlier stages, but it may be handy to have it here as well.

In [None]:
# Path of the CSV to be checked for illegal sequences
# (presumably the merged dataset saved in Part 2 — confirm)
file_path = r"-------------------MIXED_FILE_PATH------------------------"
# NOTE(review): `file` is not passed to clean_invalid_sequences below, which
# re-reads the CSV from file_path — confirm whether this load is still needed.
file = pd.read_csv(file_path)

In [None]:
import re

# Function to identify valid sequences --> standard residues only (e.g. no O and U)
def is_valid_sequence(seq, allowed = "ACDEFGHIKLMNPQRSTVWY"):
    """Return True when `seq` is a non-empty string made only of `allowed` characters.

    Parameters
    ----------
    seq : object
        Candidate sequence. Anything that is not a `str` (for example the NaN
        that pandas uses for an empty CSV field) is reported as invalid.
    allowed : str
        Alphabet of accepted characters, each taken literally. The default is
        the 20 upper-case one-letter codes listed in the signature.

    Returns
    -------
    bool
        False for non-strings, for the empty string, and for any string that
        contains a character outside `allowed`; True otherwise.
    """
    # Non-string cells cannot be scanned character by character; call them invalid.
    if not isinstance(seq, str) or not seq:
        return False
    # Set membership treats every character of `allowed` literally, whereas a
    # regex character class would give '-', '^' and ']' a special meaning.
    return set(seq).issubset(allowed)

def clean_invalid_sequences(input_path, output_path, invalid_output_path):
    """Split a CSV into rows with valid and invalid sequences and save both parts.

    Parameters
    ----------
    input_path : str or path-like
        CSV file to read; it must contain a "Sequence" column.
    output_path : str or path-like
        Destination CSV for the rows whose sequence passes is_valid_sequence.
    invalid_output_path : str or path-like
        Destination CSV for all remaining rows, including rows whose sequence
        is missing or is not a string.

    Returns
    -------
    None
        The two CSV files are written without the index column and a short
        summary is printed.

    Raises
    ------
    KeyError
        If the input file has no "Sequence" column.
    """
    df = pd.read_csv(input_path)

    # Validity mask. An empty CSV field is read back as NaN (a float), so only
    # real strings are handed to is_valid_sequence; everything else is invalid.
    # astype(bool) keeps the mask boolean even when the table has no rows.
    valid_mask = df["Sequence"].map(
        lambda seq: isinstance(seq, str) and is_valid_sequence(seq)
    ).astype(bool)

    # Separation
    valid_df = df[valid_mask].reset_index(drop=True)
    invalid_df = df[~valid_mask].reset_index(drop=True)

    # Saving
    valid_df.to_csv(output_path, index=False)
    invalid_df.to_csv(invalid_output_path, index=False)

    print(f"✅ Valid sequences: {len(valid_df)} saved in {output_path}")
    print(f"❌ Invalid sequences: {len(invalid_df)} saved in {invalid_output_path}")

In [None]:
# Run the clean-up on the file selected above. Replace the two placeholders with
# two different paths: the valid rows are written first and the invalid rows
# second, so identical paths would leave only the invalid rows on disk.
clean_invalid_sequences(
    input_path = file_path,
    output_path = r'--------------------------------------',
    invalid_output_path = r'--------------------------------------'
)

## Part 4 - retain illegal characters only

In [None]:
# Path of the CSV whose sequences will be reduced to their illegal characters
# (presumably the invalid-sequences file written in Part 3 — confirm)
file_path_2 = r"------------------------------"
# NOTE(review): this rebinds file_2, which held the PhagesDB table in Part 2.
file_2 = pd.read_csv(file_path_2)

In [None]:
# Define a function to remove valid characters and retain only the invalid ones
def rm_valid_characters(seq, allowed = "ACDEFGHIKLMNPQRSTVWY"):
    """Return the characters of `seq` that are not in `allowed`, in their original order.

    Parameters
    ----------
    seq : object
        Sequence to filter. Anything that is not a `str` (for example the NaN
        that pandas uses for an empty CSV field) yields an empty string.
    allowed : str
        Characters to strip out; the default is the 20 upper-case one-letter
        codes listed in the signature.

    Returns
    -------
    str
        The remaining characters, repeated as often as they occur in `seq`.
    """
    # A non-string cell has no characters to inspect.
    if not isinstance(seq, str):
        return ""
    allowed_chars = frozenset(allowed)
    return "".join(char for char in seq if char not in allowed_chars)
    

In [None]:
# Replace every sequence with just its illegal characters, then save the result.
file_2["Sequence"] = file_2["Sequence"].apply(rm_valid_characters)
# index=False: do not write the row numbers as an extra unnamed column.
file_2.to_csv(r"-----------------------------------------------", index=False)