In [1]:
import polars as pl
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
import numpy as np
import torch 
import random
import os

def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and every CUDA device), and configures cuDNN for reproducible runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects processes started after this point: hash randomisation of
    # the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all visible GPUs; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes (and may switch) convolution algorithms between
    # runs, so it has to be off when deterministic behaviour is wanted.
    torch.backends.cudnn.benchmark = False
seed_everything(2023)


def parquet_to_fasta(parquet_file_path, fasta_file_path, trim_start=26, trim_end=21):
    """Write the sequences of a parquet file to a FASTA file, trimming both ends.

    Each row becomes one FASTA record whose header is the row's
    ``sequence_id`` and whose body is the row's ``sequence`` with the first
    ``trim_start`` and the last ``trim_end`` characters removed.

    Args:
        parquet_file_path: Path of the parquet file to read.
        fasta_file_path: Path of the FASTA file to create (overwritten if present).
        trim_start: Number of leading characters dropped from every sequence.
        trim_end: Number of trailing characters dropped from every sequence.

    Raises:
        KeyError: If the parquet file lacks ``sequence_id`` or ``sequence``.
    """
    # Read the parquet file using Polars
    df = pl.read_parquet(parquet_file_path)

    # Check the required columns up front, before the output file is truncated.
    missing = [col for col in ("sequence_id", "sequence") if col not in df.columns]
    if missing:
        raise KeyError(f"{parquet_file_path} is missing required column(s): {missing}")

    # Convert to Pandas DataFrame
    df_pandas = df.to_pandas()

    with open(fasta_file_path, 'w') as fasta_file:
        # Iterating the two columns directly avoids building a Series per row
        # (as iterrows does); `total` lets tqdm show a real progress bar.
        records = zip(df_pandas['sequence_id'], df_pandas['sequence'])
        for sequence_id, sequence in tqdm(records, total=len(df_pandas)):
            # An explicit stop index (rather than a negative slice bound) keeps
            # trim_end=0 meaning "drop nothing from the end".
            stop = max(len(sequence) - trim_end, 0)
            fasta_file.write(f">{sequence_id}\n{sequence[trim_start:stop]}\n")


            

# Parse a CD-HIT .clstr output file into a pandas DataFrame (one row per cluster member)
def read_clstr(fn = "similar_to_test.clstr"):
    """Parse a CD-HIT ``.clstr`` file into a pandas DataFrame.

    The file is a series of ``>Cluster N`` headers, each followed by member
    lines such as ``0<TAB>130nt, >some_id... *`` (the representative) or
    ``1<TAB>123nt, >other_id... at +/98.46%``.

    Args:
        fn: Path of the ``.clstr`` file.

    Returns:
        DataFrame with one row per member line and the columns ``Cluster``,
        ``Index``, ``Length``, ``Sequence_ID``, ``Is_Representative`` and
        ``Identity_To_Rep`` (100.0 when the line carries no percentage).
        The columns are present even when the file has no members.

    Raises:
        ValueError: If a non-blank, non-header line is not a member line.
    """
    import re

    member_re = re.compile(
        r"^(?P<index>\d+)\s+"
        r"(?P<length>\d+)(?:nt|aa),\s*"
        r">(?P<seq_id>.+?)\.\.\.\s*"
        r"(?P<tail>.*)$"
    )
    identity_re = re.compile(r"(\d+(?:\.\d+)?)%\s*$")
    columns = [
        "Cluster",
        "Index",
        "Length",
        "Sequence_ID",
        "Is_Representative",
        "Identity_To_Rep",
    ]

    rows = []
    with open(fn, "r") as f:
        current_cluster = None
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            if line.startswith(">Cluster"):
                current_cluster = int(line.split()[-1])
                continue

            match = member_re.match(line)
            if match is None:
                raise ValueError(
                    f"{fn}:{line_no}: unrecognised cluster member line: {line!r}"
                )

            # Only the text after the "..." that terminates the ID is inspected,
            # so IDs containing "at" or "*" cannot be mistaken for markers.
            tail = match.group("tail").strip()
            identity_match = identity_re.search(tail)
            rows.append({
                "Cluster": current_cluster,
                "Index": int(match.group("index")),
                "Length": int(match.group("length")),
                "Sequence_ID": match.group("seq_id"),
                "Is_Representative": tail == "*",
                "Identity_To_Rep": float(identity_match.group(1)) if identity_match else 100.0,
            })
    return pd.DataFrame(rows, columns=columns)

# Helpers to collect the sequence IDs of a FASTA file and to keep only clusters mixing both sources

def read_fasta_ids(filename):
    """Return the set of sequence IDs found on the header lines of a FASTA file.

    A header line is any line starting with ``>``. The ID is the first
    whitespace-delimited token of that line with its leading ``>`` removed;
    adjust the parsing if your IDs follow a different format.
    """
    with open(filename, 'r') as handle:
        headers = (entry for entry in handle if entry.startswith('>'))
        return {header.split()[0].lstrip('>') for header in headers}

def filter_clusters(group):
    """Tell whether a cluster holds both a 'Train' and a 'Test' member.

    ``group`` is the sub-frame of one cluster; its ``Source`` column is
    inspected. Clusters with a single row are always rejected.
    """
    if len(group) <= 1:
        return False
    seen_sources = set(group['Source'])
    return 'Train' in seen_sources and 'Test' in seen_sources

Let's generate a file in `fasta` format. I'll use `train_ss` for this, as it already contains unique `sequence_id` and `sequence` columns. If you prefer to use the original file, make sure to adjust the code to eliminate duplicates. I also removed the adapter sequences (the first 26 and the last 21 bases of every sequence).

In [None]:
# Export the train sequences from parquet to FASTA, the input format used by
# the CD-HIT commands further down.
parquet_file_path = "train_ss_vienna_rna.parquet"
fasta_file_path = "train_seq.fasta"
parquet_to_fasta(parquet_file_path, fasta_file_path)

In [None]:
# Same export for the test sequences. Note that this reuses (overwrites) the
# two path variables set in the previous cell.
parquet_file_path = "test_ss_vienna_rna.parquet"
fasta_file_path = "test_seq.fasta"
parquet_to_fasta(parquet_file_path, fasta_file_path)

We now have the `train_seq.fasta` and `test_seq.fasta` files. Let's run the clustering command next. The output will include two files named `similar_to_test`, but we are mainly interested in the `.clstr` file. This file clusters the `train` and `test` sequences that are, in our case, `85%` similar (the threshold is set with the `-c` option; note that the command below is written with `-c 0.88`, so adjust one or the other to match). Some clusters may be empty, while others will contain `sequence_ids`. Each cluster will also have a representative sequence. Additionally, there will be a column called `Identity_To_Rep`, which indicates how identical the sequences within the cluster are to the `representative` sequence.

In [None]:
# -M: memory limit (MB), -T: number of threads
#sudo cd-hit-est-2d -i test_seq.fasta -i2 train_seq.fasta -c 0.88 -o similar_to_test -T 32 -M 32000

In [None]:




# Parse the clustering output and collect the IDs present in each FASTA input.
df = read_clstr()
train_ids = read_fasta_ids('train_seq.fasta')
test_ids = read_fasta_ids('test_seq.fasta')


def _source_of(seq_id):
    """Name the FASTA file a sequence ID came from: 'Train', 'Test' or 'Unknown'."""
    if seq_id in train_ids:
        return 'Train'
    if seq_id in test_ids:
        return 'Test'
    return 'Unknown'


# Add a `Source` column telling whether each sequence belongs to train or to test.
df['Source'] = df['Sequence_ID'].map(_source_of)

In [None]:
# Drop single-member clusters and keep only the clusters that contain at least
# one train sequence and at least one test sequence.
filtered_df = df.groupby('Cluster').filter(filter_clusters)
# The next cell counts how many of the remaining sequences come from train and from test.


In [None]:
# Number of sequences per source (train vs. test) inside the mixed clusters.
filtered_df["Source"].value_counts()

## SPLIT GENERATION

Let's try to build a validation dataset: we are going to cluster the train sequences with an identity cut-off of 80%.

In [None]:
#!sudo cd-hit-est -i train_seq.fasta -c 0.80 -o clustered_train -T 44 -M 32000

In [2]:
# Parse the clusters written by the cd-hit-est run on train_seq.fasta
# (output prefix `clustered_train`).
df = read_clstr('clustered_train.clstr')

In [3]:
# Number of clusters that contain more than one sequence.
cluster_sizes = df["Cluster"].value_counts()
cluster_sizes[cluster_sizes > 1].shape

(93028,)

In [4]:
# Distinct sequence lengths reported in the cluster file.
df["Length"].unique()

array([130, 159, 123, 108,  68])

In [13]:
# Function to read a fasta file into a DataFrame
def read_fasta_to_df(fasta_file):
    """Load a FASTA file into a DataFrame with `Sequence_ID` and `Sequence` columns.

    Records are parsed with Biopython's ``SeqIO``; each record contributes its
    ``id`` and the string form of its ``seq``.
    """
    return pd.DataFrame([
        {"Sequence_ID": entry.id, "Sequence": str(entry.seq)}
        for entry in SeqIO.parse(fasta_file, "fasta")
    ])

# Function to write a DataFrame into a fasta file
def write_to_fasta(df, fasta_file):
    """Dump a DataFrame to ``fasta_file``, one two-line FASTA record per row.

    The header line is ``>`` followed by the row's `Sequence_ID`; the next
    line is the row's `Sequence`. Other columns are ignored.
    """
    with open(fasta_file, 'w') as out:
        out.writelines(
            f">{record['Sequence_ID']}\n{record['Sequence']}\n"
            for _, record in df.iterrows()
        )

            
# Load every train sequence so cluster members can be joined back to their sequence.
full_fasta_df = read_fasta_to_df("train_seq.fasta")

# Get the unique cluster IDs and shuffle them in place (NumPy global RNG).
unique_clusters = df['Cluster'].unique()
np.random.shuffle(unique_clusters)

# Split the cluster IDs 90/10 into training and validation sets, so that a
# whole cluster always lands on one side of the split.
n_train_clusters = int(0.9 * len(unique_clusters))
train_clusters = unique_clusters[:n_train_clusters]
valid_clusters = unique_clusters[n_train_clusters:]

# Select the member rows of each side and attach their sequences.
train_df = df.loc[df['Cluster'].isin(train_clusters)]
valid_df = df.loc[df['Cluster'].isin(valid_clusters)]
train_df_merged = train_df.merge(full_fasta_df, on="Sequence_ID", how="left")
valid_df_merged = valid_df.merge(full_fasta_df, on="Sequence_ID", how="left")

# Write both sides back out as FASTA for the follow-up similarity check.
write_to_fasta(train_df_merged, "train_split_fold_0.fasta")
write_to_fasta(valid_df_merged, "valid_split_fold_0.fasta")


In [14]:
# (rows, columns) of the training split and of the validation split.
train_df_merged.shape, valid_df_merged.shape

((728826, 7), (77747, 7))

In [15]:
# Distinct sequence lengths present in each split (validation first, then training).
valid_df_merged["Length"].unique(), train_df_merged["Length"].unique()

(array([130, 159, 123, 108,  68]), array([123, 130, 159, 108,  68]))

Now let's run a train vs. valid sequence comparison to find individual train IDs that are close to the validation split (labelled `Test` in the cells below).

In [16]:
#sudo cd-hit-est-2d -i train_split_fold_0.fasta -i2 valid_split_fold_0.fasta -c 0.85 -o similar_sequences_split -T 32 -M 32000

In [17]:
# Parse the train-vs-valid clustering output and collect the IDs of each split.
df_split = read_clstr('similar_sequences_split.clstr')
train_ids = read_fasta_ids('train_split_fold_0.fasta')
valid_ids = read_fasta_ids('valid_split_fold_0.fasta')


def _split_source_of(seq_id):
    """Name the split a sequence ID came from; validation IDs are labelled 'Test'."""
    if seq_id in train_ids:
        return 'Train'
    if seq_id in valid_ids:
        return 'Test'
    return 'Unknown'


# Add a `Source` column telling whether each sequence belongs to train or to
# valid. The validation split keeps the 'Test' label because filter_clusters
# looks for the literal values 'Train' and 'Test'.
df_split['Source'] = df_split['Sequence_ID'].map(_split_source_of)
filtered_df_split = df_split.groupby('Cluster').filter(filter_clusters)

In [18]:
# Members of the mixed clusters per source ('Test' here denotes the validation split).
filtered_df_split["Source"].value_counts()

Test     18080
Train    10595
Name: Source, dtype: int64

It seems we have some sequences that are similar across the two splits. We will take those training IDs and move them to the validation set.

In [19]:
# Re-read the split membership from the FASTA files written for fold 0.
train_ids = read_fasta_ids('train_split_fold_0.fasta')
valid_ids = read_fasta_ids('valid_split_fold_0.fasta')

# Train sequences that share a cluster with a validation sequence: these are
# moved from train to valid. Using a set evaluates the query once and prevents
# an ID from being appended to valid more than once.
leaked_train_ids = set(filtered_df_split.query("Source=='Train'")["Sequence_ID"])

# sorted() makes the row order reproducible: iterating a set of strings can
# give a different order in every kernel session.
train_ids = pd.DataFrame({"sequence_id": sorted(train_ids - leaked_train_ids)})
train_ids["is_train"] = True
valid_ids = pd.DataFrame({"sequence_id": sorted(valid_ids | leaked_train_ids)})
valid_ids["is_train"] = False

In [20]:
# Stack the train and valid ID tables and save the assignment as CSV (index not written).
pd.concat([train_ids, valid_ids], ignore_index=True).to_csv("fold_split.csv", index=False)

In [21]:
# Reload the split assignment that was just written to disk.
split = pd.read_csv("fold_split.csv")

In [22]:
# Number of distinct sequence IDs in the split file.
split["sequence_id"].unique().shape

(806573,)

In [23]:
# Load the train table (read with Polars, converted to pandas).
# NOTE(review): this rebinds `df`, which held the cluster table in earlier cells.
df = pl.read_parquet("train_ss_vienna_rna.parquet").to_pandas()

In [24]:
# Attach the `is_train` flag to every row. pd.merge defaults to an inner join,
# so rows whose sequence_id is absent from the split file are dropped.
# NOTE(review): not idempotent - running this cell a second time merges again
# and yields suffixed `is_train_x` / `is_train_y` columns.
df = pd.merge(df, split, on="sequence_id")

In [25]:
# Length of the full sequence as stored in the parquet file (i.e. before the
# end-trimming that parquet_to_fasta applies when writing the FASTA files).
df["L"] = df["sequence"].map(len)

In [26]:
# Sequence-length distribution of the validation rows (is_train == False).
df.query("is_train==False")["L"].value_counts()

177    86048
170      903
206      838
115      319
155      234
Name: L, dtype: int64

In [27]:
# Sequence-length distribution of the training rows (is_train == True).
df.query("is_train==True")["L"].value_counts()

177    698124
170     14097
115      2410
155      1939
206      1661
Name: L, dtype: int64