In [1]:
import polars as pl
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
import numpy as np
import torch 
import random
import os
import seaborn as sbn
from sklearn.model_selection import train_test_split

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global RNG and PyTorch (CPU and all CUDA
    devices), and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects child processes: the hash seed of the running interpreter
    # is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select different kernels from run to run, which
    # defeats `deterministic = True`; it has to be switched off.
    torch.backends.cudnn.benchmark = False
seed_everything(2023)


def parquet_to_fasta(parquet_file_path, fasta_file_path, trim_5p=26, trim_3p=21):
    """Write the unique sequences of a parquet file to a FASTA file.

    Rows are de-duplicated on `sequence_id` (first occurrence kept) and every
    sequence is trimmed at both ends before being written.

    Args:
        parquet_file_path: Parquet file with `sequence_id` and `sequence` columns.
        fasta_file_path: Destination FASTA file (overwritten if it exists).
        trim_5p: Number of leading characters dropped from each sequence.
        trim_3p: Number of trailing characters dropped from each sequence.

    Raises:
        ValueError: If a required column is missing from the parquet file.
    """
    # Read the parquet file using Polars
    df = pl.read_parquet(parquet_file_path)

    # Check if the required columns are present in the DataFrame
    missing = {'sequence_id', 'sequence'} - set(df.columns)
    if missing:
        raise ValueError(
            f"{parquet_file_path} is missing required column(s): {sorted(missing)}"
        )

    # Convert to Pandas DataFrame
    df_pandas = df.to_pandas().drop_duplicates(subset=['sequence_id'])

    # Open a file to write the FASTA format
    with open(fasta_file_path, 'w') as fasta_file:
        # Iterate over the two columns directly (much cheaper than iterrows)
        # and give tqdm the total so it can render a real progress bar.
        records = zip(df_pandas['sequence_id'], df_pandas['sequence'])
        for sequence_id, sequence in tqdm(records, total=len(df_pandas)):
            # An explicit end index keeps trim_3p=0 meaningful (`seq[a:-0]`
            # would be empty); clamping at 0 mirrors negative-slice behaviour
            # for sequences shorter than the trim.
            end = max(len(sequence) - trim_3p, 0)
            fasta_file.write(f">{sequence_id}\n{sequence[trim_5p:end]}\n")


            

#code to convert output to pandas dataframe
# Initialize an empty list to collect rows
def read_clstr(fn = "similar_to_test.clstr"):
    """Parse a CD-HIT `.clstr` file into a DataFrame.

    Args:
        fn: Path to the `.clstr` file.

    Returns:
        DataFrame with one row per clustered sequence and the columns
        `cluster`, `index`, `length`, `sequence_id`, `is_Representative`
        and `identity_To_Rep` (100.0 for representative sequences). The
        columns are present even when the file holds no sequences.
    """
    columns = ["cluster", "index", "length", "sequence_id",
               "is_Representative", "identity_To_Rep"]
    rows = []

    # Read the cluster file
    with open(fn, "r") as f:
        current_cluster = None
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            if line.startswith(">Cluster"):
                current_cluster = int(line.split()[-1])
                continue

            # Member line, e.g. "0\t130nt, >d62116c4c8f0... at +/83.85%"
            # or "1\t130nt, >2e291021606c... *" for the representative.
            parts = line.split("\t")
            index = int(parts[0])
            length_field, _, remainder = parts[1].partition(",")
            # Drop the unit suffix ("nt" for nucleotides, "aa" for proteins).
            length = int(length_field.strip().rstrip("ant"))
            seq_id, _, status = remainder.partition(">")[2].partition("...")
            status = status.strip()

            # Inspect only the text after the id, so that an id containing
            # "at" or "*" cannot be mistaken for the status marker.
            is_rep = status == "*"
            if status.startswith("at "):
                identity = float(status.split()[-1].split("/")[-1].rstrip("%"))
            else:
                identity = 100.0

            # Append this as a row
            rows.append({
                "cluster": current_cluster,
                "index": index,
                "length": length,
                "sequence_id": seq_id,
                "is_Representative": is_rep,
                "identity_To_Rep": identity
            })
    return pd.DataFrame(rows, columns=columns)

# Convert to a DataFrame

def read_fasta_ids(filename):
    """Collect the record ids found in a FASTA file.

    The id is the first whitespace-delimited token of each header line with
    its leading '>' characters removed.

    Args:
        filename: Path to the FASTA file.

    Returns:
        Set of id strings.
    """
    with open(filename, 'r') as handle:
        return {
            header.split()[0].lstrip('>')
            for header in handle
            if header.startswith('>')
        }

def filter_clusters(group):
    """Tell whether a cluster mixes both sources.

    Args:
        group: Rows of one cluster, with a `source` column.

    Returns:
        True when the group contains at least one 'Train' row and at least
        one 'Test' row (and more than one row overall), otherwise False.
    """
    sources = group['source'].values
    if 'Train' not in sources:
        return False
    if 'Test' not in sources:
        return False
    return len(group) > 1



Let's generate a file in `fasta` format. I'll use `train_ss` for this, as it already contains unique `sequence_id` and `sequence` columns. If you prefer to use the original file, make sure to adjust the code to eliminate duplicates. I also removed the adapter sequences.

In [None]:
# Export the training sequences to FASTA so that cd-hit can read them.
parquet_file_path = "train_ss_vienna_rna.parquet"
fasta_file_path = "train_seq.fasta"
parquet_to_fasta(parquet_file_path, fasta_file_path)

In [None]:
# Same export for the test sequences.
parquet_file_path = "test_ss_vienna_rna.parquet"
fasta_file_path = "test_seq.fasta"
parquet_to_fasta(parquet_file_path, fasta_file_path)

We now have `train_seq.fasta` and `test_seq.fasta` files. Let's run the clustering command next. The output will include two files named `similar_to_test`, but we are mainly interested in the `.clstr` file. This file clusters the `train` and `test` sequences that are, in our case, `85%` similar (the similarity threshold is whatever value is passed to `-c` in the command below). Some clusters may be empty, while others will contain `sequence_ids`. Each cluster will also have a representative sequence. Additionally, there will be a column called `identity_To_Rep`, which indicates how identical the sequences within the cluster are to the `representative` sequence.

In [None]:
# -M: memory limit in MB, -T: number of threads
#sudo cd-hit-est-2d -i test_seq.fasta -i2 train_seq.fasta -c 0.88 -o similar_to_test -T 32 -M 32000

In [None]:

# Parse the train-vs-test clustering and collect the ids written to each FASTA file.
df = read_clstr()
train_ids = read_fasta_ids('train_seq.fasta')
test_ids = read_fasta_ids('test_seq.fasta')
# Add a `source` column telling whether each sequence belongs to train or test.
# The id column produced by read_clstr is `sequence_id` (lower case);
# indexing with 'sequence_ID' raised a KeyError.
# An id present in both files is labelled 'Train', since that is checked first.
df['source'] = df['sequence_id'].apply(lambda x: 'Train' if x in train_ids else ('Test' if x in test_ids else 'Unknown'))

In [None]:
# Drop singleton clusters and keep only the clusters that contain at least one train and one test sequence.
filtered_df = df.groupby('cluster').filter(filter_clusters)
# Next: count how many of the remaining sequences come from train and from test.


In [None]:
# Number of train / test sequences that sit in a mixed train+test cluster.
filtered_df["source"].value_counts()

## SPLIT GENERATION

Let's try to build a validation dataset: we are going to cluster the training sequences with an identity cut-off of 80%.

In [4]:
#!sudo cd-hit-est -i train_seq.fasta -c 0.80 -o clustered_train -T 44 -M 32000

In [5]:
# Read the clustered file produced by cd-hit-est for the training sequences.
df = read_clstr('clustered_train.clstr')
df.shape

(806573, 6)

In [6]:
df

Unnamed: 0,cluster,index,length,sequence_id,is_Representative,identity_To_Rep
0,0,0,130,d62116c4c8f0,False,83.85
1,0,1,130,2e291021606c,False,82.31
2,0,2,130,9c4e99a4c2ba,False,89.23
3,0,3,130,cc2c6b77b4bf,False,83.85
4,0,4,130,f2eb45fbe514,False,86.92
...,...,...,...,...,...,...
806568,317733,0,68,4eb60e538293,True,100.00
806569,317734,0,68,21d1734994ca,True,100.00
806570,317735,0,68,a8404fb3410c,True,100.00
806571,317736,0,68,230e324ae8dc,True,100.00


In [7]:
# Count the clusters holding more than one sequence (printed as a shape tuple).
cluster_sizes = df['cluster'].value_counts()
print(f"unique_number_of_clusters: {cluster_sizes[cluster_sizes > 1].shape}")

unique_number_of_clusters: (93029,)


In [8]:
# Distinct sequence lengths reported in the .clstr file (lengths of the trimmed FASTA sequences).
df["length"].unique()

array([130, 159, 123, 108,  68])

In [9]:
# Get the SNR data: per-sequence mean of SN_filter and the full sequence length L.
# Only the three columns used below are read; the file is several hundred
# columns wide, so loading all of it just for this summary is wasteful.
sr_df = pd.read_parquet('../data/train_data.parquet', columns=["sequence_id", "sequence", "SN_filter"])
sr_df["L"] = sr_df["sequence"].map(len)
# A sequence_id can occur on several rows; average over them.
sr_df = sr_df.groupby("sequence_id")[["SN_filter", "L"]].agg("mean").reset_index()

In [10]:
sr_df

Unnamed: 0,sequence_id,SN_filter,L
0,00005a0b365f,0.0,177.0
1,00006c296445,0.0,177.0
2,0000c9fe9c6f,0.0,177.0
3,0000d87cab97,1.0,177.0
4,0000dadc9e14,0.0,177.0
...,...,...,...
806568,ffffde700333,0.5,177.0
806569,ffffe6075b10,0.0,177.0
806570,ffffea5adcdc,0.0,177.0
806571,fffff1a0b9c7,0.5,177.0


In [11]:
#merge cluster df and snr df
df = pd.merge(df, sr_df, on="sequence_id")
df.drop(columns=["index", "length"], inplace=True)

In [12]:
df

Unnamed: 0,cluster,sequence_id,is_Representative,identity_To_Rep,SN_filter,L
0,0,d62116c4c8f0,False,83.85,1.0,177.0
1,0,2e291021606c,False,82.31,1.0,177.0
2,0,9c4e99a4c2ba,False,89.23,0.5,177.0
3,0,cc2c6b77b4bf,False,83.85,0.0,177.0
4,0,f2eb45fbe514,False,86.92,0.0,177.0
...,...,...,...,...,...,...
806568,317733,4eb60e538293,True,100.00,1.0,115.0
806569,317734,21d1734994ca,True,100.00,1.0,115.0
806570,317735,a8404fb3410c,True,100.00,0.9,115.0
806571,317736,230e324ae8dc,True,100.00,1.0,115.0


In [13]:
def generate_split(df, n_splits=5, test_size=0.14, random_state=None):
    """Draw repeated cluster-level train/validation splits.

    Whole clusters are assigned to one side, so sequences sharing a cluster
    never straddle train and validation. Each split is an independent random
    draw; the validation parts of different splits may overlap (these are not
    disjoint K-fold partitions).

    Args:
        df: DataFrame with a `cluster` column.
        n_splits: Number of independent splits to generate.
        test_size: Share of clusters placed in validation (passed to
            `train_test_split`).
        random_state: Optional integer seed. Split `i` uses `random_state + i`,
            making the result independent of the global NumPy RNG state. With
            None, `train_test_split` draws from the global RNG.

    Returns:
        List of `(train_df, valid_df)` tuples, one per split.
    """
    # Getting unique clusters (loop-invariant, so computed once)
    unique_clusters = df['cluster'].unique()

    splits = []
    for split_idx in range(n_splits):
        seed = None if random_state is None else random_state + split_idx

        # Splitting unique clusters into train and validation
        train_clusters, valid_clusters = train_test_split(
            unique_clusters, test_size=test_size, random_state=seed
        )

        # Splitting the original df based on train and validation clusters
        train_df = df[df['cluster'].isin(train_clusters)]
        valid_df = df[df['cluster'].isin(valid_clusters)]

        splits.append((train_df, valid_df))
    return splits

splits = generate_split(df)

In [14]:
# Summarise every candidate split: sizes, sequence lengths present, and the
# same figures restricted to rows with SN_filter above 0.48.
for t, v in splits:
    t_snr = t.query('SN_filter>0.48')
    v_snr = v.query('SN_filter>0.48')
    print('___')
    print(f"Train: {t.shape}, Valid: {v.shape}")
    print(f"Train L: {t['L'].unique()}, Valid L: {v['L'].unique()}")
    print(f"Train SNR: {t_snr.shape[0]}, Valid SNR: {v_snr.shape[0]}")
    print(f"Train SNR and L: {t_snr['L'].unique()}, Valid SNR and L: {v_snr['L'].unique()}")
    

___
Train: (688989, 6), Valid: (117584, 6)
Train L: [170. 177. 206. 155. 115.], Valid L: [177. 206. 170. 155. 115.]
Train SNR: 206939, Valid SNR: 35632
Train SNR and L: [170. 177. 206. 155. 115.], Valid SNR and L: [177. 206. 170. 155. 115.]
___
Train: (697349, 6), Valid: (109224, 6)
Train L: [177. 206. 170. 155. 115.], Valid L: [170. 177. 206. 155. 115.]
Train SNR: 208483, Valid SNR: 34088
Train SNR and L: [177. 206. 170. 155. 115.], Valid SNR and L: [170. 177. 206. 155. 115.]
___
Train: (686176, 6), Valid: (120397, 6)
Train L: [177. 206. 170. 155. 115.], Valid L: [170. 177. 206. 155. 115.]
Train SNR: 206297, Valid SNR: 36274
Train SNR and L: [177. 206. 170. 155. 115.], Valid SNR and L: [170. 177. 206. 155. 115.]
___
Train: (699889, 6), Valid: (106684, 6)
Train L: [177. 206. 170. 155. 115.], Valid L: [177. 170. 155. 115.]
Train SNR: 210973, Valid SNR: 31598
Train SNR and L: [177. 206. 170. 155. 115.], Valid SNR and L: [177. 170. 155. 115.]
___
Train: (690096, 6), Valid: (116477, 6)
Tra

In [15]:
# Function to read a fasta file into a DataFrame
def read_fasta_to_df(fasta_file):
    """Load a FASTA file into a DataFrame.

    Args:
        fasta_file: Path (or handle) accepted by `SeqIO.parse`.

    Returns:
        DataFrame with one row per record and the columns `sequence_id`
        (record id) and `sequence` (record sequence as a string).
    """
    return pd.DataFrame(
        [
            {"sequence_id": entry.id, "sequence": str(entry.seq)}
            for entry in SeqIO.parse(fasta_file, "fasta")
        ]
    )

# Function to write a DataFrame into a fasta file
def write_to_fasta(df, fasta_file):
    """Write a DataFrame to a FASTA file.

    Args:
        df: DataFrame with `sequence_id` and `sequence` columns.
        fasta_file: Destination path (overwritten if it exists).
    """
    with open(fasta_file, 'w') as handle:
        for _, record in df.iterrows():
            # One header line followed by one sequence line per row.
            handle.write(f">{record['sequence_id']}\n{record['sequence']}\n")

# Take the first generated split, look up each id's sequence in the full
# training FASTA, and write the two halves out as separate FASTA files.
full_fasta_df = read_fasta_to_df("train_seq.fasta")
train_df, valid_df = splits[0]

train_df_merged = train_df.merge(full_fasta_df, on="sequence_id", how="left")
write_to_fasta(train_df_merged, "train_split_fold_0.fasta")

valid_df_merged = valid_df.merge(full_fasta_df, on="sequence_id", how="left")
write_to_fasta(valid_df_merged, "valid_split_fold_0.fasta")

Now let's compare the train and valid sequences to find out whether any individual training ids are too close to the validation ("test") sequences.

In [None]:
#sudo cd-hit-est-2d -i train_split_fold_0.fasta -i2 valid_split_fold_0.fasta -c 0.85 -o similar_sequences_split -T 32 -M 32000

In [3]:
df_split_clusters = read_clstr('similar_sequences_split.clstr')
train_ids = read_fasta_ids('train_split_fold_0.fasta')
valid_ids = read_fasta_ids('valid_split_fold_0.fasta')


def label_source(seq_id):
    """Label an id by the split it belongs to.

    Validation ids are labelled 'Test' because `filter_clusters` looks for
    the 'Train' / 'Test' pair.
    """
    if seq_id in train_ids:
        return 'Train'
    if seq_id in valid_ids:
        return 'Test'
    return 'Unknown'


# Add a `source` column, then keep only clusters that mix both splits.
df_split_clusters['source'] = df_split_clusters['sequence_id'].apply(label_source)
filtered_df_split = df_split_clusters.groupby('cluster').filter(filter_clusters)

In [5]:
# Sequences per side that share a cluster with the other side ('Test' is the validation split here).
filtered_df_split["source"].value_counts()

Test     27538
Train    14007
Name: source, dtype: int64

It seems we have some training sequences that are similar to validation sequences. We will take those training ids and move them to the validation set.

In [6]:
# Training ids that cluster together with validation sequences; they are
# moved from the training side to the validation side.
moved_ids = filtered_df_split.query("source=='Train'")["sequence_id"].to_list()

train_ids = read_fasta_ids('train_split_fold_0.fasta')
valid_ids = read_fasta_ids('valid_split_fold_0.fasta')

# Validation table: the original validation ids plus the moved ones.
valid_ids = pd.DataFrame({"sequence_id": list(valid_ids) + moved_ids})
valid_ids["is_train"] = False

# Training table: the original training ids minus the moved ones.
train_ids = (
    pd.DataFrame({"sequence_id": list(train_ids)})
    .set_index("sequence_id")
    .drop(moved_ids)
    .reset_index()
)
train_ids["is_train"] = True

In [7]:
# Stack the train (is_train=True) and validation (is_train=False) ids and save them as the split lookup table.
pd.concat([train_ids, valid_ids], ignore_index=True).to_csv("fold_split.csv", index=False)

# DONE

In [2]:
# Reload the training data, attach the saved split flag to every row and
# separate the rows into the train and validation frames.
split = pd.read_csv("fold_split.csv")
df = pd.read_parquet('../data/train_data.parquet')
df["L"] = df["sequence"].map(len)
df = pd.merge(df, split, on='sequence_id')

df_train = df[df["is_train"].eq(True)].reset_index(drop=True)
df_valid = df[df["is_train"].eq(False)].reset_index(drop=True)


In [3]:
# Row / column counts of the train and validation frames.
df_train.shape, df_valid.shape

((1375474, 421), (268206, 421))

In [4]:
# Length distribution of the validation rows of the 2A3_MaP experiment whose SN_filter is below 1.
df_valid.query('SN_filter<1').query("experiment_type=='2A3_MaP'")["L"].value_counts()

177    99636
155      142
115      136
170       79
206       13
Name: L, dtype: int64

In [5]:
# Length distribution over all validation rows.
df_valid["L"].value_counts()

177    254358
170      5518
115      4450
155      2196
206      1684
Name: L, dtype: int64

In [6]:
# Leakage check: ids present in both train and validation (expected to be empty).
set(df_train["sequence_id"]) & set(df_valid["sequence_id"])

set()

In [33]:
# Ids that occur in both the training data and the test sequences are forced
# into the validation side of the split.
# Only the `sequence_id` column is read from each file; loading every column
# just to collect the ids is unnecessary.
# NOTE: `df_train` / `df_test` are sets of ids here, not DataFrames.
df_train = set(pd.read_parquet('../data/train_data.parquet', columns=['sequence_id'])['sequence_id'].unique())
df_test = set(pd.read_csv('../data/test_sequences.csv', usecols=['sequence_id'])['sequence_id'].unique())
combined = list(df_train.intersection(df_test))

# Remove the overlapping ids from wherever they currently are, then re-add
# them flagged as validation (is_train=False).
split = (
    pd.read_csv('fold_split.csv')
    .set_index('sequence_id')
    .drop(combined)
    .reset_index()
)
split = pd.concat([split, pd.DataFrame({"sequence_id": combined, "is_train" : False})], ignore_index=True)
split.to_csv("fold_split.csv", index=False)

In [36]:
# Fraction of sequence ids assigned to train (True) and to validation (False).
split['is_train'].value_counts()/split.shape[0]

True     0.797718
False    0.202282
Name: is_train, dtype: float64