In [None]:
from Bio import SeqIO
from sklearn.cluster import KMeans
from collections import Counter
import numpy as np
from Bio import pairwise2
import pandas as pd
from collections import defaultdict
import multiprocessing
import itertools
# Step 1: Read FASTA File and Convert Sequences to Feature Vectors

# Function to convert a sequence to a k-mer frequency vector
def seq_to_kmer_freq(seq, k=3, alphabet="ACGU"):
    """Return the k-mer count vector of ``seq``.

    Args:
        seq: Sequence string to featurise.
        k: Length of the k-mers to count (must be >= 1).
        alphabet: Symbols used to enumerate the k-mer vocabulary. The
            default reproduces the RNA alphabet A, C, G, U.

    Returns:
        A 1-D numpy array with ``len(alphabet) ** k`` entries, one count per
        k-mer in ``itertools.product`` order. K-mers of ``seq`` containing
        symbols outside ``alphabet`` are not counted. A sequence shorter than
        ``k`` yields an all-zero vector.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Derive the vocabulary from k instead of reading a fixed global 3-mer
    # list, so the vector stays meaningful when k is not 3.
    vocabulary = ["".join(p) for p in itertools.product(alphabet, repeat=k)]
    kmer_freq = Counter(seq[i:i + k] for i in range(len(seq) - k + 1))
    return np.array([kmer_freq.get(kmer, 0) for kmer in vocabulary])

# Load every FASTA record into an {id: sequence string} mapping
fasta_file = "../data/sequence_libraries/pseudoknot90_puzzle_11387276.tsv.RNA_sequences.fa"
sequences = {rec.id: str(rec.seq) for rec in SeqIO.parse(fasta_file, "fasta")}

# Enumerate all 3-mers over the four RNA nucleotides
nucleotides = ['A', 'C', 'G', 'U']
all_kmers = list(map(''.join, itertools.product(nucleotides, repeat=3)))

# Stack one k-mer count vector per sequence into the feature matrix
X = np.array(list(map(seq_to_kmer_freq, sequences.values())))

# Step 2: Cluster Sequences Using k-means
# random_state pins the centroid initialisation so the cluster labels (and
# everything derived from them) are reproducible across runs. n_init is set
# explicitly so the number of restarts does not depend on the installed
# scikit-learn version's default.
kmeans = KMeans(n_clusters=10, random_state=0, n_init=10)  # Choose an appropriate number of clusters
labels = kmeans.fit_predict(X)

# Group sequences by their cluster label
clustered_sequences = defaultdict(list)
for label, seq_id in zip(labels, sequences.keys()):
    clustered_sequences[label].append(seq_id)

# Step 3: Calculate Pairwise Sequence Similarities Within Each Cluster

def calc_similarities(cluster, top_n=5):
    """Find the most similar cluster-mates of every sequence in one cluster.

    Reads the module-level ``clustered_sequences`` (cluster label -> list of
    sequence ids) and ``sequences`` (sequence id -> sequence string).

    Args:
        cluster: Key into ``clustered_sequences``.
        top_n: Number of best-scoring neighbours to keep per sequence.

    Returns:
        Dict mapping each sequence id in the cluster to a list of
        ``(other_seq_id, score)`` tuples, sorted by descending score and
        truncated to ``top_n`` entries.
    """
    seq_ids = clustered_sequences[cluster]

    # globalxx scores matches as 1 with no mismatch or gap penalties, so the
    # score of (a, b) equals that of (b, a): align every unordered pair once.
    # score_only=True skips building the alignments, which are never used.
    pair_scores = {}
    for (i, id_a), (j, id_b) in itertools.combinations(enumerate(seq_ids), 2):
        pair_scores[i, j] = pairwise2.align.globalxx(
            sequences[id_a], sequences[id_b], score_only=True
        )

    similarities = {}
    for i, seq_id1 in enumerate(seq_ids):
        # Keep the original j-order so ties are broken the same way by the
        # stable sort below.
        sim_scores = [
            (seq_id2, pair_scores[(i, j) if i < j else (j, i)])
            for j, seq_id2 in enumerate(seq_ids)
            if i != j
        ]
        similarities[seq_id1] = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:top_n]
    return similarities

# Step 4: Use Multiprocessing to Speed Up the Calculations with 8 CPUs
# One task per cluster label; results come back in the same order as the labels.
with multiprocessing.Pool(processes=8) as pool:
    all_similarities = pool.map(calc_similarities, list(clustered_sequences))

# Flatten the per-cluster dictionaries into (seq_id, similar_seq_id, score) rows
df_rows = [
    (seq_id, neighbour_id, neighbour_score)
    for cluster_sim in all_similarities
    for seq_id, top5 in cluster_sim.items()
    for neighbour_id, neighbour_score in top5
]
df = pd.DataFrame(df_rows, columns=["seq_id", "similar_seq_id", "score"])
#df.to_csv("top5_similar_sequences.csv", index=False)


In [None]:
# Show the neighbour pairs whose raw alignment score exceeds 90
df.query("score>90")

In [None]:
# Persist the neighbour table; a later cell reloads it from this CSV
df.to_csv("top5_similar_sequences.csv", index=False)

In [None]:
# One row per seq_id, with its neighbour ids and scores gathered into lists
df.groupby('seq_id').agg(list)

In [None]:
import pandas as pd
import polars as pl

In [None]:
# Reload the saved neighbour table, replacing whatever `df` currently holds
df = pd.read_csv("top5_similar_sequences.csv")

In [None]:
# Rescale the raw scores by a fixed divisor of 157.
# NOTE(review): 157 is presumably the common sequence length — confirm.
# The column is overwritten, so running this cell again divides a second time;
# re-run the read_csv cell first if it needs to be repeated.
df['score'] = df['score']/157

In [None]:
# Order rows from most to least similar. Rebinding the name instead of using
# inplace=True leaves no partially-mutated frame behind.
df = df.sort_values(by='score', ascending=False)

In [None]:
# Number of distinct seq_id values having at least one neighbour scoring above 0.95
df.query("score>0.95")['seq_id'].unique().shape

In [None]:
# Total number of distinct seq_id values in the table
df['seq_id'].unique().shape

In [None]:
from Bio import SeqIO
from sklearn.cluster import KMeans
from collections import Counter
import numpy as np
from Bio import pairwise2
import pandas as pd
from collections import defaultdict
import multiprocessing


In [None]:
# Rebuild the {record id: sequence string} lookup from the FASTA file
fasta_file = "../data/sequence_libraries/pseudoknot90_puzzle_11387276.tsv.RNA_sequences.fa"
sequences = {rec.id: str(rec.seq) for rec in SeqIO.parse(fasta_file, "fasta")}

In [None]:
# Pull two specific sequences by hard-coded id; raises KeyError if either id
# is absent from the FASTA file.
seq1= sequences['11445334']
seq2 = sequences['11445332']

In [None]:
# Preview the first 40 rows of the neighbour table
df.head(40)

In [None]:
import matplotlib.pyplot as plt

def plot_nucleotide_match(seq1, seq2):
    """Draw a one-row strip showing where two sequences agree position by position.

    Each compared position is drawn as a dot: green when both sequences carry
    the same character there, red otherwise. Only the first
    ``min(len(seq1), len(seq2))`` positions are compared; no alignment is done.

    Args:
        seq1: First sequence string.
        seq2: Second sequence string.
    """
    # zip stops at the shorter sequence, so only the shared prefix is coloured
    colors = []
    for base_a, base_b in zip(seq1, seq2):
        colors.append('g' if base_a == base_b else 'r')
    overlap = len(colors)

    fig, ax = plt.subplots(figsize=(10, 1))  # Width x Height
    ax.set_ylim(0.5, 1.5)  # Keep the single row of dots vertically centred

    ax.scatter(range(overlap), [1] * overlap, c=colors)
    ax.set_yticks([])  # The y-axis carries no information
    ax.set_xlabel('Nucleotide Position')
    ax.set_title('Nucleotide Match between seq1 and seq2')

    plt.show()


In [None]:
# Draw the match strip for every (seq_id, similar_seq_id) row of df.
# Ids are passed through str() before being looked up in `sequences`.
for i, k in df[["seq_id", "similar_seq_id"]].itertuples(index=False, name=None):
    plot_nucleotide_match(sequences[str(i)], sequences[str(k)])
    plt.pause(0.1)

In [None]:
# Relies on `i` left over from the plotting loop in the previous cell, so this
# shows the sequence of the last seq_id that loop visited.
sequences[str(i)]

In [None]:
!ls 

In [None]:
from Bio import SeqIO
import pandas as pd

fasta_file = "../data/sequence_libraries/pseudoknot90_puzzle_11387276.tsv.RNA_sequences.fa"

# One {"sequence_id", "sequence"} record per FASTA entry, in file order
sequences = [
    {"sequence_id": rec.id, "sequence": str(rec.seq)}
    for rec in SeqIO.parse(fasta_file, "fasta")
]

df = pd.DataFrame(sequences)
from collections import defaultdict

clstr_file = "cdhit_output.clstr"

# Initialize a dictionary to hold sequence IDs and their corresponding cluster IDs
seq_to_cluster = defaultdict(str)

# Read and parse the .clstr file.
# Lines starting with ">Cluster" open a new cluster; every other non-blank
# line is treated as a member whose id sits between the first ">" and "...".
current_cluster_id = None
with open(clstr_file, 'r') as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        if line.startswith(">Cluster"):
            # split() without an argument copes with tabs / repeated spaces
            current_cluster_id = line.split()[1]
        else:
            if ">" not in line or "..." not in line:
                raise ValueError(
                    f"{clstr_file}:{line_number}: unrecognised cluster member line: {line!r}"
                )
            sequence_id = line.partition(">")[2].partition("...")[0]
            seq_to_cluster[sequence_id] = current_cluster_id
            
from Bio import pairwise2
from collections import defaultdict
import numpy as np

def calculate_similarity(sub_df):
    """Average normalised alignment score of each sequence against the others.

    For every pair of rows the ``globalxx`` alignment score is divided by the
    length of the shorter sequence; each sequence's result is the mean of its
    normalised scores against all other rows.

    Args:
        sub_df: DataFrame with ``sequence`` and ``sequence_id`` columns.

    Returns:
        ``defaultdict(float)`` mapping sequence id to its mean normalised
        score. With a single row there is nothing to compare against and the
        value is NaN; with no rows the mapping is empty.
    """
    seqs = sub_df['sequence'].tolist()
    sequence_ids = sub_df['sequence_id'].tolist()
    count = len(seqs)
    avg_similarity_scores = defaultdict(float)

    # Both the globalxx score and the min-length normaliser are symmetric in
    # the pair, so each unordered pair is aligned once. score_only=True avoids
    # building alignments that are never inspected.
    pair_scores = {}
    for i in range(count):
        for j in range(i + 1, count):
            raw_score = pairwise2.align.globalxx(seqs[i], seqs[j], score_only=True)
            pair_scores[i, j] = raw_score / min(len(seqs[i]), len(seqs[j]))  # Normalized by sequence length

    for i in range(count):
        scores = [
            pair_scores[(i, j) if i < j else (j, i)]
            for j in range(count)
            if i != j
        ]
        # np.mean([]) would emit a RuntimeWarning; state the NaN explicitly
        avg_similarity_scores[sequence_ids[i]] = np.mean(scores) if scores else np.nan

    return avg_similarity_scores

import matplotlib.pyplot as plt
import numpy as np

def plot_sequence_similarity(sequences):
    """Draw a grid marking which positions are identical across all sequences.

    One row per sequence, one cell per position. A cell is green when every
    sequence carries the same character at that position and red otherwise.
    No alignment is performed; positions are compared by index.

    Args:
        sequences: List of sequence strings. Sequences are expected to share
            one length; where they do not, positions missing from any
            sequence are coloured red and shorter rows simply end early.

    Returns:
        None. The figure is shown with ``plt.show()``. An empty list draws
        nothing.
    """
    num_sequences = len(sequences)
    if num_sequences == 0:
        return  # nothing to plot

    sequence_length = max(len(seq) for seq in sequences)

    # Whether a column is conserved does not depend on the row being drawn,
    # so decide each column's colour once instead of once per cell.
    column_colors = []
    for j in range(sequence_length):
        symbols = {seq[j] if j < len(seq) else None for seq in sequences}
        column_colors.append('g' if len(symbols) == 1 else 'r')  # green / red

    fig, ax = plt.subplots(figsize=(20, num_sequences))

    for i, seq in enumerate(sequences):
        for j in range(len(seq)):
            ax.add_patch(plt.Rectangle((j, i), 1, 1, color=column_colors[j]))

    ax.set_xlim(0, sequence_length)
    ax.set_ylim(0, num_sequences)
    ax.set_aspect('auto')

    plt.show()


In [None]:
# Attach each sequence's cluster id (parsed from the .clstr file) and cast it to int.
# NOTE(review): seq_to_cluster is a defaultdict(str), so an id missing from the
# .clstr file presumably maps to '' and int('') would raise — confirm that every
# sequence_id appears in the cluster file.
df["cluster_id"] = df["sequence_id"].map(seq_to_cluster)
df["cluster_id"] = df["cluster_id"].apply(int)

In [None]:
# Inspect the members of cluster 1
df.query("cluster_id==1")

In [None]:
# Display the sequence table with its cluster_id column
df

In [None]:
# Mean normalised pairwise similarity for each member of cluster 3
calculate_similarity(df.query("cluster_id==3"))

In [None]:
# Position-wise conservation grid for the sequences of cluster 1
plot_sequence_similarity(df.query("cluster_id==1")['sequence'].tolist())

In [None]:
# Number of distinct cluster ids assigned
np.unique(df["cluster_id"].values).shape

In [8]:
import polars as pl

def parquet_to_fasta(parquet_file_path, fasta_file_path):
    """Write the sequences stored in a Parquet file out as a FASTA file.

    Args:
        parquet_file_path: Path of the Parquet file to read. It must contain
            ``sequence_id`` and ``sequence`` columns.
        fasta_file_path: Path of the FASTA file to create (overwritten if it
            already exists).

    Raises:
        ValueError: If either required column is missing.
    """
    # Read the parquet file using Polars
    df = pl.read_parquet(parquet_file_path)

    # Check if the required columns are present in the DataFrame
    if "sequence_id" not in df.columns or "sequence" not in df.columns:
        raise ValueError("The Parquet file must contain 'sequence_id' and 'sequence' columns.")

    # Open a file to write the FASTA format
    with open(fasta_file_path, 'w') as fasta_file:
        # Polars DataFrames have no pandas-style iterrows(); iter_rows()
        # yields one plain tuple per row, in the order of the selected columns.
        for sequence_id, sequence in df.select(["sequence_id", "sequence"]).iter_rows():
            # Write each sequence to the FASTA file in the format:
            # >sequence_id
            # sequence
            fasta_file.write(f">{sequence_id}\n")
            fasta_file.write(f"{sequence}\n")


In [7]:
    # Preview the Parquet table (columns shown in the output below)
    pl.read_parquet("train_ss_vienna_rna.parquet")

sequence_id,sequence,ss_full,ss_full_mfe,ss_roi,ss_roi_mfe
str,str,str,f64,str,f64
"""182b58b24008""","""GGGAACGACUCGAG…",""".(((.((((((...…",-34.200001,"""..............…",-23.1
"""cd185aaf6004""","""GGGAACGACUCGAG…",""".....((((((...…",-48.0,"""(((((((..(((..…",-37.299999
"""78262f2fb07a""","""GGGAACGACUCGAG…",""".....((((((...…",-38.799999,"""....((((.(((((…",-28.200001
"""3418ecb4a0df""","""GGGAACGACUCGAG…",""".....((((((...…",-63.099998,"""((((((.....)))…",-50.200001
"""d3f087530d0b""","""GGGAACGACUCGAG…",""".....((((((...…",-65.599998,"""((((((...((((.…",-55.299999
"""29a830d3f7e9""","""GGGAACGACUCGAG…",""".(((.((((((...…",-41.900002,"""..............…",-30.6
"""f00e92d44902""","""GGGAACGACUCGAG…",""".....((((((...…",-49.0,"""(((.(((((...((…",-38.900002
"""22d14a849330""","""GGGAACGACUCGAG…","""((...((((((...…",-49.099998,"""(((((....)))))…",-36.799999
"""2454e1d567ce""","""GGGAACGACUCGAG…","""((((.((((((...…",-48.799999,"""....(((((((((.…",-36.0
"""29f02a9d8de1""","""GGGAACGACUCGAG…","""(((((((((((...…",-62.299999,""".........(((((…",-50.299999
