In [1]:
from kmer_counting import sequence2matrix
import numpy as np 
import pandas as pd
import os
import matplotlib.pyplot as plt
import math
import copy
from typing import List
import tqdm
from Bio import SeqIO

In [2]:
# Parse the circRNA sequence dump into a list of per-record dicts.
# Each non-blank line is split on a single space into a header and a sequence;
# the header is then split on "_" (name part / id) and the name part on the
# first "-" (species / type).
# NOTE(review): the "<species>-<type>_<id> <sequence>" layout is inferred from
# the splits below — confirm against the dataset documentation.
circ_rna_path = "dataset/human_sequence_v3.0"

# A context manager guarantees the file handle is closed even if reading fails.
with open(circ_rna_path) as handle:
    file = handle.readlines()

data = []
for raw_line in file:
    # Blank lines (e.g. a trailing newline at end of file) carry no record and
    # would otherwise raise IndexError in the header parsing below.
    if not raw_line.strip():
        continue
    fields = raw_line.rstrip("\n").split(" ")
    header_parts = fields[0].split("_")
    # Split only on the first "-" so that type names containing dashes
    # (e.g. "RP11-206L10") stay intact.
    species_and_type = header_parts[0].split("-", 1)
    data.append({
        "species": species_and_type[0],
        "type": species_and_type[1],
        "id": header_parts[1],
        "strand": fields[1]
    })
len(data)

768986

In [3]:
# Build the frame and discard every sequence that occurs more than once:
# keep=False removes all copies of a duplicated strand, not only the extras.
df = pd.DataFrame(data).drop_duplicates(subset="strand", keep=False)
# Remove rows whose sequence is the literal placeholder "unknown".
df = df.loc[df["strand"].ne("unknown")]
df.head()

Unnamed: 0,species,type,id,strand
1,hsa,RP11-206L10,1,TATCTTAAATAGTGAAGATGGAGAAATAGTCAATAATGAAGAGCAT...
2,hsa,RP11-206L10,2,TATCTTAAATAGTGAAGATGGAGAAATAGTCAATAATGAAGAGCAT...
3,hsa,SAMD11,13,GTGCGGCTGTGTCCCCTTCCCCTGCCCAACATGCTGTATGTCTGAG...
4,hsa,SAMD11,1,CCAGGACGGCAACCTTCCCACCCTCATATCCAGCGTCCACCGCAGC...
5,hsa,NOC2L,1,CCGGCGTAAAGGCCGTGCCTCTGAGCACAAAGACCAGCTCTCTCGG...


In [4]:
# Count how many rows each distinct type has; `li` holds the unique type
# names and `cnt` the matching counts, in the same order.
li, cnt = np.unique(df["type"], return_counts=True)
# Pair each type name with its count as a two-element list [name, count].
dicts = [list(name_and_count) for name_and_count in zip(li, cnt)]
len(li)

24318

In [5]:
# Keep the names of the `total_rna_type` most frequent types.
total_rna_type = 8
# Sort ascending by count, then flip the order so the most frequent come first.
ranked_by_count = sorted(dicts, key=lambda pair: pair[1])
ranked_by_count.reverse()
# Column 0 of the resulting 2-D array holds the type names.
rna_types = np.array(ranked_by_count[:total_rna_type])[:, 0]
rna_types

array(['intergenic', 'TTN', 'RYR2', 'MALAT1', 'USP34', 'MACF1', 'SNHG14',
       'BIRC6'], dtype='<U21')

In [6]:
# Restrict the frame to rows whose type is one of the selected `rna_types`.
is_selected_type = df["type"].isin(rna_types)
df = df[is_selected_type]
df.shape

(28946, 4)

In [7]:
# Build a per-class sample: take at most `total_data` rows (the first ones in
# frame order) for each selected type, stacked in the order of `rna_types`.
total_data = 360
per_type_samples = [
    df[df["type"] == rna_type].iloc[:total_data, :] for rna_type in rna_types
]
# Only the class label ("type") and the sequence ("strand") are kept.
test_class_data = pd.concat(per_type_samples).drop(["species", "id"], axis=1)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("test_data", exist_ok=True)
# index=False keeps this file's layout identical to the later write of the
# same path, which also omits the index column.
test_class_data.to_csv("test_data/classes.csv", index=False)
test_class_data.head()

Unnamed: 0,type,strand
40,intergenic,GTGCAGTGTGCAGTGCAGGCTTGAACCTGCAGGGGTCCCCCAGGAG...
195,intergenic,AGACAGGAGGTCTCCCTACGTTATCTTGGCTGACTTCTAACTTCCG...
277,intergenic,AAATATTTCTATTAGGCCACTGCAAAAGTAACTGCAAAAACCACAA...
282,intergenic,CCATGCAGGGATCTGCCTTCCTGTGGAATGATCTCCAATGATCTTC...
521,intergenic,GGTCTCACTGTCGCCCAGGCTGAAGTGCAGTGGCGTGATCACGGCT...


In [8]:
# Load the non-circular RNA sequences from FASTA as plain strings.
non_circ_rna_path = "dataset/noncircRNA.fa"
non_circ_rna = []
for record in SeqIO.parse(non_circ_rna_path, "fasta"):
    non_circ_rna.append(str(record.seq))

# Keep at most `total_data` sequences (the first ones in file order).
selected_non_circ_rna = non_circ_rna[:total_data]

# Note: `df` is rebound here to the non-circular frame; the "type" label
# column is placed first so the column order is (type, strand).
df = pd.DataFrame(selected_non_circ_rna, columns=["strand"])
df.insert(loc=0, column="type", value="non_circular")
df

Unnamed: 0,type,strand
0,non_circular,ACTCCCTTTGTGGAGTTCTCCAGTGCAGAGAAGATTGGCATGACTC...
1,non_circular,GTGCTCACTTTGGCAGCACATCTACTAATGACACAGAGGTTAGCAT...
2,non_circular,GTGCTTCAGCAGCACATACACTAAAATTGGAATGATACAGAGATTA...
3,non_circular,GTGCTCACTTCAGCAGCACACATACTAACACTGGTACAATAGAGAG...
4,non_circular,GTGCTCATTTCAGCAGCATATATACTAAAATTGGAATGATACAGAG...
...,...,...
355,non_circular,GTGCTTGCTCCAGTAGCACATATTCTAAAATTGGAATGATACAGAG...
356,non_circular,GTGCATACTTTGGCAGCACATATACTAAAATTGGAATGATACAGAG...
357,non_circular,GGTCCTAAAGGAACATCTGATAAAATTAGAACGATATAAAGAAGAT...
358,non_circular,GTGCTCGCTTCAGCAGCACATATACTAAAATTGGAACGATACAGAG...


In [9]:
# Stack the per-class circRNA sample on top of the non-circular rows and
# write the combined table, without the index column, to the classes file.
combined_classes = pd.concat([test_class_data, df])
combined_classes.to_csv("test_data/classes.csv", index=False)