In [1]:
import os
import glob
import ast
import pickle
import torch
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord

In [2]:
file = os.path.join("/home/ubuntu/data/dmel_seq", "clear_dmel_only_with_length.fa")
output_folder_path = "/home/ubuntu/data/dmel_seq"
# NOTE: the buffer file need not be created beforehands. Just write the path
#       and the file name here. The file would be created by the system.

In [3]:
data = list(SeqIO.parse(file,"fasta"))

In [4]:
# number of sequences
len(data)

7131

In [5]:
seq = data[0]
seq.description.split('|') # regionid, expression, species id, strand, length

['VT0002', '0', 'dmel', '-', '642']

In [6]:
seq.description

'VT0002|0|dmel|-|642'

### Find duplicats

In [7]:
regionids = np.array([seq.description.split('|')[0] for seq in data])
unique_regions, counts = np.unique(regionids, return_counts=True)

len(unique_regions)

7131

In [8]:
rep_regions_idx = [idx for idx in range(len(unique_regions)) if counts[idx] > 1]

In [9]:
len(rep_regions_idx)

0

In [10]:
[unique_regions[idx] for idx in rep_regions_idx]

[]

In [11]:
[i for i in counts if i > 1]

[]

In [12]:
VT15991 = []
for idx, seq in enumerate(data):
    print(seq.description.split('|'))
    regionid, _, _, _, _ =seq.description.split('|')
    if regionid == 'VT15991':
        VT15991.append(data[idx])

VT15991

['VT0002', '0', 'dmel', '-', '642']
['VT0003', '1', 'dmel', '-', '605']
['VT0004', '0', 'dmel', '-', '994']
['VT0005', '0', 'dmel', '-', '2329']
['VT0006', '1', 'dmel', '-', '2088']
['VT0007', '1', 'dmel', '-', '2207']
['VT0008', '0', 'dmel', '-', '2227']
['VT0009', '0', 'dmel', '-', '631']
['VT0010', '0', 'dmel', '-', '396']
['VT0011', '0', 'dmel', '-', '1048']
['VT0013', '1', 'dmel', '-', '726']
['VT0015', '0', 'dmel', '-', '728']
['VT0016', '0', 'dmel', '-', '595']
['VT0017', '0', 'dmel', '-', '662']
['VT0018', '0', 'dmel', '-', '1415']
['VT0019', '0', 'dmel', '-', '2095']
['VT0020', '0', 'dmel', '-', '2124']
['VT0021', '0', 'dmel', '-', '2074']
['VT0022', '0', 'dmel', '-', '2088']
['VT0025', '1', 'dmel', '-', '2057']
['VT0104', '0', 'dmel', '-', '2230']
['VT0105', '0', 'dmel', '-', '2198']
['VT0106', '0', 'dmel', '-', '2113']
['VT0107', '0', 'dmel', '-', '2251']
['VT0108', '1', 'dmel', '-', '2095']
['VT0109', '1', 'dmel', '-', '2253']
['VT0112', '1', 'dmel', '-', '446']
['VT0129', 

['VT28604', '1', 'dmel', '-', '2167']
['VT28605', '0', 'dmel', '-', '2072']
['VT28606', '1', 'dmel', '-', '1917']
['VT28607', '1', 'dmel', '-', '2251']
['VT2865', '0', 'dmel', '-', '2138']
['VT28670', '1', 'dmel', '-', '2327']
['VT28744', '0', 'dmel', '-', '2083']
['VT28800', '1', 'dmel', '-', '2217']
['VT28860', '0', 'dmel', '-', '954']
['VT28861', '1', 'dmel', '-', '2939']
['VT28863', '0', 'dmel', '-', '2141']
['VT28865', '0', 'dmel', '-', '2146']
['VT28868', '0', 'dmel', '-', '2098']
['VT28869', '0', 'dmel', '-', '2106']
['VT28872', '0', 'dmel', '-', '2208']
['VT28873', '0', 'dmel', '-', '2239']
['VT28913', '1', 'dmel', '-', '2266']
['VT29029', '0', 'dmel', '-', '2768']
['VT29030', '0', 'dmel', '-', '1003']
['VT29043', '0', 'dmel', '-', '2791']
['VT29044', '1', 'dmel', '-', '2081']
['VT29045', '1', 'dmel', '-', '2101']
['VT29046', '1', 'dmel', '-', '2145']
['VT29047', '1', 'dmel', '-', '751']
['VT29051', '1', 'dmel', '-', '2124']
['VT29052', '1', 'dmel', '-', '2099']
['VT29054', '1'

['VT48935', '1', 'dmel', '-', '2187']
['VT48936', '1', 'dmel', '-', '2054']
['VT48937', '1', 'dmel', '-', '2091']
['VT48938', '1', 'dmel', '-', '2086']
['VT48939', '1', 'dmel', '-', '2201']
['VT48940', '0', 'dmel', '-', '852']
['VT48942', '1', 'dmel', '-', '2140']
['VT48943', '1', 'dmel', '-', '2143']
['VT48944', '1', 'dmel', '-', '2150']
['VT48946', '1', 'dmel', '-', '2255']
['VT48947', '1', 'dmel', '-', '2140']
['VT48992', '0', 'dmel', '-', '2139']
['VT4901', '0', 'dmel', '-', '2184']
['VT49025', '0', 'dmel', '-', '2255']
['VT4904', '0', 'dmel', '-', '2210']
['VT4905', '1', 'dmel', '-', '2095']
['VT49105', '0', 'dmel', '-', '2130']
['VT49109', '0', 'dmel', '-', '2227']
['VT49111', '0', 'dmel', '-', '2262']
['VT49112', '0', 'dmel', '-', '2142']
['VT49113', '0', 'dmel', '-', '2077']
['VT49115', '0', 'dmel', '-', '2122']
['VT49117', '0', 'dmel', '-', '2249']
['VT49118', '1', 'dmel', '-', '2075']
['VT49119', '1', 'dmel', '-', '2142']
['VT49120', '0', 'dmel', '-', '2093']
['VT49121', '0',

[SeqRecord(seq=Seq('ACTATATATTTTCCACATTTTTTCCAAACTTTTTTCTTATATTTTTTTTTGCGA...GCC', SingleLetterAlphabet()), id='VT15991|0|dmel|-|2113', name='VT15991|0|dmel|-|2113', description='VT15991|0|dmel|-|2113', dbxrefs=[])]

In [13]:
VT15971 = []
for idx, seq in enumerate(data):
    regionid, _, _, _, _ =seq.description.split('|')
    if regionid == 'VT15971':
        VT15971.append(data[idx])

VT15971

[SeqRecord(seq=Seq('CATTGCTCAGCCTAGCGCGAAAAAGCCAGAAAGTAAGTTCCGATTGCCTAACGA...CGC', SingleLetterAlphabet()), id='VT15971|0|dmel|-|2232', name='VT15971|0|dmel|-|2232', description='VT15971|0|dmel|-|2232', dbxrefs=[])]