In [66]:
from Bio import SeqIO
from Bio.Seq import Seq
import os
import statistics as stat

In [67]:
# Input locations: one directory of BED files and one of FASTA files.
bed_dir = "/home/transposons/Genomes/TAIR10/BedDir/GsBed/"
fasta_dir = "/home/transposons/Genomes/TAIR10/Fasta/"

# Stop immediately if either directory is missing.
assert all(os.path.exists(directory) for directory in (bed_dir, fasta_dir))

In [68]:
# Full path of every entry in each input directory.
# os.path.join is used instead of manual concatenation, which produced a
# doubled "/" whenever the directory string already ended with a separator.
bed_list = [os.path.join(bed_dir, name) for name in os.listdir(bed_dir)]
fasta_list = [os.path.join(fasta_dir, name) for name in os.listdir(fasta_dir)]

# The two lists are later walked in parallel, so they must have the same size.
assert len(bed_list) == len(fasta_list), (
    "{} BED files but {} FASTA files".format(len(bed_list), len(fasta_list))
)

In [69]:
def sort_key(file_name):
    """Build a sort key that orders files by genome name, then chromosome.

    The text before the last underscore is used as the genome name. The text
    between that underscore and the next dot is the chromosome label, from
    which every ``chr`` (matched case-insensitively) is removed.

    Parameters
    ----------
    file_name : str
        File name or path, e.g. ``TAIR10_chr1.bed``.

    Returns
    -------
    tuple
        ``(genome_name, chromosome_number)`` when the remaining label parses
        as an integer. Otherwise ``(genome_name, float('inf'), label)`` with
        the lower-cased label, so that non-numeric chromosomes (e.g. ``chrM``)
        sort after the numeric ones of the same genome instead of raising
        ``ValueError``.
    """
    # rpartition gives ('', '', file_name) when there is no underscore, which
    # matches splitting on '_' and joining everything but the last piece.
    genome_name, _, tail = file_name.rpartition('_')
    chr_label = tail.split('.')[0].lower().replace('chr', '')
    try:
        return (genome_name, int(chr_label))
    except ValueError:
        # inf is larger than any chromosome number, so these keys go last;
        # the label itself breaks ties between several non-numeric names.
        return (genome_name, float('inf'), chr_label)

In [70]:
# Put both lists into the same (genome, chromosome) order, in place, so that
# entries at the same index belong together.
bed_list[:] = sorted(bed_list, key=sort_key)
fasta_list[:] = sorted(fasta_list, key=sort_key)

In [71]:
# Print each BED file next to the FASTA file at the same index as a visual
# check of the pairing.
for i, bed_path in enumerate(bed_list):
    print(os.path.basename(bed_path), os.path.basename(fasta_list[i]))

TAIR10_chr1.bed TAIR10_chr1.fa
TAIR10_chr2.bed TAIR10_chr2.fa
TAIR10_chr3.bed TAIR10_chr3.fa
TAIR10_chr4.bed TAIR10_chr4.fa
TAIR10_chr5.bed TAIR10_chr5.fa


In [92]:
def get_alignment(seq1, seq2, match, mismatch, gap_open, gap_continue):
    """Locally align two sequences with affine gap penalties.

    Three dynamic-programming tables are filled: the best local score ending
    at each cell (``score_matrix``), the best score ending in a gap that
    consumes a character of ``seq1`` only (``del_gap_matrix``) and the best
    score ending in a gap that consumes a character of ``seq2`` only
    (``ins_gap_matrix``). The alignment is then traced back from the first
    cell (row-major order) holding the highest score.

    Parameters
    ----------
    seq1, seq2 : str
        Sequences to align; compared character by character.
    match : number
        Score added for a pair of equal characters.
    mismatch : number
        Score added for a pair of different characters.
    gap_open : number
        Score added for the first column of a gap (expected to be <= 0).
    gap_continue : number
        Score added for every further column of the same gap (expected <= 0).

    Returns
    -------
    tuple
        ``(loc_align, sim_counter)``: the number of columns in the traced
        alignment (matches, mismatches and gap positions) and how many of
        those columns are exact matches.
    """
    rows = len(seq1) + 1
    cols = len(seq2) + 1
    neg_inf = float("-inf")

    score_matrix = [[0] * cols for _ in range(rows)]
    # No gap can end on the first row or column, so the gap tables start at
    # -inf there; every other cell is overwritten in the loop below.
    del_gap_matrix = [[neg_inf] * cols for _ in range(rows)]
    ins_gap_matrix = [[neg_inf] * cols for _ in range(rows)]

    max_score = 0
    max_i = 0
    max_j = 0

    for i in range(1, rows):
        for j in range(1, cols):
            pair_score = match if seq1[i - 1] == seq2[j - 1] else mismatch
            diagonal = score_matrix[i - 1][j - 1] + pair_score

            del_gap_matrix[i][j] = max(score_matrix[i - 1][j] + gap_open,
                                       del_gap_matrix[i - 1][j] + gap_continue)
            ins_gap_matrix[i][j] = max(score_matrix[i][j - 1] + gap_open,
                                       ins_gap_matrix[i][j - 1] + gap_continue)
            score_matrix[i][j] = max(diagonal, del_gap_matrix[i][j],
                                     ins_gap_matrix[i][j], 0)

            # Strict comparison keeps the first cell that reaches the maximum.
            if score_matrix[i][j] > max_score:
                max_score = score_matrix[i][j]
                max_i = i
                max_j = j

    i = max_i
    j = max_j
    loc_align = 0
    sim_counter = 0

    # With affine gaps the traceback has to remember which table it is in:
    # while inside a gap the next step is decided by the gap table, not by
    # score_matrix. Re-reading score_matrix after every gap step can leave the
    # optimal path whenever a gap was extended rather than freshly opened.
    state = "M"  # "M": score_matrix, "D": del_gap_matrix, "I": ins_gap_matrix
    while i > 0 and j > 0:
        if state == "M":
            if score_matrix[i][j] <= 0:
                break  # start of the local alignment
            is_match = seq1[i - 1] == seq2[j - 1]
            pair_score = match if is_match else mismatch
            if score_matrix[i][j] == score_matrix[i - 1][j - 1] + pair_score:
                if is_match:
                    sim_counter += 1
                i -= 1
                j -= 1
                loc_align += 1
            elif score_matrix[i][j] == del_gap_matrix[i][j]:
                state = "D"  # switch table; the column is counted in state D
            else:
                state = "I"
        elif state == "D":
            # The gap was opened from score_matrix[i - 1][j]; otherwise it
            # extends the gap that ends at del_gap_matrix[i - 1][j].
            if del_gap_matrix[i][j] == score_matrix[i - 1][j] + gap_open:
                state = "M"
            i -= 1
            loc_align += 1
        else:
            if ins_gap_matrix[i][j] == score_matrix[i][j - 1] + gap_open:
                state = "M"
            j -= 1
            loc_align += 1

    return loc_align, sim_counter


In [93]:
# Quick check on a short, hand-checkable pair of sequences.
get_alignment(
    "CCCCAATAAGGG",
    "AAAAAAAAAA",
    match=2,
    mismatch=-3,
    gap_open=-5,
    gap_continue=-2,
)


(5, 4)

In [74]:
def read_bed(path):
    """Read four integer coordinates from every data row of a BED-like file.

    The first line of the file is always skipped. From each remaining
    non-blank line, the whitespace-separated columns at positions 3 to 6
    (0-based) are converted to ``int``.

    Parameters
    ----------
    path : str
        Path of the file to read.

    Returns
    -------
    list of list of int
        One list per data row. The caller uses the values as two
        ``[start, end)`` slices (presumably the left and right element
        coordinates - confirm against the BED producer).

    Raises
    ------
    ValueError
        If one of the selected columns is not an integer.
    """
    records = []
    with open(path, 'r') as file:
        # Skip the first line; the default keeps an empty file from raising.
        next(file, None)
        for line in file:
            data = line.split()
            if not data:
                # Blank lines (e.g. a trailing newline) would otherwise add an
                # empty record that breaks indexing later on.
                continue
            records.append([int(x) for x in data[3:7]])

    return records

In [75]:
# Minimum alignment length accepted by the scan below; twice this value is
# also the shortest sequence that find_mite_length will examine.
# (TIR presumably stands for terminal inverted repeat - confirm.)
min_tir_size = 15
# Longest sequence that find_mite_length will examine.
max_mite_size = 800
# Number of characters taken from each end of a sequence for the end-to-end
# alignment when the sequence is longer than twice this value.
tir_search_range = 30

In [76]:
def find_mite_length(seq):
    """Align the start of ``seq`` against the reverse complement of its end.

    Sequences shorter than ``2 * min_tir_size`` or longer than
    ``max_mite_size`` are rejected. For longer sequences the first and last
    ``tir_search_range`` characters are compared; otherwise the sequence is
    split in the middle and the two halves are compared.

    Parameters
    ----------
    seq : str
        Sequence to examine.

    Returns
    -------
    bool or tuple
        ``False`` when the length is outside the accepted range, otherwise
        the ``(loc_align, sim_counter)`` tuple returned by ``get_alignment``
        with match 2, mismatch -3, gap open -5 and gap continue -2.
    """
    seq_len = len(seq)
    if not (min_tir_size * 2 <= seq_len <= max_mite_size):
        return False

    if seq_len > tir_search_range * 2:
        head = seq[:tir_search_range]
        tail = seq[seq_len - tir_search_range:]
    else:
        midpoint = int(seq_len / 2)
        head, tail = seq[:midpoint], seq[midpoint:]

    # Reverse-complement the tail so that an inverted repeat reads as a
    # direct match against the head.
    tail_rc = str(Seq(tail).reverse_complement())
    return get_alignment(head, tail_rc, 2, -3, -5, -2)

In [101]:
# Minimum fraction of alignment columns that must be exact matches.
min_identity = 0.85

mite_counter = 0   # records whose two sequences both pass the filters
length_list = []   # alignment lengths of the accepted sequences (two per record)
seq_list = []      # (left, right) sequences of the accepted records

for fasta_path, bed_path in zip(fasta_list, bed_list):
    print(os.path.basename(bed_path))
    seq = str(SeqIO.read(fasta_path, 'fasta').seq)

    for ele in read_bed(bed_path):
        left = seq[ele[0]:ele[1]]
        right = seq[ele[2]:ele[3]]
        left_align = find_mite_length(left)
        right_align = find_mite_length(right)

        # find_mite_length returns False for sequences outside the accepted
        # length range; both results must be checked before indexing them,
        # otherwise a rejected right sequence raises TypeError.
        if not left_align or not right_align:
            continue
        if left_align[0] < min_tir_size or right_align[0] < min_tir_size:
            continue

        # The length checks above guarantee non-zero denominators.
        left_identity = left_align[1] / left_align[0]
        right_identity = right_align[1] / right_align[0]
        if left_identity >= min_identity and right_identity >= min_identity:
            mite_counter += 1
            length_list.append(left_align[0])
            length_list.append(right_align[0])
            seq_list.append((left, right))

TAIR10_chr1.bed
TAIR10_chr2.bed
TAIR10_chr3.bed
TAIR10_chr4.bed
TAIR10_chr5.bed


In [102]:
# Number of BED records for which both extracted sequences passed the length
# and identity filters in the scan above.
print(mite_counter)

0


In [97]:
# Summary statistics of the accepted alignment lengths.
# statistics.mean/median raise StatisticsError on an empty list and
# statistics.stdev needs at least two values, so guard both cases instead of
# letting the cell fail when nothing passed the filters.
if length_list:
    print("Mean:", stat.mean(length_list))
    print("Median", stat.median(length_list))
    if len(length_list) > 1:
        print("STD:", stat.stdev(length_list))
    else:
        print("STD:", "n/a (needs at least two values)")
    print("Min:", min(length_list))
    print("Max:", max(length_list))
else:
    print("length_list is empty - no alignments passed the filters")

Mean: 16
Median 15.0
STD: 2.1908902300206643
Min: 15
Max: 23


In [98]:
# Total number of records over all BED files, for comparison with the number
# of accepted records.
total_count = sum(len(read_bed(bed_path)) for bed_path in bed_list)
print(total_count)

177


In [87]:
# Show the collected alignment lengths (two entries per accepted record).
length_list

[15, 15, 15, 15, 15, 15, 19, 23, 15, 15, 17, 17, 15, 15, 15, 15]

In [89]:
# Show the (left, right) sequence pair of the first accepted record.
seq_list[0]

('GTTGAAAGTTAAACTTGATTTTGAATCAAGTTTAATTATTGGATCAATTATCCAATAATTAATTATGGCCAAATCCAAGTTCTAGAGTTTTCTCTAGAAATATCATCATTTCCACCTCCTTAAAAGATTCTAGAAATTTTCTAGAATCATCTTCCACCTCCTTAAACATAAAAATCTAGATACTCTAATAGAATAATCTAGATAATTTGAATAATGTAATCTAGATCTTATGTAAGAACTCTCTAGACTTAGGATTAAAATATTTTAGATATTTTGTAGTTTGGAGGCTATAAATACCTCCTCCCCCTCTCAAATGTTGCAATGTTGTGAAGTTGTATTCAAGTTTAAAGCAAAGTAATAAAAGTTCTATTTCCTAAAAAACTCTCTCAAAACACTTAAACACTTTCTCCATTACCTCTAAAAGAATTTTACTCTAACA',
 'GTTGAAAGTTAAACTTGATTTTGAATCAAGTTTAATTATTGGATCAATTATCCAATAATTAATTATGGCCAAATCCAAGTTCTAGAGTTTTCTCTAGAAATATCATCATTTCCACCTCCTTAAAAGATTCTAGAAATTTTCTAGAATCATCTTCCACCTCCTTAAACATAAAAATCTAGATACTCTAATAGAATAATCTAGATAATTTGAATAATGTAATCTAGATCTTATGTAAGAACTCTCTAGACTTAGGATTAAAATATTTTAGATATTTTGTAGTTTGGAGGCTATAAATACCTCCTCCCCCTCTCAAATGTTGCAATGTTGTGAAGTTGTATTCAAGTTTAAAGCAAAGTAATAAAAGTTCTATTTCCTAAAAAACTCTCTCAAAACACTTAAACACTTTCTCCATTACCTCTAAAAGAATTTTACTCTAACA')