Autorius: 4 kurso 3 grupės studentas Lukas Orliukas

Nuorodą į darbą: https://colab.research.google.com/drive/1Kvo6e2QzkD8oGBK3Un5TId7Zo-oPE-OA?usp=sharing

In [75]:
!pip install biopython



In [76]:
import os
import sys

from urllib.request import urlretrieve

import Bio
from Bio import SeqIO, SearchIO, Entrez
from Bio.Seq import Seq
from Bio.SeqUtils import GC
from Bio.Blast import NCBIWWW
from Bio.Data import CodonTable

In [77]:
# Žodynai, kuriuose saugomos poros
original_sequence_start_stop_couples = {}
reversed_sequence_start_stop_couples = {}

# Failų nuskaitymas
def read_fasta(file_paths):
    sequences = {}
    for file_path in file_paths:
        current_sequence = ""
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()
                if line.startswith('>'):
                    if current_sequence:
                        sequences[header] = current_sequence
                    header = line[1:]
                    current_sequence = ""
                else:
                    current_sequence += line
            if header and current_sequence:
                sequences[header] = current_sequence
    return sequences

# Funkcija, kuri iš sekos sudaro jos reverse komplementą
def reverse_complement(dna_sequence):
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return ''.join(complement[base] for base in reversed(dna_sequence))

# Funkcija, kuri suranda start ir stop poras sekoje
def find_start_stop_couples(dna_sequence, iteration, file_key, pairs_dict):
    if file_key not in pairs_dict:
        pairs_dict[file_key] = []

    start_codon = "ATG"
    stop_codons = ["TAA", "TAG", "TGA"]
    sequence_length = len(dna_sequence)

    # Iteracija yra skaitymo rėmelis, kuris naudojamas ėjimui per seką
    i = iteration

    while i < sequence_length - 2:
        codon = dna_sequence[i:i+3]

        if codon == start_codon:
            stop_found = False
            for stop_codon in stop_codons:
                j = i + 3
                while j < sequence_length - 2:
                    potential_stop_codon = dna_sequence[j:j+3]
                    if potential_stop_codon in stop_codons:
                        pairs_dict[file_key].append((i, j + 3))
                        stop_found = True
                        break
                    j += 3
                if stop_found:
                    break

        i += 3

file_paths = ['bacterial1.fasta', 'bacterial2.fasta', 'bacterial3.fasta', 'bacterial4.fasta', 'mamalian1.fasta', 'mamalian2.fasta', 'mamalian3.fasta', 'mamalian4.fasta']

for file_path in file_paths:
    sequences = read_fasta([file_path])

    for header, sequence in sequences.items():
        original_sequence_file_key = f"all_start_stop_couples_{file_path}"

        # Ieškome porų sekoje 3 kartus, su skirtingais skaitymo rėmeliais.
        for iteration in range(3):
            find_start_stop_couples(sequence, iteration, original_sequence_file_key, original_sequence_start_stop_couples)

        reversed_sequence = reverse_complement(sequence)
        reversed_sequence_file_key = f"all_start_stop_couples_reversed_{file_path}"

        # Ieškome porų sekos reverse komplemente 3 kartus, taip pat naudodami skirtingus rėmelius
        for iteration in range(3):
            find_start_stop_couples(reversed_sequence, iteration, reversed_sequence_file_key, reversed_sequence_start_stop_couples)


In [78]:
# Spausdinimai naudoti tikrintis gautus rezultatus
# print(original_sequence_start_stop_couples['all_start_stop_couples_bacterial1.fasta'])
# print(reversed_sequence_start_stop_couples['all_start_stop_couples_reversed_bacterial1.fasta'])

In [79]:
# Funkcija, kuri kiekvienam stop kodonui palieka pora su toliausiai nuo jo esančiu start kodonu.
def filter_pairs(pairs):
    stop_codon_positions = {}
    filtered_pairs = []

    for start, stop in pairs:
        if stop not in stop_codon_positions:
            stop_codon_positions[stop] = (start, stop)
        else:
            existing_start, existing_stop = stop_codon_positions[stop]
            if stop - start > existing_stop - existing_start:
                stop_codon_positions[stop] = (start, stop)

    filtered_pairs = list(stop_codon_positions.values())

    return filtered_pairs

In [80]:
# Surenkamos poros sekose (visų failų)
for file_key, pairs in original_sequence_start_stop_couples.items():
    original_sequence_start_stop_couples[file_key] = filter_pairs(pairs)

# Surenkamos poros sekų reversed komplementuose (visų failų)
for file_key, pairs in reversed_sequence_start_stop_couples.items():
    reversed_sequence_start_stop_couples[file_key] = filter_pairs(pairs)

In [81]:
# Spausdinimai naudoti tikrintis gautus rezultatus
# print(original_sequence_start_stop_couples['all_start_stop_couples_bacterial1.fasta'])
# print(reversed_sequence_start_stop_couples['all_start_stop_couples_reversed_bacterial1.fasta'])

In [82]:
# Funkcija filtuojanti sekas pagal ilgį
def filter_sequences_by_length(filtered_pairs, sequence_length_threshold=300):
    filtered_sequences = []

    for start, stop in filtered_pairs:
        if (stop - start) > sequence_length_threshold:
            filtered_sequences.append((start, stop))

    return filtered_sequences

In [83]:
# Filtruojamas ilgis sekose (visų failų)
for file_key, pairs in original_sequence_start_stop_couples.items():
    original_sequence_start_stop_couples[file_key] = filter_sequences_by_length(pairs)

# Filtruojamas ilgis sekų reversed komplementuose (visų failų)
for file_key, pairs in reversed_sequence_start_stop_couples.items():
    reversed_sequence_start_stop_couples[file_key] = filter_sequences_by_length(pairs)

In [84]:
# Spausdinimai naudoti tikrintis gautus rezultatus
# print(original_sequence_start_stop_couples['all_start_stop_couples_bacterial1.fasta'])
# print(reversed_sequence_start_stop_couples['all_start_stop_couples_reversed_bacterial1.fasta'])

In [85]:
import os

file_paths = ['bacterial1.fasta', 'bacterial2.fasta', 'bacterial3.fasta', 'bacterial4.fasta', 'mamalian1.fasta', 'mamalian2.fasta', 'mamalian3.fasta', 'mamalian4.fasta']

sequences = {}
reversed_sequences = {}

# Nuskaitomos sekos iš pradinių failų, jos išsaugomos ir taip pat jų reversed komplementas naudojantis biopython biblioteka
for file_path in file_paths:
    base_name = os.path.basename(file_path)
    with open(file_path, 'r') as file:
        for record in SeqIO.parse(file, 'fasta'):
            reversed_sequence = str(record.seq.reverse_complement())
            reversed_sequences[base_name] = reversed_sequence
            sequence = str(record.seq)
            sequences[base_name] = sequence

In [86]:
# Funkcija išrenkanti posekius pagal start stop kodonų indeksus atlikus pirmus 3 užduoties punktus.
def extract_subsequences(start_stop_pairs, sequence):
    subsequences = []
    for start, stop in start_stop_pairs:
        subsequence = sequence[start:stop]
        subsequences.append(subsequence)
    return subsequences

all_subsequences = {}

for file_path in file_paths:
    original_sequence_file_key = f"all_start_stop_couples_{file_path}"
    reversed_sequence_file_key = f"all_start_stop_couples_reversed_{file_path}"
    start_stop_pairs = original_sequence_start_stop_couples.get(original_sequence_file_key, [])
    start_stop_pairs_reversed = reversed_sequence_start_stop_couples.get(reversed_sequence_file_key, [])
    sequence = sequences[file_path]
    reversed_sequence = reversed_sequences[file_path]

    original_sequence_subsequences = extract_subsequences(start_stop_pairs, sequence)
    reversed_sequence_subsequences = extract_subsequences(start_stop_pairs_reversed, reversed_sequence)

    # Saugome tos pačios sekos ir jos reversed komplemento posekius juos sudedami ten pat
    all_subsequences[file_path] = original_sequence_subsequences  + reversed_sequence_subsequences

# Spausdinimas posekių tikrinimui
for file_path, subsequences in all_subsequences.items():
    print(f"File: {file_path}")
    for i, subsequence in enumerate(subsequences):
        print(f"Subsequence {i + 1}: {subsequence}")


File: bacterial1.fasta
Subsequence 1: ATGCAAACACAAAACGGTGGCAGACCCACAATTTTACCTAAGATGTATGAAGAACCGCTATTTAGTCAAATCATTGATAAAATTGAATCAGGTTGCAATGACAGAGAAATCTACACCAGTTTACATTGTTCTGCTAAAACTTTTAGAAAGTGGCGAGATGACAATATAAAGGCGTATGACGAAGCTAAAAGCATTGCTAGGGGAAATCTATTAGAACTAGCTGAAAGTGCCTTAGCGAGTAAACTGACAGTCAGAACGCTAAAGGAAACAGAAACAATCTATGACGCTAACGGAAACGTTGAAAAAGTAAAAGTTAAAGAAAAAGAGCTGGATAAAGATAGCTTGGTGGCGATGATGGTTGCTAAGGCTGGAAACCCTGAACTTTATAACCCTACTGAATGGCGTAGATTGCAACAAGAAGAATCAAGCGCTAATGACCTTAAAGCTAAGATTGAAGAACTTGATGACTATAAACTAAGTAAGTACGAAACACCAAAAATTGAAGTGCCGAAAGGGTTTGAATAA
Subsequence 2: ATGTATTATTTAAATAAAATGTTGGAATACAACAAAGAAAACGGCATTATTATTAACAAATACATTCGCAAGACTATTCAGAAGCAAATACGTATTCATAATAAGTATATTTATCGCTATGACCGTGTTACACAAGCTATTGAATGGATTGAAGATAATTTCTATTTAACAACTGGTAACCTAATGAAAATCAAGCTACACCCTGTTCAAAAATGGTGGTACGAGTTAATGCTTGGCTATGATATGGTTGATGAAAAAGGTATTCAGGTAAACTTAGTTAATGAAATTTTTCTTAATCTAGGACGTGGTTCTGGTAAGTCAAGTTTAATGGCCACGCGCGTGCTTAACTGGATGATTTTAGGCGGACAATATGGTGGAGAGAGTTTAGTTATTGCATACGATAATACACAGGCTAGACACG

In [87]:
proteins = {}

# Posekius vertimas baltymų sekomis
for file_path, subsequences in all_subsequences.items():
    protein_sequences = []
    for subsequence in subsequences:
        protein_seq = Seq(subsequence).translate(to_stop = True) # Naudojamas biopython metodas
        protein_sequences.append(str(protein_seq))

    proteins[file_path] = protein_sequences

# Baltymų sekų spausdinimas tikrinimui
for file_path, protein_sequences in proteins.items():
    print(f"File: {file_path}")
    for i, protein_sequence in enumerate(protein_sequences):
        print(f"Protein Sequence {i + 1}: {protein_sequence}")

File: bacterial1.fasta
Protein Sequence 1: MQTQNGGRPTILPKMYEEPLFSQIIDKIESGCNDREIYTSLHCSAKTFRKWRDDNIKAYDEAKSIARGNLLELAESALASKLTVRTLKETETIYDANGNVEKVKVKEKELDKDSLVAMMVAKAGNPELYNPTEWRRLQQEESSANDLKAKIEELDDYKLSKYETPKIEVPKGFE
Protein Sequence 2: MYYLNKMLEYNKENGIIINKYIRKTIQKQIRIHNKYIYRYDRVTQAIEWIEDNFYLTTGNLMKIKLHPVQKWWYELMLGYDMVDEKGIQVNLVNEIFLNLGRGSGKSSLMATRVLNWMILGGQYGGESLVIAYDNTQARHVFDQVRNQTEASDTLRVYNENKIFKSTKQGLEFTAFKTTFKKQTNDTLRAQGGNSSLNIFDEVHTYGEDITESVNKGSRQKQDNWQSIYITSGGLKRDGLYDKLVERFKSEEEFYNDRSFGLLYMLENHEQVKDKKNWTMALPLIGNVPKWSGVVEEYELAQGDPALQNKFLAFNMGLPMQDTAYYFTPQDTKLTDFNLSVFNKNRTYVGIDLSLIGDLTAVSFVCELEGKTYSHTLTFSVRSQYEQLDTEQQELWTEFVDRGELILLDTEYINVNDLIPYINDFRTKTGCRLRKIGYDPARYEILKGLIERYFFDKDGDNQRAIRQGFSMNDYIKLLKSKLVENKLIHNQKVMQWALNNTAVKIGQSGDYMYTKKLEKDKIDPTVALTMALEMAVSDEV
Protein Sequence 3: MNKPDLIEKQNRLAELKENNVSLKSQINGFEVKNAIEDLPKVQELEKTLSENSIEIIKIENELNAQEEKQKGKAKMTNFIESQNAVTEFFDVLKKNSAKSEIEDAWNAKLAENGVTITDTTFQLPRKLVESINTALLNTNPVFKVFHVTNVGALLVSRSFDSANEAQVHKDGQTKTEQAATLTIDTLEPVMVYKLQSLAER

In [88]:
from collections import Counter
# Funkcija skaičiuojanti kodonų dažnį (ne tiesiog kodonų skaičius, bet procentinė dalis)
def calculate_codon_frequency(protein_sequences):

    codon_frequency = Counter()
    total_sequence_length = 0

    for protein_sequence in protein_sequences:
        codons = [amino_acid for amino_acid in protein_sequence]
        codon_frequency.update(codons)
        total_sequence_length += len(codons)

    # Paverčiama į procentus
    for codon, count in codon_frequency.items():
        frequency = (count / total_sequence_length) * 100
        codon_frequency[codon] = round(frequency, 2)

    # Rikiavimas, kad visų failų kodonų sąrašas būtų vienodos tvarkos
    sorted_codon_frequency = {k: v for k, v in sorted(codon_frequency.items())}

    return sorted_codon_frequency

codon_frequencies_dict = {}
# Skaičiavimas, spausdinimas atsakymų tikrinimui
for file_path, protein_sequences in proteins.items():
    codon_frequency = calculate_codon_frequency(protein_sequences)
    codon_frequencies_dict[file_path] = codon_frequency
    print(f"File: {file_path}")
    for codon, frequency in codon_frequency.items():
        print(f"Codon: {codon}, Frequency: {frequency:.2f}%")

File: bacterial1.fasta
Codon: A, Frequency: 6.31%
Codon: C, Frequency: 0.61%
Codon: D, Frequency: 5.58%
Codon: E, Frequency: 7.00%
Codon: F, Frequency: 4.57%
Codon: G, Frequency: 6.42%
Codon: H, Frequency: 1.08%
Codon: I, Frequency: 6.90%
Codon: K, Frequency: 8.68%
Codon: L, Frequency: 8.51%
Codon: M, Frequency: 2.41%
Codon: N, Frequency: 6.66%
Codon: P, Frequency: 2.50%
Codon: Q, Frequency: 3.66%
Codon: R, Frequency: 3.22%
Codon: S, Frequency: 7.24%
Codon: T, Frequency: 6.87%
Codon: V, Frequency: 6.32%
Codon: W, Frequency: 1.54%
Codon: Y, Frequency: 3.91%
File: bacterial2.fasta
Codon: A, Frequency: 6.04%
Codon: C, Frequency: 1.31%
Codon: D, Frequency: 6.15%
Codon: E, Frequency: 6.35%
Codon: F, Frequency: 4.85%
Codon: G, Frequency: 6.30%
Codon: H, Frequency: 2.39%
Codon: I, Frequency: 6.96%
Codon: K, Frequency: 6.16%
Codon: L, Frequency: 8.67%
Codon: M, Frequency: 3.07%
Codon: N, Frequency: 4.90%
Codon: P, Frequency: 4.12%
Codon: Q, Frequency: 3.75%
Codon: R, Frequency: 5.65%
Codon: S,

In [89]:
from collections import Counter
import itertools
# Funkcija skaičiuojanti dikodonų dažnius (taip pat procentine dalimi)
def calculate_dicodon_frequency(protein_sequences):
    # Visų įmanomų dikodonų sąrašas
    dicodons = [''.join(d) for d in itertools.product('ACDEFGHIKLMNPQRSTVWY', repeat=2)]

    dicodon_counts = {dicodon: 0 for dicodon in dicodons}

    for protein_sequence in protein_sequences:
        # Skaidome dikodonais
        dicodons_in_sequence = [protein_sequence[i:i + 2] for i in range(len(protein_sequence) - 1)]
        dicodon_counts = {dicodon: count + dicodons_in_sequence.count(dicodon) for dicodon, count in dicodon_counts.items()}

    total_dicodon_count = sum(dicodon_counts.values())

    # Suskaičiuojami dažniai procentinėmis dalimis
    dicodon_frequencies = {dicodon: (count / total_dicodon_count) * 100 for dicodon, count in dicodon_counts.items()}

    return dicodon_frequencies

dicodon_frequencies_dict = {}
# Skaičiavimas, spausdinimas tikrinimui
for file_path, protein_sequences in proteins.items():
    dicodon_frequencies = calculate_dicodon_frequency(protein_sequences)
    dicodon_frequencies_dict[file_path] = dicodon_frequencies
    print(f"File: {file_path}")
    for dicodon, frequency in dicodon_frequencies.items():
        print(f"{dicodon}: {frequency:.5f}%")

File: bacterial1.fasta
AA: 0.10007%
AC: 0.02859%
AD: 0.27162%
AE: 0.40029%
AF: 0.34310%
AG: 0.50036%
AH: 0.10007%
AI: 0.42888%
AK: 0.55754%
AL: 0.60043%
AM: 0.24303%
AN: 0.54325%
AP: 0.11437%
AQ: 0.21444%
AR: 0.22873%
AS: 0.35740%
AT: 0.41458%
AV: 0.50036%
AW: 0.15726%
AY: 0.21444%
CA: 0.02859%
CC: 0.00000%
CD: 0.01430%
CE: 0.01430%
CF: 0.04289%
CG: 0.07148%
CH: 0.00000%
CI: 0.07148%
CK: 0.02859%
CL: 0.02859%
CM: 0.01430%
CN: 0.05718%
CP: 0.02859%
CQ: 0.04289%
CR: 0.02859%
CS: 0.05718%
CT: 0.02859%
CV: 0.04289%
CW: 0.00000%
CY: 0.01430%
DA: 0.22873%
DC: 0.04289%
DD: 0.28592%
DE: 0.35740%
DF: 0.34310%
DG: 0.28592%
DH: 0.01430%
DI: 0.48606%
DK: 0.50036%
DL: 0.52895%
DM: 0.11437%
DN: 0.37169%
DP: 0.20014%
DQ: 0.07148%
DR: 0.21444%
DS: 0.31451%
DT: 0.52895%
DV: 0.38599%
DW: 0.05718%
DY: 0.25733%
EA: 0.34310%
EC: 0.01430%
ED: 0.35740%
EE: 0.48606%
EF: 0.32881%
EG: 0.18585%
EH: 0.11437%
EI: 0.57184%
EK: 0.68620%
EL: 0.85776%
EM: 0.14296%
EN: 0.44317%
EP: 0.17155%
EQ: 0.38599%
ER: 0.24303%
ES

In [90]:
import numpy as np
from scipy.spatial import distance
# Funkcija skaičiuojanti atstumų matrica kodonams (naudojamas Euklidinis atstumas)
def calculate_distance_between_files(codon_frequencies):
    file_keys = list(codon_frequencies.keys())
    num_files = len(file_keys)
    distance_matrix = np.zeros((num_files, num_files))

    for i in range(num_files):
        for j in range(i+1, num_files):
            file1 = file_keys[i]
            file2 = file_keys[j]
            freq1 = list(codon_frequencies[file1].values())
            freq2 = list(codon_frequencies[file2].values())
            dist = distance.euclidean(freq1, freq2)
            distance_matrix[i, j] = dist
            distance_matrix[j, i] = dist

    return distance_matrix, file_keys

distance_matrix, file_keys = calculate_distance_between_files(codon_frequencies_dict)

# Matricos spausdinimas
print("Distance Matrix:")
print(distance_matrix)

# Raktai
print("File Keys:")
print(file_keys)


Distance Matrix:
[[ 0.          4.92090439  2.93690313  2.8456458   5.84854683  8.66038683
   7.17469163 11.43079612]
 [ 4.92090439  0.          4.25246987  4.54541527  4.97242396  5.33597226
   6.90821974  8.48482174]
 [ 2.93690313  4.25246987  0.          2.5308299   5.21728857  7.77924804
   7.97043286 10.10170778]
 [ 2.8456458   4.54541527  2.5308299   0.          5.45497938  8.11541743
   7.40226317 10.76296428]
 [ 5.84854683  4.97242396  5.21728857  5.45497938  0.          5.87630836
   7.96313381  8.11650171]
 [ 8.66038683  5.33597226  7.77924804  8.11541743  5.87630836  0.
  10.40577244  4.24304136]
 [ 7.17469163  6.90821974  7.97043286  7.40226317  7.96313381 10.40577244
   0.         13.23398277]
 [11.43079612  8.48482174 10.10170778 10.76296428  8.11650171  4.24304136
  13.23398277  0.        ]]
File Keys:
['bacterial1.fasta', 'bacterial2.fasta', 'bacterial3.fasta', 'bacterial4.fasta', 'mamalian1.fasta', 'mamalian2.fasta', 'mamalian3.fasta', 'mamalian4.fasta']


# Atstumo matrica kodonams

8

Lactococcus_phage    0.          4.92090439  2.93690313  2.8456458   5.84854683  8.66038683 7.17469163 11.43079612

Escherichia_phage    4.92090439  0.          4.25246987  4.54541527  4.97242396  5.33597226
   6.90821974  8.48482174

Streptococcus_phage 2.93690313  4.25246987  0.          2.5308299   5.21728857  7.77924804
   7.97043286 10.10170778

Cellulophaga_phage 2.8456458   4.54541527  2.5308299   0.          5.45497938  8.11541743
   7.40226317 10.76296428

Coronavirus 5.84854683  4.97242396  5.21728857  5.45497938  0.          5.87630836
   7.96313381  8.11650171

Adenovirus 8.66038683  5.33597226  7.77924804  8.11541743  5.87630836  0.
  10.40577244  4.24304136

Variola_virus 7.17469163  6.90821974  7.97043286  7.40226317  7.96313381 10.40577244 0.         13.23398277

Herpesvirus 11.43079612  8.48482174 10.10170778 10.76296428  8.11650171  4.24304136
  13.23398277  0.        

In [91]:
import numpy as np
from scipy.spatial import distance
# Funkcija skaičiuojanti atstumų matrica dikodonams (naudojamas Euklidinis atstumas)
def calculate_distance_between_files(dicodon_frequencies_dict):
    file_keys = list(dicodon_frequencies_dict.keys())
    num_files = len(file_keys)
    distance_matrix = np.zeros((num_files, num_files))

    for i in range(num_files):
        for j in range(i+1, num_files):
            file1 = file_keys[i]
            file2 = file_keys[j]
            freq1 = list(dicodon_frequencies_dict[file1].values())
            freq2 = list(dicodon_frequencies_dict[file2].values())
            dist = distance.euclidean(freq1, freq2)
            distance_matrix[i, j] = dist
            distance_matrix[j, i] = dist

    return distance_matrix, file_keys

In [92]:
distance_matrix, file_keys = calculate_distance_between_files(dicodon_frequencies_dict)

# Matricos spausdinimas
print("Distance Matrix:")
print(distance_matrix)

# Raktai
print("File Keys:")
print(file_keys)

Distance Matrix:
[[0.         2.41193114 1.9371851  2.11447114 2.68118024 3.4512835
  3.19798912 4.5939529 ]
 [2.41193114 0.         2.14133768 2.30874157 2.26452154 2.37519599
  2.99064931 3.62671309]
 [1.9371851  2.14133768 0.         1.93519722 2.39024947 3.17583342
  3.37283713 4.18184482]
 [2.11447114 2.30874157 1.93519722 0.         2.5439802  3.22712992
  3.29359876 4.3577025 ]
 [2.68118024 2.26452154 2.39024947 2.5439802  0.         2.44318591
  3.2290708  3.58044054]
 [3.4512835  2.37519599 3.17583342 3.22712992 2.44318591 0.
  3.99337875 2.31188948]
 [3.19798912 2.99064931 3.37283713 3.29359876 3.2290708  3.99337875
  0.         5.06741827]
 [4.5939529  3.62671309 4.18184482 4.3577025  3.58044054 2.31188948
  5.06741827 0.        ]]
File Keys:
['bacterial1.fasta', 'bacterial2.fasta', 'bacterial3.fasta', 'bacterial4.fasta', 'mamalian1.fasta', 'mamalian2.fasta', 'mamalian3.fasta', 'mamalian4.fasta']


# Atstumo matrica dikodonams

8

Lactococcus_phage 0.         2.41193114 1.9371851  2.11447114 2.68118024 3.4512835
  3.19798912 4.5939529

Escherichia_phage 2.41193114 0.         2.14133768 2.30874157 2.26452154 2.37519599
  2.99064931 3.62671309   

Streptococcus_phage 1.9371851  2.14133768 0.         1.93519722 2.39024947 3.17583342
  3.37283713 4.18184482

Cellulophaga_phage 2.11447114 2.30874157 1.93519722 0.         2.5439802  3.22712992
  3.29359876 4.3577025

Coronavirus 2.68118024 2.26452154 2.39024947 2.5439802  0.         2.44318591
  3.2290708  3.58044054

Adenovirus 3.4512835  2.37519599 3.17583342 3.22712992 2.44318591 0.
  3.99337875 2.31188948

Variola_virus 3.19798912 2.99064931 3.37283713 3.29359876 3.2290708  3.99337875 0.         5.06741827

Herpesvirus 4.5939529  3.62671309 4.18184482 4.3577025  3.58044054 2.31188948
  5.06741827 0.            

In [93]:
import numpy as np
from scipy.spatial import distance

def find_biggest_codon_differences(codon_frequencies_dict, file_keys):
    bacterial_files = [file_key for file_key in file_keys if 'bacterial' in file_key]
    mammalian_files = [file_key for file_key in file_keys if 'mammalian' in file_key]

    bacterial_distances = []
    mammalian_distances = []

    for i, file1 in enumerate(bacterial_files):
        for j, file2 in enumerate(mammalian_files):
            freq1 = list(codon_frequencies_dict[file1].values())
            freq2 = list(codon_frequencies_dict[file2].values())
            dist = distance.euclidean(freq1, freq2)
            bacterial_distances.append((file1, file2, dist))

    bacterial_distances.sort(key=lambda x: x[2], reverse=True)
    biggest_differences = bacterial_distances[:5]  # Adjust the number of differences to retrieve

    return biggest_differences


# Find the biggest codon differences between bacterial and mammalian files
biggest_differences = find_biggest_codon_differences(codon_frequencies_dict, file_keys)

# Print the biggest differences
print("Biggest Codon Differences:")
for file1, file2, dist in biggest_differences:
    print(f"{file1} vs. {file2}: {dist:.2f}")

Biggest Codon Differences:
