In [3]:
from Bio import motifs
from Bio import SeqIO
fasta_file = "/home/davide/Downloads/motivi_per_profilo.fa"
# Read every record of the FASTA file and keep its sequence as a plain string.
istanze = [
    str(fasta_record.seq)
    for fasta_record in SeqIO.parse(fasta_file, "fasta")
]
# Build the motif object from the collected sequences.
motivo = motifs.create(istanze)

{'A': (0.9454545454545454,
  0.18181818181818182,
  0.01818181818181818,
  0.4727272727272727,
  0.07272727272727272,
  0.41818181818181815,
  0.2545454545454545,
  0.18181818181818182,
  0.34545454545454546,
  0.36363636363636365,
  0.43636363636363634,
  0.43636363636363634,
  0.03636363636363636,
  0.05454545454545454),
 'C': (0.01818181818181818,
  0.0,
  0.0,
  0.03636363636363636,
  0.36363636363636365,
  0.18181818181818182,
  0.0,
  0.5454545454545454,
  0.05454545454545454,
  0.05454545454545454,
  0.23636363636363636,
  0.36363636363636365,
  0.5818181818181818,
  0.05454545454545454),
 'G': (0.01818181818181818,
  0.8181818181818182,
  0.16363636363636364,
  0.2,
  0.21818181818181817,
  0.16363636363636364,
  0.18181818181818182,
  0.0,
  0.07272727272727272,
  0.5636363636363636,
  0.01818181818181818,
  0.0,
  0.0,
  0.0),
 'T': (0.01818181818181818,
  0.0,
  0.8181818181818182,
  0.2909090909090909,
  0.34545454545454546,
  0.23636363636363636,
  0.5636363636363636,
  0.

In [5]:
# FASTA file holding the intergenic sequences; the path is relative to the
# notebook's working directory.
intergeniche="intergeniche_RefSeq/ortologhi/Chroococcidiopsis_sp._CCMEE_29_GCF_023558375_intergen.fasta"
# Keep, as plain strings, the sequences of the records whose description
# mentions "lexA" and "recA".
# NOTE(review): when several records match, the last one wins; when none
# matches, the variable is never assigned here (it is either undefined or keeps
# a value from an earlier run of the kernel).
for record in SeqIO.parse(intergeniche, "fasta"):
    if "lexA" in record.description:
        lexa=str(record.seq)
    if "recA" in record.description:
        reca=str(record.seq)

In [32]:
import math
def read_profile(profile_file):
    """Load a position weight matrix from a text file.

    The first line of the file is skipped (it is a title such as ">PWM").
    The following lines are paired, in order, with the bases A, C, G and T;
    each line is a whitespace-separated list of floats, one per motif position.

    :param profile_file: path of the profile file
    :return: a list with one dict per motif position, mapping base -> frequency
    """
    columns = []

    with open(profile_file, 'r') as handle:
        # Discard the title line.
        next(handle)

        for letter, row in zip(('A', 'C', 'G', 'T'), handle):
            values = [float(token) for token in row.split()]
            for position, value in enumerate(values):
                # Grow the column list the first time a position is seen.
                if position >= len(columns):
                    columns.append({})
                columns[position][letter] = value

    return columns

def calculate_p_ih(profile, i, base):
    """Return the profile frequency of ``base`` at position ``i``.

    :param profile: list of dicts (one per position) mapping base -> frequency
    :param i: index of the position in the profile
    :param base: key to look up in that position's dict
    :return: the stored frequency
    """
    return profile[i][base]
def calculate_q_bases(inter_TU_file):
    """Compute the relative frequency of A, C, G and T in a FASTA file.

    Every record is upper-cased before counting; symbols other than
    A, C, G, T are ignored and do not contribute to the total.

    :param inter_TU_file: path of the FASTA file to scan
    :return: dict mapping 'A', 'C', 'G', 'T' to their fraction of the counted bases
    """
    counts = dict.fromkeys('ACGT', 0)
    with open(inter_TU_file, 'r') as inter_TU_handle:
        for record in SeqIO.parse(inter_TU_handle, 'fasta'):
            for symbol in record.seq.upper():
                if symbol not in counts:
                    continue
                counts[symbol] += 1
    # Only the four counted bases make up the denominator.
    counted = sum(counts.values())
    return {symbol: count / counted for symbol, count in counts.items()}


def normalization_factor(n, q_bases):
    """Compute the constant used to normalise the per-column information content.

    The value is

        (n+1)/(n+4) * ln(n+1) - ln(n+4)
        - 1/(n+4) * sum_b ln(q_b) - n/(n+4) * ln(min_b q_b)

    :param n: number used in the formula above (the caller passes a sequence count)
    :param q_bases: dict mapping each base to its background frequency
    :return: the normalisation constant as a float
    """
    log_n1 = math.log(n + 1)
    log_n4 = math.log(n + 4)

    background = q_bases.values()
    log_sum = sum(map(math.log, background))
    log_min = math.log(min(background))

    weight = n + 4
    return ((n + 1) / weight) * log_n1 - log_n4 - (1 / weight) * log_sum - (n / weight) * log_min
def information_i_all_columns(profile, q_bases, a):
    """Return the normalised information content of every profile column.

    For each column the sum of ``frequency * ln(frequency / q_bases[base])`` is
    taken over the bases whose frequency is strictly positive (zero-frequency
    terms are skipped), and the sum is divided by ``a``.

    :param profile: list of dicts (one per position) mapping base -> frequency
    :param q_bases: dict mapping base -> background frequency
    :param a: normalisation constant every column is divided by
    :return: list of floats, one per column of ``profile``
    """
    info_contents = []
    for column in profile:
        column_total = 0
        for base, frequency in column.items():
            if frequency > 0:
                column_total += frequency * math.log(frequency / q_bases[base])
        info_contents.append(column_total / a)
    return info_contents


def SM_score_Antonio(sequence, profile, q_bases, a):
    """Find the best-scoring window of ``sequence`` against ``profile``.

    Every window of ``len(profile)`` characters is scored as the sum over its
    positions of ``I_j * ln(p_j(base) / q(base))``, where ``I_j`` is the
    normalised information content of column ``j``.  When either the profile
    frequency or the background frequency is 0, 1e-9 is added to both before
    taking the logarithm.  On ties the left-most window is kept.

    :param sequence: string to scan
    :param profile: list of dicts (one per position) mapping base -> frequency
    :param q_bases: dict mapping base -> background frequency
    :param a: normalisation constant passed to ``information_i_all_columns``
    :return: ``(max_score, best_substring, info_contents)``; when ``sequence``
        is shorter than the profile the result is
        ``(float('-inf'), "", info_contents)``, so the tuple always has three
        elements and can be unpacked uniformly by the caller
    :raises KeyError: if a window contains a character that is not a key of
        the corresponding profile column
    """
    motif_length = len(profile)
    # Computed before the length check: it depends only on the profile, and it
    # is part of the return value on every path.
    info_contents = information_i_all_columns(profile, q_bases, a)

    if len(sequence) < motif_length:
        return float('-inf'), "", info_contents

    epsilon = 1e-9  # added to both frequencies to avoid log(0) and division by 0
    max_score = float('-inf')
    best_substring = ""

    for start in range(len(sequence) - motif_length + 1):
        window = sequence[start:start + motif_length]
        score = 0
        for j, base in enumerate(window):
            p_ih = profile[j][base]
            # A base missing from the background is treated as frequency 0.
            q_base = q_bases.get(base, 0)
            if p_ih == 0 or q_base == 0:
                p_ih += epsilon
                q_base += epsilon
            score += info_contents[j] * math.log(p_ih / q_base)
        # Strict comparison: the first window reaching the maximum wins.
        if score > max_score:
            max_score = score
            best_substring = window
    return max_score, best_substring, info_contents

In [33]:
pwm_file = "/home/davide/Downloads/profile.fasta"
profile = read_profile(pwm_file)
q_bases = calculate_q_bases(intergeniche)
# `n` was used here without being assigned in any cell, so the notebook failed
# with NameError on a fresh kernel.  It is the number of sequences the motif
# was built from.
# NOTE(review): assumes profile.fasta was derived from the same sequences as
# `istanze` - confirm.
n = len(istanze)
a = normalization_factor(n, q_bases)
# SM_score_Antonio returns (best score, best window, per-column information content).
score, substring, entropyAnto = SM_score_Antonio(lexa, profile, q_bases, a)
print(f"SM score Antonio per lexA: {score}, Best substring: {substring}")

SM score Antonio per lexA: 4.103938476362669, Best substring: AGTACGAATGTTCT


In [45]:
import Bio
from math import log
# Use the base frequencies stored in q_bases as the background of the motif.
motivo.background=q_bases
# Very small pseudocount - presumably so that no PWM entry is exactly 0 before
# logarithms are taken; TODO confirm.
motivo.pseudocounts=1e-9
def relative_entropy(motivo: "Bio.motifs.Motif") -> tuple[list[float], float]:
    """
    Compute the normalised relative entropy of every position of the motif.

    For each position the sum over A, C, G, T of ``p * ln(p / q)`` is divided
    by the normalisation constant ``a``.  Terms with ``p == 0`` are skipped
    (they contribute 0 by convention) instead of failing in ``log(0)``.

    :param motivo: the motif; its ``background``, ``pwm``, ``instances`` and
        ``length`` attributes are read
    :return: a pair ``(entropy, a)`` - the list of normalised relative
        entropies (one per position) and the normalisation constant used
    """
    q = motivo.background  # base frequencies over all the intergenic sequences
    pwm = motivo.pwm  # per-position base probabilities of the motif
    n = len(motivo.instances)  # number of sequences the motif was built from
    a = (n + 1) / (n + 4) * log(n + 1) - log(n + 4) - 1 / (n + 4) * sum(log(q[b]) for b in "ACGT") - n / (
            n + 4) * log(min(q.values()))
    entropy = []
    for i in range(motivo.length):
        entropy.append(sum(pwm[b, i] * log(pwm[b, i] / q[b]) for b in "ACGT" if pwm[b, i] > 0) / a)
    return entropy, a


def sm(motivo: "Bio.motifs.Motif", seq: str, rel_entropy: list, pwm) -> tuple:
    """
    Compute the SM score of a sequence against a motif.

    Every window of ``motivo.length`` characters of ``seq`` is scored as the
    sum over its positions of ``rel_entropy[j] * ln(pwm[base, j] / q[base])``;
    on ties the left-most window is kept.

    :param motivo: the motif; its ``background`` and ``length`` are read
    :param seq: the sequence to scan
    :param rel_entropy: per-position weights (one per motif position)
    :param pwm: matrix indexed as ``pwm[base, position]``
    :return: ``(best score, best window, position)`` where position is the
        start index of the window minus ``len(seq)`` (described by the original
        author as the position relative to the transcription start).  When
        ``seq`` is shorter than the motif no window exists and the result is
        ``(-inf, "", -len(seq))``
    """
    q = motivo.background  # base frequencies over all the intergenic sequences
    width = motivo.length
    if len(seq) < width:
        # No window can be scored: do not return a truncated, unscored slice.
        return -float("inf"), "", -len(seq)

    best_score = -float("inf")
    best_start = 0
    for start in range(len(seq) - width + 1):
        window = seq[start:start + width]  # l-mer of the intergenic sequence
        score = sum(rel_entropy[j] * log(pwm[window[j], j] / q[window[j]]) for j in range(len(window)))
        if score > best_score:
            best_score = score
            best_start = start
    return best_score, seq[best_start:best_start + width], best_start - len(seq)
# relative_entropy returns the per-position normalised relative entropies and
# the normalisation constant used to scale them.
entropy,normDavide=relative_entropy(motivo)
pwm=motivo.pwm
# Best-scoring window of the lexA sequence; pos is the window's start index
# minus len(lexa), i.e. a negative offset counted from the end of the sequence.
score, substring, pos = sm(motivo, lexa, entropy, pwm)
print(f"SM score Davide per lexA: {score}, Best substring: {substring}, Posizione: {pos}")

SM score Davide per lexA: 4.104102571906228, Best substring: AGTACGAATGTTCT, Posizione: -47


In [46]:
entropy

[0.7334545025023469,
 0.7867277364393388,
 0.583125167673809,
 0.11620315376881432,
 0.13396677651334132,
 0.026133644248409554,
 0.24016523156243205,
 0.3252702116718817,
 0.16982951102066912,
 0.39120604500341133,
 0.13612429037340432,
 0.23054632405544337,
 0.4840198927302151,
 0.642311522071406]

In [47]:
entropyAnto

[0.7334651892900362,
 0.7867535051210478,
 0.5831076618197489,
 0.11614501530222203,
 0.1339872412551263,
 0.026141485470669183,
 0.24007729904352462,
 0.3253118865055721,
 0.1699158013862432,
 0.39110414025003587,
 0.13618183312074197,
 0.23054141285195787,
 0.4839433035928533,
 0.6423323484983088]