In [151]:
import pandas as pd


def revcomp(seq):
    """Compute the reverse complement of a sequence
    ::param seq: the sequence
    ::return: the reverse complement of the sequence
    """
    seq=seq.upper()
    seq=seq[::-1]
    diz={'A':'T','T':'A','C':'G','G':'C'}
    seq="".join([diz[x] for x in seq])
    return seq
def get_score(s1,lat=4):
    """Compute the score of the motif using alignment between sequence and reverse complement of the sequence to find palindromes and use gaps
    between the two sequences
    ::param s1: the motif
    ::return: alignment, number of matches, gap, info, score
    """
    s1=s1.upper()
    s2=revcomp(s1)
    score=0
    info=""
    if s1[lat:lat+3]=="AGT":
        info+="1 pt: per AGT, "
        score+=1
    if s1[-lat-3:-lat]=="ACT":
        info+="1 pt: per ACT, "
        score+=1
    elif s1[-lat-2:-lat+1]=="CTA" or s1[-lat-2:]=="CTA":
        info+="1 pt: per CTA, "
        score+=1
    pali=0
    al=""
    for i,j in zip(s1,s2):
        if i==j:
            pali+=1
            al+=i
        else:
            al+="-"
    info+=f"0.2*{pali}/2={round(0.2*pali/2,2)} pts: {pali}  basi palindrome, "
    score+=0.2*pali/2
    #score the gap between s1[7] and s1[-7]
    gap=len(s1[lat+3:-lat-3])
    #if the gap is  8 give max score else igive less considering the distance from 8 for example if it is 7 or 9 give hte same score but less
    if gap==8:
        info+="1 pt: per 8 bp gap"
        score+=1
    elif gap==7 or gap==9:
        info+=f"0.5 pt: per {gap} bp gap"
        score+=0.5
    else:
        info+=f"0.2 pt: per {gap} bp gap"
        score+=0.2
    return al,pali,gap,info,score

df_ann=pd.read_csv("/home/davide/Desktop/CCMEE29Annotazioni.csv")
#rename column sequence_name to locus_tag
df_ann=df_ann.rename(columns={"sequence_name":"locus_tag"})


In [170]:

file="/home/davide/PycharmProjects/TESI2/intergeniche_RefSeq/ortologhi/Chroococcidiopsis_sp._CCMEE_29_GCF_023558375_intergen.fasta"
import regex as re
i=1
# Define the motifs
motivi= [
    f".{{{i}}}AG[ATCG].{{4,11}}ACT.{{{i}}}",
f".{{{i}}}AGT.{{4,11}}A[AT]T.{{{i}}}",
f".{{{i}}}AGT.{{4,11}}AC[AT].{{{i}}}",
f".{{{i}}}AGT.{{4,11}}[TG]CT.{{{i}}}",
f".{{{i}}}TGT.{{4,11}}ACT.{{{i}}}"
]
from Bio import SeqIO
#parse the fasta file
diz={}
infos={}
for record in SeqIO.parse(file, "fasta"):
    seq=str(record.seq)
    #use the finditer method to find all the matches of the regular expression
    for m in motivi:
        for match in re.finditer(m, str(seq), overlapped=True):
            x=match.group()
            s,e=match.start(),match.end()
            scores=get_score(x,i)
            x=x[:i].lower()+x[i:-i]+x[-i:].lower()
            # print(x,s,e,seq[s],seq[e-1])
            #store the match in a dictionary
            pos=s-len(seq)+i
            if record.id in diz:
                diz[record.id].append((x,pos,round(scores[-1],2),scores[-2]))
                infos[record.id].append((scores[-2],))
            else:
                diz[record.id]=[(x,pos,round(scores[-1],2),scores[-2])]
                infos[record.id]=[(scores[-2],)]
df=pd.DataFrame(diz.items(),columns=["locus_tag","motivi"])
#remove duplicates from the list of motifs
df["motivi"]=df["motivi"].apply(lambda x:list(set(x)))
df["motivi"]=df["motivi"].apply(lambda x:sorted(x,key=lambda y:y[1],reverse=True))
#extract the fourth element for every tuple in the list motivi annd make new column info_score
df["info_score_variabile"]=df["motivi"].apply(lambda x:[(y[3],) for y in x])
#delete the fourth element from the list motivi
df["motivi"]=df["motivi"].apply(lambda x:[y[:3] for y in x])
df["max_score"]=df["motivi"].apply(lambda x:x[0][1])
df=df.sort_values("max_score",ascending=False)
df

Unnamed: 0,locus_tag,motivi,info_score_variabile,max_score
4073,LAU37_RS29025,"[(cAGGAGGTACTg, -11, 2.0), (cAGCTTTTCTAATACTa,...","[(1 pt: per ACT, 0.2*8/2=0.8 pts: 8 basi pali...",-11
2498,LAU37_RS07130,"[(gAGATAAGACTc, -11, 1.8), (tAGGAGATAAGACTc, -...","[(1 pt: per ACT, 0.2*6/2=0.6 pts: 6 basi pali...",-11
2136,LAU37_RS02330,"[(aAGCTATCACTg, -11, 1.8), (aTGTCCACCAGTGCCACT...","[(1 pt: per ACT, 0.2*6/2=0.6 pts: 6 basi pali...",-11
3020,LAU37_RS14090,"[(aAGTTGAAATTt, -11, 2.0), (cAGTTAAGTTGAAATTt,...","[(1 pt: per AGT, 0.2*8/2=0.8 pts: 8 basi pali...",-11
1466,LAU37_RS20500,"[(aAGTTTCCAATt, -11, 1.8), (cAGTTAAGTTTCCAATt,...","[(1 pt: per AGT, 0.2*6/2=0.6 pts: 6 basi pali...",-11
...,...,...,...,...
563,LAU37_RS08560,"[(aAGTTGTTTTTGCTg, -271, 1.9)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",-271
1264,LAU37_RS17985,"[(gAGTAGCCAAACAACAa, -276, 2.1), (cAGCAGTGAACT...","[(1 pt: per AGT, 0.2*6/2=0.6 pts: 6 basi pali...",-276
4217,LAU37_RS27705,"[(cAGTAGTTAACTg, -280, 3.0)]","[(1 pt: per AGT, 1 pt: per ACT, 0.2*8/2=0.8 pt...",-280
3491,LAU37_RS20835,"[(tAGGGTTGGATAACTc, -293, 2.6)]","[(1 pt: per ACT, 0.2*6/2=0.6 pts: 6 basi pali...",-293


In [171]:
#merge df with df_ann on the locus_tag and index
df_score4=pd.read_csv("/home/davide/PycharmProjects/TESI2/scorerefined4.txt",sep="\t",header=None,skiprows=1)
df_score4.columns=["locus_tag","score","score_refined","motivo","posizione"]
df=pd.merge(df,df_score4,on="locus_tag",how="outer")
df=pd.merge(df,df_ann,on="locus_tag")
#dropna for posizione column
df.dropna(subset=["posizione"],inplace=True)    

In [172]:
dfeviPCC7120=pd.read_csv("/home/davide/Desktop/eviPCC7120.csv")
dfeviPCC6803=pd.read_csv("/home/davide/Desktop/eviPCC6803.csv")
dfeviPCC7120["species"]="PCC7120"
dfeviPCC6803["species"]="PCC6803"
dfevi=pd.concat([dfeviPCC7120,dfeviPCC6803])
#rename the column score to info_blastp
dfevi=dfevi.rename(columns={"score":"info_blastp"})
df=pd.merge(df,dfevi,left_on="protein_id",right_on="qseqid",how="left")
df.drop(columns=["max_score","qseqid"],inplace=True)
#sort the dataframe by score
df=df.sort_values("score_refined",ascending=False)
df.set_index("locus_tag",inplace=True)
df
# df.to_excel("/home/davide/Desktop/CCMEE29MotiviEvi.xlsx",index=False)

Unnamed: 0_level_0,motivi,info_score_variabile,score,score_refined,motivo,posizione,protein_id,product,gene,GO_process,...,GO_function,GO_process_pannzer2,KEGG_brite,KEGG_pathways,sseqid,old_locus_tag,info_blastp,Reference,Note,species
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LAU37_RS16010,"[(tAGTCTAGACTc, -52, 3.2), (tAGTATATCTGCACTa, ...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*10/2=1.0 p...",10.516146,21.032292,TAGTATATCTGCACTA,-67.0,WP_250121493.1,recombinase RecA,recA,"['GO:0006281 - DNA repair [Evidence IEA]', 'GO...",...,"['GO:0003677 - DNA binding [Evidence IEA]', 'G...","['GO:0009432 - SOS response', 'GO:0006310 - DN...",ko03400 DNA repair and recombination proteins,03440 Homologous recombination,"WP_010997423.1,gene:recA(all3272),locus_tag:PC...",all3272,identity:88.889%|length:342|evalue:0.00e+00,"Kumar et al., BBA 2018",Genes whose regulation by LexA has been valida...,PCC7120
LAU37_RS14470,"[(tAGTACGAATGTTCTa, -47, 4.0), (cAGTGGGGGCATTt...","[(1 pt: per AGT, 1 pt: per CTA, 0.2*10/2=1.0 p...",10.210154,20.212213,TAGTACGAATGTTCTA,-48.0,WP_250121220.1,transcriptional repressor LexA,lexA,['GO:0006282 - regulation of DNA repair [Evide...,...,['GO:0004252 - serine-type endopeptidase activ...,,"ko01002 Peptidases and inhibitors, ko03400 DNA...",,BAL28941.1 SOS function regulatory protein Lex...,sll1626,identity:50.0%|length:202|evalue:1.76e-71,"Kamei, A. et al., 2001",Genes whose regulation by LexA has been valida...,PCC6803
LAU37_RS14470,"[(tAGTACGAATGTTCTa, -47, 4.0), (cAGTGGGGGCATTt...","[(1 pt: per AGT, 1 pt: per CTA, 0.2*10/2=1.0 p...",10.210154,20.212213,TAGTACGAATGTTCTA,-48.0,WP_250121220.1,transcriptional repressor LexA,lexA,['GO:0006282 - regulation of DNA repair [Evide...,...,['GO:0004252 - serine-type endopeptidase activ...,,"ko01002 Peptidases and inhibitors, ko03400 DNA...",,"WP_010999034.1,gene:lexA(alr4908),locus_tag:PC...",alr4908,identity:81.095%|length:201|evalue:1.01e-124,"Kumar et al., BBA 2018",Genes whose regulation by LexA has been valida...,PCC7120
LAU37_RS14595,"[(tAGTTGGCGTCTCTAGCTt, -39, 2.2), (tAGTACTACTT...","[(1 pt: per AGT, 0.2*10/2=1.0 pts: 10 basi pa...",10.422293,19.279943,TAGTACATATGTACTA,-208.0,WP_250121243.1,ERF family protein,,,...,,,,,,,,,,
LAU37_RS21505,"[(gAGTACACTTGTACTg, -28, 4.2), (tAGAGTACACTTGT...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*12/2=1.2 p...",9.179889,17.917018,GAGTACACTTGTACTG,-29.0,WP_250122517.1,single-stranded DNA-binding protein,,,...,,GO:0006260 - DNA replication,"ko03032 DNA replication proteins, ko03400 DNA ...","03030 DNA replication, 03430 Mismatch repair, ...","WP_010994265.1,gene:(alr0088),locus_tag:PCC712...",alr0088,identity:80.374%|length:107|evalue:1.67e-61,"Kumar et al., BBA 2018",Genes whose regulation by LexA has been valida...,PCC7120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LAU37_RS13085,,,-10.630395,-15.873265,ATTTTTAAGAGGAAAC,-17.0,WP_250125981.1,30S ribosomal protein S8,rpsH,['GO:0006412 - translation [Evidence IEA]'],...,['GO:0003735 - structural constituent of ribos...,,ko03011 Ribosome,03010 Ribosome,,,,,,
LAU37_RS26905,,,-10.738444,-15.877757,TCAACCCACAGCAAAG,-18.0,WP_250123482.1,DUF1822 family protein,,,...,,,,,,,,,,
LAU37_RS02120,,,-17.290457,-17.290457,CTTAAAGGAATAGAAG,-16.0,WP_250123991.1,SDR family oxidoreductase,,,...,,,,,BAL28787.1 3-oxoacyl-[acyl-carrier protein] re...,slr0886,identity:33.607%|length:244|evalue:1.82e-27,Kizawa et al. TPJ 2017,Genes whose regulation by LexA has been valida...,PCC6803
LAU37_RS22000,,,-11.853510,-17.696938,TCTAAGGAATTAAGCT,-16.0,WP_250122606.1,50S ribosomal protein L20,rplT,['GO:0006412 - translation [Evidence IEA]'],...,['GO:0003735 - structural constituent of ribos...,['GO:0000027 - ribosomal large subunit assembl...,ko03011 Ribosome,03010 Ribosome,,,,,,


In [174]:
df.to_excel("/home/davide/Desktop/CCMEE29MotiviTutticonScoreedEvidenze.xlsx",index=True)

In [173]:
def add_tuple(row):
    # Define the tuple you want to add
    new_tuple = (row["motivo"], int(row["posizione"]), round(row["score"],2), round(row["score_refined"],2))


    if type(row["motivi"]) is list:
        # If "motivi" is not NaN, append the new tuple to the existing list
        row["motivi"].insert(0,new_tuple)
    else:
        row["motivi"] = [new_tuple]

    return row

# Apply the function to each row in the DataFrame
df = df.apply(add_tuple, axis=1)
df=df.sort_values("score_refined",ascending=False)
df.drop(columns=["posizione",],inplace=True)
df

Unnamed: 0_level_0,motivi,info_score_variabile,score,score_refined,motivo,protein_id,product,gene,GO_process,GO_component,GO_function,GO_process_pannzer2,KEGG_brite,KEGG_pathways,sseqid,old_locus_tag,info_blastp,Reference,Note,species
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
LAU37_RS16010,"[(TAGTATATCTGCACTA, -67, 10.52, 21.03), (tAGTC...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*10/2=1.0 p...",10.516146,21.032292,TAGTATATCTGCACTA,WP_250121493.1,recombinase RecA,recA,"['GO:0006281 - DNA repair [Evidence IEA]', 'GO...",['GO:0005737 - cytoplasm [Evidence IEA]'],"['GO:0003677 - DNA binding [Evidence IEA]', 'G...","['GO:0009432 - SOS response', 'GO:0006310 - DN...",ko03400 DNA repair and recombination proteins,03440 Homologous recombination,"WP_010997423.1,gene:recA(all3272),locus_tag:PC...",all3272,identity:88.889%|length:342|evalue:0.00e+00,"Kumar et al., BBA 2018",Genes whose regulation by LexA has been valida...,PCC7120
LAU37_RS14470,"[(TAGTACGAATGTTCTA, -48, 10.21, 20.21), (TAGTA...","[(1 pt: per AGT, 1 pt: per CTA, 0.2*10/2=1.0 p...",10.210154,20.212213,TAGTACGAATGTTCTA,WP_250121220.1,transcriptional repressor LexA,lexA,['GO:0006282 - regulation of DNA repair [Evide...,,['GO:0004252 - serine-type endopeptidase activ...,,"ko01002 Peptidases and inhibitors, ko03400 DNA...",,BAL28941.1 SOS function regulatory protein Lex...,sll1626,identity:50.0%|length:202|evalue:1.76e-71,"Kamei, A. et al., 2001",Genes whose regulation by LexA has been valida...,PCC6803
LAU37_RS14470,"[(TAGTACGAATGTTCTA, -48, 10.21, 20.21), (TAGTA...","[(1 pt: per AGT, 1 pt: per CTA, 0.2*10/2=1.0 p...",10.210154,20.212213,TAGTACGAATGTTCTA,WP_250121220.1,transcriptional repressor LexA,lexA,['GO:0006282 - regulation of DNA repair [Evide...,,['GO:0004252 - serine-type endopeptidase activ...,,"ko01002 Peptidases and inhibitors, ko03400 DNA...",,"WP_010999034.1,gene:lexA(alr4908),locus_tag:PC...",alr4908,identity:81.095%|length:201|evalue:1.01e-124,"Kumar et al., BBA 2018",Genes whose regulation by LexA has been valida...,PCC7120
LAU37_RS14595,"[(TAGTACATATGTACTA, -208, 10.42, 19.28), (tAGT...","[(1 pt: per AGT, 0.2*10/2=1.0 pts: 10 basi pa...",10.422293,19.279943,TAGTACATATGTACTA,WP_250121243.1,ERF family protein,,,,,,,,,,,,,
LAU37_RS21505,"[(GAGTACACTTGTACTG, -29, 9.18, 17.92), (gAGTAC...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*12/2=1.2 p...",9.179889,17.917018,GAGTACACTTGTACTG,WP_250122517.1,single-stranded DNA-binding protein,,,,,GO:0006260 - DNA replication,"ko03032 DNA replication proteins, ko03400 DNA ...","03030 DNA replication, 03430 Mismatch repair, ...","WP_010994265.1,gene:(alr0088),locus_tag:PCC712...",alr0088,identity:80.374%|length:107|evalue:1.67e-61,"Kumar et al., BBA 2018",Genes whose regulation by LexA has been valida...,PCC7120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LAU37_RS13085,"[(ATTTTTAAGAGGAAAC, -17, -10.63, -15.87)]",,-10.630395,-15.873265,ATTTTTAAGAGGAAAC,WP_250125981.1,30S ribosomal protein S8,rpsH,['GO:0006412 - translation [Evidence IEA]'],['GO:0005840 - ribosome [Evidence IEA]'],['GO:0003735 - structural constituent of ribos...,,ko03011 Ribosome,03010 Ribosome,,,,,,
LAU37_RS26905,"[(TCAACCCACAGCAAAG, -18, -10.74, -15.88)]",,-10.738444,-15.877757,TCAACCCACAGCAAAG,WP_250123482.1,DUF1822 family protein,,,,,,,,,,,,,
LAU37_RS02120,"[(CTTAAAGGAATAGAAG, -16, -17.29, -17.29)]",,-17.290457,-17.290457,CTTAAAGGAATAGAAG,WP_250123991.1,SDR family oxidoreductase,,,,,,,,BAL28787.1 3-oxoacyl-[acyl-carrier protein] re...,slr0886,identity:33.607%|length:244|evalue:1.82e-27,Kizawa et al. TPJ 2017,Genes whose regulation by LexA has been valida...,PCC6803
LAU37_RS22000,"[(TCTAAGGAATTAAGCT, -16, -11.85, -17.7)]",,-11.853510,-17.696938,TCTAAGGAATTAAGCT,WP_250122606.1,50S ribosomal protein L20,rplT,['GO:0006412 - translation [Evidence IEA]'],['GO:0005840 - ribosome [Evidence IEA]'],['GO:0003735 - structural constituent of ribos...,['GO:0000027 - ribosomal large subunit assembl...,ko03011 Ribosome,03010 Ribosome,,,,,,


In [163]:
from Bio import motifs
from math import log
motivo=motifs.parse(open("/home/davide/Documents/motivo8recalexassbuvrbzoops/meme.xml"),"meme")[1]
def heterology_index(counts:dict[str:list],seqs:str) -> float:
    """
    Calcola l'heterology index rispetto a un motivo come riportato da (Lewis et al. 1994, J. Mol. Biol. 241)
    :param counts: un dizionario con il conteggio delle basi per ogni posizione del motivo
    :param seqs:  una sequenza di nucleotidi della stessa lunghezza del motivo di cui calcolare l'heterology index
    :return:    l'heterology index della sequenza rispetto al motivo
    """
    heterology = 0
    for i,c in enumerate(seqs):
        cons=max(counts[c][i] for c in counts)
        actual = counts[seqs[i]][i]
        heterology += log((cons+0.5)/(actual+0.5))
    return heterology
heterology_index(motivo.counts,motivo.consensus)
#add to df column heterology_index apllying heterology_index(motivo.counts,df["motivo"])
df["heterology_index"]=df["motivo"].apply(lambda x:heterology_index(motivo.counts,x))
df

Unnamed: 0_level_0,motivi,info_score_variabile,score,score_refined,motivo,protein_id,product,gene,GO_process,GO_component,GO_function,GO_process_pannzer2,KEGG_brite,KEGG_pathways,heterology_index
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
LAU37_RS16010,"[(TAGTATATCTGCACTA, -67, 10.52, 21.03), (tAGTC...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*10/2=1.0 p...",10.516146,21.032292,TAGTATATCTGCACTA,WP_250121493.1,recombinase RecA,recA,"['GO:0006281 - DNA repair [Evidence IEA]', 'GO...",['GO:0005737 - cytoplasm [Evidence IEA]'],"['GO:0003677 - DNA binding [Evidence IEA]', 'G...","['GO:0009432 - SOS response', 'GO:0006310 - DN...",ko03400 DNA repair and recombination proteins,03440 Homologous recombination,1.813523
LAU37_RS14470,"[(TAGTACGAATGTTCTA, -48, 10.21, 20.21), (tAGTA...","[(1 pt: per AGT, 1 pt: per CTA, 0.2*10/2=1.0 p...",10.210154,20.212213,TAGTACGAATGTTCTA,WP_250121220.1,transcriptional repressor LexA,lexA,['GO:0006282 - regulation of DNA repair [Evide...,,['GO:0004252 - serine-type endopeptidase activ...,,"ko01002 Peptidases and inhibitors, ko03400 DNA...",,3.339579
LAU37_RS14595,"[(TAGTACATATGTACTA, -208, 10.42, 19.28), (tAGT...","[(1 pt: per AGT, 0.2*10/2=1.0 pts: 10 basi pa...",10.422293,19.279943,TAGTACATATGTACTA,WP_250121243.1,ERF family protein,,,,,,,,2.476904
LAU37_RS21505,"[(GAGTACACTTGTACTG, -29, 9.18, 17.92), (gAGTAC...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*12/2=1.2 p...",9.179889,17.917018,GAGTACACTTGTACTG,WP_250122517.1,single-stranded DNA-binding protein,,,,,GO:0006260 - DNA replication,"ko03032 DNA replication proteins, ko03400 DNA ...","03030 DNA replication, 03430 Mismatch repair, ...",7.093027
LAU37_RS30680,"[(CAGTACATTTGTACTA, -137, 10.16, 17.83), (tAGC...","[(1 pt: per ACT, 0.2*6/2=0.6 pts: 6 basi pali...",10.155354,17.826257,CAGTACATTTGTACTA,WP_250126418.1,VOC family protein,,,,,,,,4.855475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LAU37_RS13085,"[(ATTTTTAAGAGGAAAC, -17, -10.63, -15.87)]",,-10.630395,-15.873265,ATTTTTAAGAGGAAAC,WP_250125981.1,30S ribosomal protein S8,rpsH,['GO:0006412 - translation [Evidence IEA]'],['GO:0005840 - ribosome [Evidence IEA]'],['GO:0003735 - structural constituent of ribos...,,ko03011 Ribosome,03010 Ribosome,36.716126
LAU37_RS26905,"[(TCAACCCACAGCAAAG, -18, -10.74, -15.88)]",,-10.738444,-15.877757,TCAACCCACAGCAAAG,WP_250123482.1,DUF1822 family protein,,,,,,,,35.556634
LAU37_RS02120,"[(CTTAAAGGAATAGAAG, -16, -17.29, -17.29)]",,-17.290457,-17.290457,CTTAAAGGAATAGAAG,WP_250123991.1,SDR family oxidoreductase,,,,,,,,52.309748
LAU37_RS22000,"[(TCTAAGGAATTAAGCT, -16, -11.85, -17.7)]",,-11.853510,-17.696938,TCTAAGGAATTAAGCT,WP_250122606.1,50S ribosomal protein L20,rplT,['GO:0006412 - translation [Evidence IEA]'],['GO:0005840 - ribosome [Evidence IEA]'],['GO:0003735 - structural constituent of ribos...,['GO:0000027 - ribosomal large subunit assembl...,ko03011 Ribosome,03010 Ribosome,37.671638


In [164]:
df.to_excel("/home/davide/Desktop/CCMEE29MotiviTutticonScore.xlsx",index=True)


In [ ]:
df=pd.erge

In [ ]:
#add to the list of df["motivi"] for every row a tuple motivo,score,score_refined,pos

In [37]:
s="tAGTACGAATGTTCTa"
print(get_score(s,1))
print(s[-2:0])

('TAG-AC----GT-CTA', 10, 8, '1 pt: per AGT, 0.2*10/2=1.0 pts: 10  basi palindrome, 1 pt: per 8 bp gap', 3.0)


In [46]:
print(s[-3:-0])



In [62]:
#parse the fasta file and find the sequence for id LAU37_RS30680
for record in SeqIO.parse(file, "fasta"):
    if record.id=="LAU37_RS30680":
        print(record.seq)
        seq=str(record.seq)
        break
        

ATTTCAACTTCCGTTGTGTGTCCTTGATCACCTGCGTGTGCCTGCAACGGTCTTGGGGTATTGAACTGTCTAGAAACGATTTAGAACAACTTCAACACGCTTTGTGGTCTTTCTTAAGATAGATTTTGCCAGTCAGCAGATGGGTTGCAGTACATTTGTACTACTGCCGGGTTCTGAGTAGTCCCATCATGTCACGCGGTTCCTAGTCCCTTGGGACGGCTTCGCCTAAGGAGTAGCAATCCGCACTAATCGCATTCCTCAATCCTTTATTCAAGGAGTACACC


In [75]:
motifs=[]
seq="AGTAAAAACTACTACTACT"
import regex as re
motivi= [
    "AGT.{4,10}ACT"]
for m in motivi:
    for match in re.finditer(m, seq, overlapped=True):
        x=match.group()
        s,e=match.start(),match.end()
        scores=get_score(x,1)
        x=x[:1].lower()+x[1:-1]+x[-1:].lower()
        motifs.append((x,s,scores[-1]))
        print(scores[-1])
print(motifs)
print(set(motifs))
matches=re.findall(motivi[0],seq,overlapped=True)
print(matches)

2.0
[('aGTAAAAACTACTACt', 0, 2.0)]
{('aGTAAAAACTACTACt', 0, 2.0)}
['AGTAAAAACTACTACT']
