In [27]:
import pandas as pd


def revcomp(seq):
    """Return the reverse complement of a nucleotide sequence.

    Bases are complemented preserving their case, and IUPAC ambiguity codes
    (N, R, Y, K, M, S, W, B, D, H, V) are mapped to their complementary code,
    so soft-masked (lower-case) or ambiguous sequences no longer raise
    ``KeyError``. Any character outside the IUPAC alphabet is passed through
    unchanged.

    :param seq: the sequence (a str, or an object whose ``str()`` is the sequence)
    :return: the reverse complement of the sequence, as a str
    """
    bases = "ACGTRYKMSWBDHVN"
    complements = "TGCAYRMKSWVHDBN"
    table = str.maketrans(bases + bases.lower(), complements + complements.lower())
    # str() first: sequence objects may define their own, unrelated translate().
    return str(seq)[::-1].translate(table)
def get_score(s1):
    """Score a candidate motif from its half-sites, palindromicity and gap length.

    The motif is compared position by position with its own reverse
    complement: positions where both strings carry the same base are
    palindromic. The score is the sum of
      * 1 point if s1[4:7] is "AGT";
      * 1 point if s1[-7:-4] is "ACT" or, failing that, s1[-6:-3] is "CTA";
      * 0.2 points per palindromic pair (a pair shows up at both of its
        positions, hence the division by 2);
      * 1 point for an 8 bp gap, 0.5 for 7 or 9 bp, 0.2 for any other length,
        the gap being s1[7:-7].

    ::param s1: the motif (the slices used assume 4 flanking bases on each side)
    ::return: alignment, number of matches, gap, info, score
    """
    rc = revcomp(s1)
    total = 0
    notes = []

    # Left half-site, right after the first 4 bases.
    if s1[4:7] == "AGT":
        notes.append("1 pt: per AGT, ")
        total += 1

    # Right half-site: the first window that matches earns the point, the
    # other is then not considered.
    for label, window in (("ACT", s1[-7:-4]), ("CTA", s1[-6:-3])):
        if window == label:
            notes.append(f"1 pt: per {label}, ")
            total += 1
            break

    # Palindromic positions: the motif agrees with its reverse complement.
    agree = [a == b for a, b in zip(s1, rc)]
    alignment = "".join(base if same else "-" for base, same in zip(s1, agree))
    pali = sum(agree)
    notes.append(f"0.2*{pali}/2={round(0.2*pali/2,2)} pts: {pali}  basi palindrome, ")
    total += 0.2*pali/2

    # Gap between the two 3-base half-sites: 8 bp earns the full point, 7 or
    # 9 bp half a point, anything else a flat 0.2.
    gap = len(s1[7:-7])
    if gap == 8:
        label, points = "1", 1
    elif gap in (7, 9):
        label, points = "0.5", 0.5
    else:
        label, points = "0.2", 0.2
    notes.append(f"{label} pt: per {gap} bp gap")
    total += points

    return alignment, pali, gap, "".join(notes), total

# Load the annotation table and rename its "sequence_name" column to
# "locus_tag", the key used to join it with the motif table.
df_ann = (
    pd.read_csv("/home/davide/Desktop/CCMEE29Annotazioni.csv")
    .rename(columns={"sequence_name": "locus_tag"})
)


In [28]:
import re

from Bio import SeqIO

file="/home/davide/PycharmProjects/TESI2/intergeniche_RefSeq/Chroococcidiopsis_sp._CCMEE_29_GCF_023558375_intergen.fasta"

# Motif variants: 4 flanking bases, a 3-base left half-site, a 4-11 bp gap,
# a 3-base right half-site and 4 more flanking bases. Each variant relaxes one
# position of the AGT ... ACT pair.
motivi= [
    ".{4}AG[ATCG].{4,11}ACT.{4}",
    ".{4}AGT.{4,11}A[AT]T.{4}",
    ".{4}AGT.{4,11}AC[AT].{4}",
    ".{4}AGT.{4,11}[TG]CT.{4}",
    ".{4}TGT.{4,11}ACT.{4}"
]
# Compile once instead of handing the pattern strings to `re` for every record.
motivi_compilati = [re.compile(m) for m in motivi]

# locus id -> list of (motif, rounded score, position, score breakdown)
diz={}
# locus id -> list of 1-tuples with the score breakdown, one per raw hit.
# Kept for backward compatibility; the table below derives "info_score" from
# `diz` so that it stays aligned with the de-duplicated, sorted motif list.
infos={}
for record in SeqIO.parse(file, "fasta"):
    # Upper-case so soft-masked sequences are searched and scored like any other.
    seq=str(record.seq).upper()
    for pattern in motivi_compilati:
        # NOTE(review): finditer reports non-overlapping matches only and
        # `.{4,11}` is greedy, so for a given start the longest admissible gap
        # wins and overlapping candidates (possibly with a better-scoring gap)
        # are not reported. Behaviour left unchanged here.
        for match in pattern.finditer(seq):
            hit=match.group()
            *_, info, score = get_score(hit)
            # Show the 4-base flanks in lower case and the core in upper case.
            shown=hit[:4].lower()+hit[4:-4]+hit[-4:].lower()
            # Start of the core (match start + 4-base flank) counted back from
            # the end of the record's sequence, hence a negative number.
            pos=match.start()-len(seq)+4
            diz.setdefault(record.id, []).append((shown,round(score,2),pos,info))
            infos.setdefault(record.id, []).append((info,))

df=pd.DataFrame(diz.items(),columns=["locus_tag","motivi"])
# The same hit can be found by several motif variants: drop the duplicates,
# then order by score (best first). Ties are broken by position and sequence
# so the order no longer depends on set iteration order and is reproducible.
hits_ranked=df["motivi"].apply(
    lambda hits: sorted(set(hits), key=lambda h: (-h[1], h[2], h[0]))
)
# Split each hit into its breakdown text (new column) and its first three fields.
df["info_score"]=hits_ranked.apply(lambda hits: [(h[3],) for h in hits])
df["motivi"]=hits_ranked.apply(lambda hits: [h[:3] for h in hits])
# The best hit is first after sorting, so its score is the locus maximum.
df["max_score"]=df["motivi"].apply(lambda hits: hits[0][1])
df=df.sort_values("max_score",ascending=False)
df

Unnamed: 0,locus_tag,motivi,info_score,max_score
4552,LAU37_RS31200,"[(acatAGTTCAAATGAACTatgt, 5.0, -83), (ctgaAGTA...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*20/2=2.0 p...",5.0
1017,LAU37_RS14595,"[(tactAGTACATATGTACTataa, 4.8, -207), (ttttAGT...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*18/2=1.8 p...",4.8
3075,LAU37_RS14590,"[(ttatAGTACATATGTACTagta, 4.8, -157), (acatAGC...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*18/2=1.8 p...",4.8
614,LAU37_RS09175,"[(agtgAGTGTGAGAAAACTcact, 4.6, -76), (tatcAGTA...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*16/2=1.6 p...",4.6
1519,LAU37_RS21095,"[(aaccAGTACTTAATAACTggta, 4.6, -26), (cctgAGCA...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*16/2=1.6 p...",4.6
...,...,...,...,...
2722,LAU37_RS09445,"[(gatgAGTATGCAATggcg, 1.6, -21)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",1.6
677,LAU37_RS09890,"[(tggaAGTAACGATTcggg, 1.6, -14)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",1.6
2888,LAU37_RS11720,"[(tcgtAGTTACGATTgaag, 1.6, -124)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",1.6
2646,LAU37_RS08415,"[(tgctAGTCCCTCGCTcctt, 1.6, -17)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",1.6


In [29]:
# Attach the annotation columns to the motif table by locus_tag. This is an
# inner join: only locus tags present in both tables are kept.
df = df.merge(df_ann, how="inner", on="locus_tag")
df

Unnamed: 0,locus_tag,motivi,info_score,max_score,protein_id,product,gene,GO_process,GO_component,GO_function,GO_process_pannzer2,KEGG_brite,KEGG_pathways
0,LAU37_RS31200,"[(acatAGTTCAAATGAACTatgt, 5.0, -83), (ctgaAGTA...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*20/2=2.0 p...",5.0,WP_250126493.1,DEAD/DEAH box helicase,,,,['GO:0003676 - nucleic acid binding [Evidence ...,,,
1,LAU37_RS14595,"[(tactAGTACATATGTACTataa, 4.8, -207), (ttttAGT...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*18/2=1.8 p...",4.8,WP_250121243.1,ERF family protein,,,,,,,
2,LAU37_RS14590,"[(ttatAGTACATATGTACTagta, 4.8, -157), (acatAGC...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*18/2=1.8 p...",4.8,WP_250121242.1,FAD-dependent oxidoreductase,,,,['GO:0016491 - oxidoreductase activity [Eviden...,,,
3,LAU37_RS09175,"[(agtgAGTGTGAGAAAACTcact, 4.6, -76), (tatcAGTA...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*16/2=1.6 p...",4.6,WP_250125273.1,glycosyltransferase,,,,,,,
4,LAU37_RS21095,"[(aaccAGTACTTAATAACTggta, 4.6, -26), (cctgAGCA...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*16/2=1.6 p...",4.6,WP_250122450.1,hypothetical protein,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4369,LAU37_RS09445,"[(gatgAGTATGCAATggcg, 1.6, -21)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",1.6,WP_275983384.1,sugar O-acetyltransferase,,,,,,,
4370,LAU37_RS09890,"[(tggaAGTAACGATTcggg, 1.6, -14)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",1.6,WP_250125408.1,ATP-dependent sacrificial sulfur transferase LarE,larE,,,['GO:0016783 - sulfurtransferase activity [Evi...,,,
4371,LAU37_RS11720,"[(tcgtAGTTACGATTgaag, 1.6, -124)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",1.6,WP_250125738.1,AraC family transcriptional regulator,,,,,,ko03000 Transcription factors,
4372,LAU37_RS08415,"[(tgctAGTCCCTCGCTcctt, 1.6, -17)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",1.6,WP_250125131.1,HAD family hydrolase,,,,,,,


In [30]:
# Select the rows whose KEGG_brite text contains "DNA repair". Missing values
# are replaced by empty strings first (in df itself) so the substring test
# returns a plain boolean mask.
df["KEGG_brite"] = df["KEGG_brite"].fillna("")
has_repair_term = df["KEGG_brite"].str.contains("DNA repair")
df_filtered = df.loc[has_repair_term]
df_filtered

Unnamed: 0,locus_tag,motivi,info_score,max_score,protein_id,product,gene,GO_process,GO_component,GO_function,GO_process_pannzer2,KEGG_brite,KEGG_pathways
5,LAU37_RS00385,"[(caacAGTGCAAATGTACTgtta, 4.6, -60), (cttcAGTT...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*16/2=1.6 p...",4.6,WP_250123666.1,SMC family ATPase,,,,,"['GO:0006302 - double-strand break repair', 'G...",ko03400 DNA repair and recombination proteins,
19,LAU37_RS01815,"[(aaatAGTTCATCTGTACTattc, 4.6, -48), (cgttAGTC...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*16/2=1.6 p...",4.6,WP_250123933.1,DNA polymerase III subunit gamma/tau,,,,,"['GO:0071897 - DNA biosynthetic process', 'GO:...","ko03032 DNA replication proteins, ko03400 DNA ...","03030 DNA replication, 03430 Mismatch repair, ..."
75,LAU37_RS08865,"[(tctcAGTCACTTGTTACTgtac, 4.2, -95), (ttcgAGCT...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*12/2=1.2 p...",4.2,WP_250125213.1,RecQ family ATP-dependent DNA helicase,,['GO:0006310 - DNA recombination [Evidence IEA]'],,['GO:0004386 - helicase activity [Evidence IEA]'],GO:0006310 - DNA recombination,"ko03019 Messenger RNA biogenesis, ko03400 DNA ...",03018 RNA degradation
80,LAU37_RS14470,"[(ttgtAGTACGAATGTTCTatag, 4.2, -47), (tgccAGTG...","[(1 pt: per AGT, 1 pt: per CTA, 0.2*12/2=1.2 p...",4.2,WP_250121220.1,transcriptional repressor LexA,lexA,['GO:0006282 - regulation of DNA repair [Evide...,,['GO:0004252 - serine-type endopeptidase activ...,,"ko01002 Peptidases and inhibitors, ko03400 DNA...",
97,LAU37_RS23580,"[(aaagAGTTATTTATCACTcaaa, 4.2, -48), (gggtAGTT...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*12/2=1.2 p...",4.2,WP_250122904.1,primosomal protein N',priA,['GO:0006260 - DNA replication [Evidence IEA]'...,,['GO:0003678 - DNA helicase activity [Evidence...,['GO:0006268 - DNA unwinding involved in DNA r...,ko03400 DNA repair and recombination proteins,03440 Homologous recombination
101,LAU37_RS03995,"[(acaaAGTCAAACTAAACTaggt, 4.2, -121), (aactAGG...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*12/2=1.2 p...",4.2,WP_250124336.1,DNA repair protein RecN,recN,"['GO:0006281 - DNA repair [Evidence IEA]', 'GO...",['GO:0005737 - cytoplasm [Evidence IEA]'],['GO:0005524 - ATP binding [Evidence IEA]'],"['GO:0006310 - DNA recombination', 'GO:0006281...",ko03400 DNA repair and recombination proteins,
134,LAU37_RS16010,"[(aaatAGTATATCTGCACTagtc, 4.2, -66), (atctAGTA...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*12/2=1.2 p...",4.2,WP_250121493.1,recombinase RecA,recA,"['GO:0006281 - DNA repair [Evidence IEA]', 'GO...",['GO:0005737 - cytoplasm [Evidence IEA]'],"['GO:0003677 - DNA binding [Evidence IEA]', 'G...","['GO:0009432 - SOS response', 'GO:0006310 - DN...",ko03400 DNA repair and recombination proteins,03440 Homologous recombination
141,LAU37_RS16880,"[(atgtAGTAAATTTGTACTaatc, 4.2, -101), (acgaTGT...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*12/2=1.2 p...",4.2,WP_250121656.1,ATP-dependent RecD-like DNA helicase,,,,['GO:0004386 - helicase activity [Evidence IEA]'],"['GO:0032508 - DNA duplex unwinding', 'GO:0006...",ko03400 DNA repair and recombination proteins,03440 Homologous recombination
204,LAU37_RS16025,"[(tagcAGTAAAACTTCACTctag, 4.0, -35), (tagcAGTA...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*10/2=1.0 p...",4.0,WP_250121496.1,ATP-dependent DNA helicase,,,,,,ko03400 DNA repair and recombination proteins,
205,LAU37_RS18910,"[(ttatAGTACATTGTTACTacct, 4.0, -40), (gagcAGTA...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*10/2=1.0 p...",4.0,WP_250122036.1,deoxyribodipyrimidine photo-lyase,,,,,['GO:0006139 - nucleobase-containing compound ...,ko03400 DNA repair and recombination proteins,


In [31]:
# Load the two "evi" tables, tag every row with the strain named in the file,
# and stack them into a single table (row labels are kept as read, so they
# repeat across the two halves).
dfeviPCC7120 = pd.read_csv("/home/davide/Desktop/eviPCC7120.csv").assign(species="PCC7120")
dfeviPCC6803 = pd.read_csv("/home/davide/Desktop/eviPCC6803.csv").assign(species="PCC6803")
dfevi = pd.concat([dfeviPCC7120, dfeviPCC6803])

In [32]:
#merge df with dfeviPCC7120 on the protein_id
df=pd.merge(df,dfevi,left_on="protein_id",right_on="qseqid",how="left")
df.drop(columns=["max_score","qseqid"],inplace=True)
df.to_excel("/home/davide/Desktop/CCMEE29MotiviEvi.xlsx",index=False)
df

Unnamed: 0,locus_tag,motivi,info_score,protein_id,product,gene,GO_process,GO_component,GO_function,GO_process_pannzer2,KEGG_brite,KEGG_pathways,sseqid,old_locus_tag,score,Reference,Note,species
0,LAU37_RS31200,"[(acatAGTTCAAATGAACTatgt, 5.0, -83), (ctgaAGTA...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*20/2=2.0 p...",WP_250126493.1,DEAD/DEAH box helicase,,,,['GO:0003676 - nucleic acid binding [Evidence ...,,,,,,,,,
1,LAU37_RS14595,"[(tactAGTACATATGTACTataa, 4.8, -207), (ttttAGT...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*18/2=1.8 p...",WP_250121243.1,ERF family protein,,,,,,,,,,,,,
2,LAU37_RS14590,"[(ttatAGTACATATGTACTagta, 4.8, -157), (acatAGC...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*18/2=1.8 p...",WP_250121242.1,FAD-dependent oxidoreductase,,,,['GO:0016491 - oxidoreductase activity [Eviden...,,,,,,,,,
3,LAU37_RS09175,"[(agtgAGTGTGAGAAAACTcact, 4.6, -76), (tatcAGTA...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*16/2=1.6 p...",WP_250125273.1,glycosyltransferase,,,,,,,,BAL28615.1 hypothetical protein sll1723 SYNGTI...,sll1723,identity:32.0%|length:175|evalue:1.97e-14,"Kizawa et al., Front. microbiol. 2016",Genes whose regulation by LexA has been valida...,PCC6803
4,LAU37_RS21095,"[(aaccAGTACTTAATAACTggta, 4.6, -26), (cctgAGCA...","[(1 pt: per AGT, 1 pt: per ACT, 0.2*16/2=1.6 p...",WP_250122450.1,hypothetical protein,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4389,LAU37_RS09445,"[(gatgAGTATGCAATggcg, 1.6, -21)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",WP_275983384.1,sugar O-acetyltransferase,,,,,,,,,,,,,
4390,LAU37_RS09890,"[(tggaAGTAACGATTcggg, 1.6, -14)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",WP_250125408.1,ATP-dependent sacrificial sulfur transferase LarE,larE,,,['GO:0016783 - sulfurtransferase activity [Evi...,,,,,,,,,
4391,LAU37_RS11720,"[(tcgtAGTTACGATTgaag, 1.6, -124)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",WP_250125738.1,AraC family transcriptional regulator,,,,,,ko03000 Transcription factors,,,,,,,
4392,LAU37_RS08415,"[(tgctAGTCCCTCGCTcctt, 1.6, -17)]","[(1 pt: per AGT, 0.2*4/2=0.4 pts: 4 basi pali...",WP_250125131.1,HAD family hydrolase,,,,,,,,,,,,,


In [33]:
# Same left join with the evidence table, applied to the DNA-repair subset,
# then exported without the join key copy and the helper max_score column.
df_filtered = (
    df_filtered.merge(dfevi, how="left", left_on="protein_id", right_on="qseqid")
    .drop(columns=["max_score", "qseqid"])
)
df_filtered.to_excel("/home/davide/Desktop/CCMEE29MotiviEviRiparo.xlsx", index=False)


In [34]:
# Write df to an Excel workbook, keeping the row index as the first column.
df.to_excel(excel_writer="/home/davide/Desktop/CCMEE29Motivivariabili.xlsx", index=True)