The aim of this script is to find a specific transcription factor binding site on the promoters of potato genes. In this case, the E2F transcription factor is used as an example, which binds to sequence variants of the E2F consensus sequence. For this, a table (GENE_ID_1500BP_PROMOTER_DICTIONARY.csv) was created previously, which contains the gene identification number and promoter sequence of each gene (see Python-pandas-potato-promoter-dictionary-project). Another table, containing all the possible sequence variants of the E2F binding site, was also prepared beforehand. With a nested loop, every E2F sequence variant is searched for in every promoter sequence, producing a combined table that contains the gene id, the particular E2F binding sequence variant and the promoter sequence. In the next step, the position of each binding site is also identified (indexing) and added to the table. These positions are given relative to the 3' end of the promoter, which corresponds to the translational start site.

In [1]:
import pandas as pd

In [2]:
""" import csv """

# Load the gene-id / promoter-sequence table from disk.
data1 = pd.read_csv("GENE_ID_1500BP_PROMOTER_DICTIONARY.csv")
promoters_df = pd.DataFrame(data1)

# Report the table size, then preview the first three rows.
promoter_row_count = len(promoters_df)
print("number of rows in promoters_df:", promoter_row_count)
promoters_df.head(3)

number of rows in promoters_df: 32819


Unnamed: 0,gene_id,sequence
0,Soltu.DM.01G000030,TTCAAATTTTGAAATAAAATAAATATAGATATGGAGCAAAAGTTGA...
1,Soltu.DM.01G000040,ATATATTTTCGTTCCCAGATCTCGCTCGCCACTCTCCCAGATCGCG...
2,Soltu.DM.01G000060,CTGATATGGAAGTGTGAAGGTGTACTAGACTACTAGTCAAATATTA...


In [3]:
""" import csv """

# Load the list of E2F binding-site sequence variants from disk.
data2 = pd.read_csv("E2F_sites_listed.csv")
E2F_binding_df = pd.DataFrame(data2)

# Report the table size, then preview the first three rows.
e2f_row_count = len(E2F_binding_df)
print("number of rows in E2F_binding_df:", e2f_row_count)
E2F_binding_df.head(3)

number of rows in E2F_binding_df: 64


Unnamed: 0,E2F_consensus
0,ATTGGCGG
1,ATTGGCGC
2,ATTGGCCG


-----


## Nested loop to iterate through the rows of promoters_df and E2F_binding_df

In [4]:
""" iteration of dfs """

# Collect every (gene, motif) hit in a plain list and build the DataFrame
# once at the end. Growing a DataFrame one row at a time with .loc copies
# data on every insert, which becomes very slow for tens of thousands of
# promoters.
e2f_motifs = E2F_binding_df["E2F_consensus"].tolist()  # read the motif column once, not once per promoter

hits = []
for gene_id, sequence in zip(promoters_df["gene_id"], promoters_df["sequence"]):
    for motif in e2f_motifs:
        # plain substring test: the motif occurs at least once in this promoter
        if motif in sequence:
            hits.append((gene_id, motif, sequence))

# One row per (gene, motif) pair, in promoter order and then motif order;
# rows are numbered 0..n-1.
df_combined = pd.DataFrame(hits, columns=["gene_id", "E2F_consensus", "sequence"])

count = len(df_combined)  # kept so that any later code reading `count` still sees the number of hits

In [5]:
# Report how many (gene, motif) rows were collected and preview the first three.
combined_row_count = len(df_combined)
print("number of rows in df_combined:", combined_row_count)
df_combined.head(3)

number of rows in df_combined: 13883


Unnamed: 0,gene_id,E2F_consensus,sequence
0,Soltu.DM.01G000030,TTTGGCGG,TTCAAATTTTGAAATAAAATAAATATAGATATGGAGCAAAAGTTGA...
1,Soltu.DM.01G000070,ATTGGCCC,ATGGGAATATTGAAATTTATTTTAGTTACTGCAGTCATTTCTTTAG...
2,Soltu.DM.01G000170,GGGGGAAA,TAGAGGTAAGGTCTGTGTACACGTTACTCTCATACACTATTTGTGT...


https://stackoverflow.com/questions/62876811/find-index-of-substring-within-string-from-a-dataframe

## Indexing E2F_sites

In [6]:
""" creating a new column with the indices of substrings """

# For each row, find where the motif starts inside its promoter: a 0-based
# index counted from the start of the stored sequence string.
# NOTE(review): str.index returns only the lowest matching index, so a motif
# that occurs several times in one promoter is recorded once - confirm this
# is intended.
def _motif_start(pair):
    promoter, motif = pair
    return str.index(promoter, motif)


df_combined['position'] = df_combined[['sequence', 'E2F_consensus']].apply(_motif_start, axis=1)
df_combined

Unnamed: 0,gene_id,E2F_consensus,sequence,position
0,Soltu.DM.01G000030,TTTGGCGG,TTCAAATTTTGAAATAAAATAAATATAGATATGGAGCAAAAGTTGA...,167
1,Soltu.DM.01G000070,ATTGGCCC,ATGGGAATATTGAAATTTATTTTAGTTACTGCAGTCATTTCTTTAG...,697
2,Soltu.DM.01G000170,GGGGGAAA,TAGAGGTAAGGTCTGTGTACACGTTACTCTCATACACTATTTGTGT...,83
3,Soltu.DM.01G000210,CGGGGAAT,TGATGCACTATGTTATTCCTGAGTACCACTTTCTTATGTGAACATT...,959
4,Soltu.DM.01G000210,CGGCGAAA,TGATGCACTATGTTATTCCTGAGTACCACTTTCTTATGTGAACATT...,85
...,...,...,...,...
13878,Soltu.DM.12G029980,GGGGGAAT,GTTTATGTTAATTTTAATTATTTGTCAGATTTCCCCATCATAAGAG...,1182
13879,Soltu.DM.12G029980,GGGCCAAA,GTTTATGTTAATTTTAATTATTTGTCAGATTTCCCCATCATAAGAG...,780
13880,Soltu.DM.12G030000,TTTCCCCC,GAATTTAATATAAAATCTTGAGGATAAACTTGTGTGTTGGAATATT...,924
13881,Soltu.DM.12G030170,TTTGCCCC,CCCTGGTAAAGATGGAAATGTTACTTCCACAAGTAGGCTTCTTCCA...,1479


In [7]:
""" creating the 'position from ATG' column """

# Convert the 0-based start index into a coordinate relative to the 3' end of
# the promoter (the translational start site). The offset is each promoter's
# own length rather than a fixed 1500, so a promoter that is not exactly
# 1500 bp long still gets the correct distance. For 1500 bp promoters the
# result is unchanged (e.g. 167 -> -1333).
promoter_lengths = df_combined['sequence'].str.len()
df_combined['Position_from_ATG'] = df_combined['position'] - promoter_lengths
df_combined

Unnamed: 0,gene_id,E2F_consensus,sequence,position,Position_from_ATG
0,Soltu.DM.01G000030,TTTGGCGG,TTCAAATTTTGAAATAAAATAAATATAGATATGGAGCAAAAGTTGA...,167,-1333
1,Soltu.DM.01G000070,ATTGGCCC,ATGGGAATATTGAAATTTATTTTAGTTACTGCAGTCATTTCTTTAG...,697,-803
2,Soltu.DM.01G000170,GGGGGAAA,TAGAGGTAAGGTCTGTGTACACGTTACTCTCATACACTATTTGTGT...,83,-1417
3,Soltu.DM.01G000210,CGGGGAAT,TGATGCACTATGTTATTCCTGAGTACCACTTTCTTATGTGAACATT...,959,-541
4,Soltu.DM.01G000210,CGGCGAAA,TGATGCACTATGTTATTCCTGAGTACCACTTTCTTATGTGAACATT...,85,-1415
...,...,...,...,...,...
13878,Soltu.DM.12G029980,GGGGGAAT,GTTTATGTTAATTTTAATTATTTGTCAGATTTCCCCATCATAAGAG...,1182,-318
13879,Soltu.DM.12G029980,GGGCCAAA,GTTTATGTTAATTTTAATTATTTGTCAGATTTCCCCATCATAAGAG...,780,-720
13880,Soltu.DM.12G030000,TTTCCCCC,GAATTTAATATAAAATCTTGAGGATAAACTTGTGTGTTGGAATATT...,924,-576
13881,Soltu.DM.12G030170,TTTGCCCC,CCCTGGTAAAGATGGAAATGTTACTTCCACAAGTAGGCTTCTTCCA...,1479,-21


In [8]:
""" combine two columns """

# Turn the numeric offset into text so it can be concatenated with the motif.
df_combined["Position_from_ATG"] = df_combined["Position_from_ATG"].astype(str)

# Label each hit as "<motif> (<offset>)", e.g. "TTTGGCGG (-1333)".
offset_in_parentheses = " (" + df_combined["Position_from_ATG"] + ")"
df_combined["E2F_elements"] = df_combined["E2F_consensus"] + offset_in_parentheses
df_combined

Unnamed: 0,gene_id,E2F_consensus,sequence,position,Position_from_ATG,E2F_elements
0,Soltu.DM.01G000030,TTTGGCGG,TTCAAATTTTGAAATAAAATAAATATAGATATGGAGCAAAAGTTGA...,167,-1333,TTTGGCGG (-1333)
1,Soltu.DM.01G000070,ATTGGCCC,ATGGGAATATTGAAATTTATTTTAGTTACTGCAGTCATTTCTTTAG...,697,-803,ATTGGCCC (-803)
2,Soltu.DM.01G000170,GGGGGAAA,TAGAGGTAAGGTCTGTGTACACGTTACTCTCATACACTATTTGTGT...,83,-1417,GGGGGAAA (-1417)
3,Soltu.DM.01G000210,CGGGGAAT,TGATGCACTATGTTATTCCTGAGTACCACTTTCTTATGTGAACATT...,959,-541,CGGGGAAT (-541)
4,Soltu.DM.01G000210,CGGCGAAA,TGATGCACTATGTTATTCCTGAGTACCACTTTCTTATGTGAACATT...,85,-1415,CGGCGAAA (-1415)
...,...,...,...,...,...,...
13878,Soltu.DM.12G029980,GGGGGAAT,GTTTATGTTAATTTTAATTATTTGTCAGATTTCCCCATCATAAGAG...,1182,-318,GGGGGAAT (-318)
13879,Soltu.DM.12G029980,GGGCCAAA,GTTTATGTTAATTTTAATTATTTGTCAGATTTCCCCATCATAAGAG...,780,-720,GGGCCAAA (-720)
13880,Soltu.DM.12G030000,TTTCCCCC,GAATTTAATATAAAATCTTGAGGATAAACTTGTGTGTTGGAATATT...,924,-576,TTTCCCCC (-576)
13881,Soltu.DM.12G030170,TTTGCCCC,CCCTGGTAAAGATGGAAATGTTACTTCCACAAGTAGGCTTCTTCCA...,1479,-21,TTTGCCCC (-21)


In [9]:
""" merge rows with the same Gene_ID strings, the CDF1_consensus has to be in order, based on the position values"""

# Order the hits gene by gene and, within each gene, by the motif's start
# index in the promoter, so that the E2F elements of a gene are later joined
# in positional order.
sort_keys = ['gene_id', 'position']
df_combined_2 = df_combined.sort_values(by=sort_keys)
df_combined_2

Unnamed: 0,gene_id,E2F_consensus,sequence,position,Position_from_ATG,E2F_elements
7014,Soltu.DM.01G000010,GGGCCAAA,AGTGTTTTGCTATGTTATATGGATTATTATCATGAAATAAATAAAA...,1199,-301,GGGCCAAA (-301)
0,Soltu.DM.01G000030,TTTGGCGG,TTCAAATTTTGAAATAAAATAAATATAGATATGGAGCAAAAGTTGA...,167,-1333,TTTGGCGG (-1333)
7015,Soltu.DM.01G000050,GGGGCAAA,AAAATCCGATCGAGTTTCATGGTCATATAGATATTTGAAGATAAAT...,1430,-70,GGGGCAAA (-70)
1,Soltu.DM.01G000070,ATTGGCCC,ATGGGAATATTGAAATTTATTTTAGTTACTGCAGTCATTTCTTTAG...,697,-803,ATTGGCCC (-803)
7016,Soltu.DM.01G000160,GGGCGAAT,TTATTAAAATTAGGCCAAAAGACATTCTCCTCACATCACTTTAATT...,255,-1245,GGGCGAAT (-1245)
...,...,...,...,...,...,...
7011,Soltu.DM.12G030120,GCGGGAAA,TAAATAGTAAACCAACCTCATATCAGAATTAAGCAACACCTCAGTT...,475,-1025,GCGGGAAA (-1025)
7012,Soltu.DM.12G030130,GCGGCAAT,CACATTGAATTGGAGGATTTATGAAGACATTAGTGCAAATAATAAG...,906,-594,GCGGCAAT (-594)
7013,Soltu.DM.12G030160,TTTGCCCC,AGGGTCCAACGCACTGCTCGATAACAGCTTAAAATACAATTCTTTG...,253,-1247,TTTGCCCC (-1247)
13881,Soltu.DM.12G030170,TTTGCCCC,CCCTGGTAAAGATGGAAATGTTACTTCCACAAGTAGGCTTCTTCCA...,1479,-21,TTTGCCCC (-21)


In [10]:
""" drop multiple columns """

# Remove the four working columns that are not needed in the exported table.
working_columns = ['E2F_consensus', 'sequence', 'position', 'Position_from_ATG']
df_combined_3 = df_combined_2.drop(columns=working_columns)
df_combined_3

Unnamed: 0,gene_id,E2F_elements
7014,Soltu.DM.01G000010,GGGCCAAA (-301)
0,Soltu.DM.01G000030,TTTGGCGG (-1333)
7015,Soltu.DM.01G000050,GGGGCAAA (-70)
1,Soltu.DM.01G000070,ATTGGCCC (-803)
7016,Soltu.DM.01G000160,GGGCGAAT (-1245)
...,...,...
7011,Soltu.DM.12G030120,GCGGGAAA (-1025)
7012,Soltu.DM.12G030130,GCGGCAAT (-594)
7013,Soltu.DM.12G030160,TTTGCCCC (-1247)
13881,Soltu.DM.12G030170,TTTGCCCC (-21)


In [11]:
""" merge _consensus columns (use reset index) """

# Collapse all rows of one gene into a single comma-separated string of
# E2F elements; reset_index turns gene_id back into an ordinary column.
def _join_elements(elements):
    return ', '.join(elements.astype(str))


elements_by_gene = df_combined_3.groupby(['gene_id'])['E2F_elements']
df_combined_4 = elements_by_gene.apply(_join_elements).reset_index()
df_combined_4

Unnamed: 0,gene_id,E2F_elements
0,Soltu.DM.01G000010,GGGCCAAA (-301)
1,Soltu.DM.01G000030,TTTGGCGG (-1333)
2,Soltu.DM.01G000050,GGGGCAAA (-70)
3,Soltu.DM.01G000070,ATTGGCCC (-803)
4,Soltu.DM.01G000160,GGGCGAAT (-1245)
...,...,...
10860,Soltu.DM.12G030120,GCGGGAAA (-1025)
10861,Soltu.DM.12G030130,GCGGCAAT (-594)
10862,Soltu.DM.12G030160,TTTGCCCC (-1247)
10863,Soltu.DM.12G030170,TTTGCCCC (-21)


In [12]:
""" Export table to csv """

# Write the per-gene summary to disk; index=False leaves the row numbers out of the file.
output_csv = 'E2F_SITES_ON_POTATO_PROMOTERS.csv'
df_combined_4.to_csv(output_csv, index=False)