In [2]:
import pandas as pd
from Bio import SeqIO
import re
from Bio.Seq import Seq

### First we need to get intergenic regions from every genome
#### This can be done with different tools

In [None]:
# e.g with bash script

# #!/bin/bash

# ref_fasta=$1
# ref_gff=$2

# tag=$(basename $ref_fasta)

# awk '$3 != "region"' $ref_gff > ${tag}_2.gff
# gff2bed < ${tag}_2.gff > ${tag}_genome_sorted.bed
# bioawk -c fastx '{print $name"\t"length($seq)}' $ref_fasta > ${tag}_genome.bed
# awk '!seen[$3]++' ${tag}_genome_sorted.bed > ${tag}_genome_sorted_2.bed
# bedtools complement -i ${tag}_genome_sorted_2.bed -g ${tag}_genome.bed > ${tag}_intergenic.bed
# echo "DONE"

In [2]:
intervals = pd.read_csv('intergenic.bed', sep='\t', names=['contig', 'start', 'end'])
intervals

Unnamed: 0,contig,start,end
0,contig_2,0,586
1,contig_2,1426,1797
2,contig_2,2328,2561
3,contig_2,4022,4040
4,contig_2,6956,7424
...,...,...,...
3434,contig_4,4677270,4677418
3435,contig_4,4683050,4683090
3436,contig_4,4684353,4684811
3437,contig_4,4687811,4687894


In [3]:
intervals.replace(to_replace = 0, value = 1, inplace=True)

In [4]:
intervals = intervals[intervals.end-intervals.start >= 20]

In [5]:
intervals

Unnamed: 0,contig,start,end
0,contig_2,1,586
1,contig_2,1426,1797
2,contig_2,2328,2561
4,contig_2,6956,7424
5,contig_2,8645,8740
...,...,...,...
3434,contig_4,4677270,4677418
3435,contig_4,4683050,4683090
3436,contig_4,4684353,4684811
3437,contig_4,4687811,4687894


In [134]:
def fragment_extract(species, fasta, contig, start, end):
    """
    extracts genome fragment for each pair of coordinates given
    writes output in fasta file
    :return: None
    """
    with open(fasta, 'r', encoding='utf-8') as inp:
        records = list(SeqIO.parse(inp, "fasta"))
    for rec in records:
        if rec.id == contig:
            gen_slice = rec.seq[start-1:end]
            with open(f'{species}_intergenic.fasta', 'a') as out:
                out.write(f'>{contig}*{start}-{end}\n{gen_slice}\n')

In [8]:
for _, row in intervals.iterrows():
    fragment_extract('b_uniformis','/home/nastya/Bacteroides_uniformis_annotations/metatranscriptomics/b_uni_type.fa', row.contig, row.start, row.end)

new assembly
ASM1106v1 - b theta
ASM1829216v1 - b uniformis CL
ASM2073544v1 - prevotella copri
ASM358576v1 - pontibacter actiniarum
ASM1251635v1 - Chitinophaga oryzae
ASM1227305v1 - Parabacteroides distasonis
ASM2009150v1 - b faecis
ASM1829212v1 - Bacteroides cellulosilyticus

In [11]:
intervals = pd.read_csv('intergenic/b_theta_intergenic.bed', sep='\t', names=['contig', 'start', 'end'])
intervals.replace(to_replace = 0, value = 1, inplace=True)
intervals = intervals[intervals.end-intervals.start >= 20]
for _, row in intervals.iterrows():
    fragment_extract('b_theta','/home/nastya/Bacteroides_uniformis_annotations/genome_foot/b_theta.fasta', row.contig, row.start, row.end)

In [12]:
intervals = pd.read_csv('intergenic/b_uni_CL_intergenic.bed', sep='\t', names=['contig', 'start', 'end'])
intervals.replace(to_replace = 0, value = 1, inplace=True)
intervals = intervals[intervals.end-intervals.start >= 20]
for _, row in intervals.iterrows():
    fragment_extract('b_uni_CL','/home/nastya/Bacteroides_uniformis_annotations/genome_foot/b_uni_CL.fasta', row.contig, row.start, row.end)

In [13]:
intervals = pd.read_csv('intergenic/C_oryzae_intergenic.bed', sep='\t', names=['contig', 'start', 'end'])
intervals.replace(to_replace = 0, value = 1, inplace=True)
intervals = intervals[intervals.end-intervals.start >= 20]
for _, row in intervals.iterrows():
    fragment_extract('C_oryzae','/home/nastya/Bacteroides_uniformis_annotations/genome_foot/C_oryzae.fna', row.contig, row.start, row.end)

In [14]:
intervals = pd.read_csv('intergenic/p_actiniarum_intergenic.bed', sep='\t', names=['contig', 'start', 'end'])
intervals.replace(to_replace = 0, value = 1, inplace=True)
intervals = intervals[intervals.end-intervals.start >= 20]
for _, row in intervals.iterrows():
    fragment_extract('p_actiniarum','/home/nastya/Bacteroides_uniformis_annotations/genome_foot/p_actiniarum.fasta', row.contig, row.start, row.end)

In [15]:
intervals = pd.read_csv('intergenic/p_copri_intergenic.bed', sep='\t', names=['contig', 'start', 'end'])
intervals.replace(to_replace = 0, value = 1, inplace=True)
intervals = intervals[intervals.end-intervals.start >= 20]
for _, row in intervals.iterrows():
    fragment_extract('p_copri','/home/nastya/Bacteroides_uniformis_annotations/genome_foot/p_copri.fasta', row.contig, row.start, row.end)

In [16]:
intervals = pd.read_csv('intergenic/P_distasonis_intergenic.bed', sep='\t', names=['contig', 'start', 'end'])
intervals.replace(to_replace = 0, value = 1, inplace=True)
intervals = intervals[intervals.end-intervals.start >= 20]
for _, row in intervals.iterrows():
    fragment_extract('P_distasonis','/home/nastya/Bacteroides_uniformis_annotations/genome_foot/P_distasonis.fasta', row.contig, row.start, row.end)

In [135]:
intervals = pd.read_csv('intergenic/p_actiniarum_intergenic.bed', sep='\t', names=['contig', 'start', 'end'])
intervals.replace(to_replace = 0, value = 1, inplace=True)
intervals = intervals[intervals.end-intervals.start >= 20]
for _, row in intervals.iterrows():
    fragment_extract('p_actiniarum','p_actiniarum.fasta', row.contig, row.start, row.end)

In [None]:
# use gff to find coordinates of canonical SOS genes

In [81]:
data = pd.read_csv('b_theta.gff3', comment='#', sep='\t', names=['ref', 'prodigal', 'CDS', 'start', 'end', 'row', 'strand', 'row2', 'annot'], engine='python')
data = data[data.CDS == 'gene']
data

Unnamed: 0,ref,prodigal,CDS,start,end,row,strand,row2,annot
1,NC_004663.1,RefSeq,gene,93,710,.,+,.,ID=gene-BT_RS00005;Name=BT_RS00005;gbkey=Gene;...
3,NC_004663.1,RefSeq,gene,783,1778,.,+,.,ID=gene-BT_RS00010;Name=BT_RS00010;gbkey=Gene;...
5,NC_004663.1,RefSeq,gene,1872,2648,.,+,.,ID=gene-BT_RS00015;Name=BT_RS00015;gbkey=Gene;...
7,NC_004663.1,RefSeq,gene,2783,3451,.,+,.,ID=gene-BT_RS00020;Name=BT_RS00020;gbkey=Gene;...
9,NC_004663.1,RefSeq,gene,3700,3996,.,+,.,ID=gene-BT_RS00025;Name=rpsQ;gbkey=Gene;gene=r...
...,...,...,...,...,...,...,...,...,...
9763,NC_004663.1,RefSeq,gene,6255871,6256884,.,+,.,ID=gene-BT_RS24060;Name=traJ;gbkey=Gene;gene=t...
9767,NC_004663.1,RefSeq,gene,6257528,6257818,.,+,.,ID=gene-BT_RS24070;Name=BT_RS24070;gbkey=Gene;...
9769,NC_004663.1,RefSeq,gene,6257802,6258647,.,+,.,ID=gene-BT_RS24075;Name=BT_RS24075;gbkey=Gene;...
9771,NC_004663.1,RefSeq,gene,6258660,6259064,.,+,.,ID=gene-BT_RS24080;Name=traM;gbkey=Gene;gene=t...


In [82]:
data = data[['start', 'end', 'strand','annot']]
data

Unnamed: 0,start,end,strand,annot
1,93,710,+,ID=gene-BT_RS00005;Name=BT_RS00005;gbkey=Gene;...
3,783,1778,+,ID=gene-BT_RS00010;Name=BT_RS00010;gbkey=Gene;...
5,1872,2648,+,ID=gene-BT_RS00015;Name=BT_RS00015;gbkey=Gene;...
7,2783,3451,+,ID=gene-BT_RS00020;Name=BT_RS00020;gbkey=Gene;...
9,3700,3996,+,ID=gene-BT_RS00025;Name=rpsQ;gbkey=Gene;gene=r...
...,...,...,...,...
9763,6255871,6256884,+,ID=gene-BT_RS24060;Name=traJ;gbkey=Gene;gene=t...
9767,6257528,6257818,+,ID=gene-BT_RS24070;Name=BT_RS24070;gbkey=Gene;...
9769,6257802,6258647,+,ID=gene-BT_RS24075;Name=BT_RS24075;gbkey=Gene;...
9771,6258660,6259064,+,ID=gene-BT_RS24080;Name=traM;gbkey=Gene;gene=t...


In [83]:
data = data.reset_index()[['start', 'end', 'strand','annot']]
data

Unnamed: 0,start,end,strand,annot
0,93,710,+,ID=gene-BT_RS00005;Name=BT_RS00005;gbkey=Gene;...
1,783,1778,+,ID=gene-BT_RS00010;Name=BT_RS00010;gbkey=Gene;...
2,1872,2648,+,ID=gene-BT_RS00015;Name=BT_RS00015;gbkey=Gene;...
3,2783,3451,+,ID=gene-BT_RS00020;Name=BT_RS00020;gbkey=Gene;...
4,3700,3996,+,ID=gene-BT_RS00025;Name=rpsQ;gbkey=Gene;gene=r...
...,...,...,...,...
4739,6255871,6256884,+,ID=gene-BT_RS24060;Name=traJ;gbkey=Gene;gene=t...
4740,6257528,6257818,+,ID=gene-BT_RS24070;Name=BT_RS24070;gbkey=Gene;...
4741,6257802,6258647,+,ID=gene-BT_RS24075;Name=BT_RS24075;gbkey=Gene;...
4742,6258660,6259064,+,ID=gene-BT_RS24080;Name=traM;gbkey=Gene;gene=t...


In [84]:
data[['index', '1', '2', '3', '4', '5', '6', '7']] = data.annot.str.split(";", expand = True)
data

Unnamed: 0,start,end,strand,annot,index,1,2,3,4,5,6,7
0,93,710,+,ID=gene-BT_RS00005;Name=BT_RS00005;gbkey=Gene;...,ID=gene-BT_RS00005,Name=BT_RS00005,gbkey=Gene,gene_biotype=protein_coding,locus_tag=BT_RS00005,old_locus_tag=BT0001%2CBT_0001,,
1,783,1778,+,ID=gene-BT_RS00010;Name=BT_RS00010;gbkey=Gene;...,ID=gene-BT_RS00010,Name=BT_RS00010,gbkey=Gene,gene_biotype=protein_coding,locus_tag=BT_RS00010,old_locus_tag=BT0002%2CBT_0002,,
2,1872,2648,+,ID=gene-BT_RS00015;Name=BT_RS00015;gbkey=Gene;...,ID=gene-BT_RS00015,Name=BT_RS00015,gbkey=Gene,gene_biotype=protein_coding,locus_tag=BT_RS00015,old_locus_tag=BT0003%2CBT_0003,,
3,2783,3451,+,ID=gene-BT_RS00020;Name=BT_RS00020;gbkey=Gene;...,ID=gene-BT_RS00020,Name=BT_RS00020,gbkey=Gene,gene_biotype=protein_coding,locus_tag=BT_RS00020,old_locus_tag=BT0004%2CBT_0004,,
4,3700,3996,+,ID=gene-BT_RS00025;Name=rpsQ;gbkey=Gene;gene=r...,ID=gene-BT_RS00025,Name=rpsQ,gbkey=Gene,gene=rpsQ,gene_biotype=protein_coding,locus_tag=BT_RS00025,,
...,...,...,...,...,...,...,...,...,...,...,...,...
4739,6255871,6256884,+,ID=gene-BT_RS24060;Name=traJ;gbkey=Gene;gene=t...,ID=gene-BT_RS24060,Name=traJ,gbkey=Gene,gene=traJ,gene_biotype=protein_coding,locus_tag=BT_RS24060,old_locus_tag=BT4774%2CBT_4774,
4740,6257528,6257818,+,ID=gene-BT_RS24070;Name=BT_RS24070;gbkey=Gene;...,ID=gene-BT_RS24070,Name=BT_RS24070,gbkey=Gene,gene_biotype=protein_coding,locus_tag=BT_RS24070,old_locus_tag=BT4777%2CBT_4777,,
4741,6257802,6258647,+,ID=gene-BT_RS24075;Name=BT_RS24075;gbkey=Gene;...,ID=gene-BT_RS24075,Name=BT_RS24075,gbkey=Gene,gene_biotype=protein_coding,locus_tag=BT_RS24075,old_locus_tag=BT4778%2CBT_4778,,
4742,6258660,6259064,+,ID=gene-BT_RS24080;Name=traM;gbkey=Gene;gene=t...,ID=gene-BT_RS24080,Name=traM,gbkey=Gene,gene=traM,gene_biotype=protein_coding,locus_tag=BT_RS24080,old_locus_tag=BT4779%2CBT_4779,


In [None]:
data = data[['index','start', 'end', 'strand']]

In [32]:
data['index'] = data['index'].str.strip('ID=gene-')
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['index'] = data['index'].str.strip('ID=gene-')


Unnamed: 0,index,start,end,strand
0,BT_RS00005,93,710,+
1,BT_RS00010,783,1778,+
2,BT_RS00015,1872,2648,+
3,BT_RS00020,2783,3451,+
4,BT_RS00025,3700,3996,+
...,...,...,...,...
4739,BT_RS24060,6255871,6256884,+
4740,BT_RS24070,6257528,6257818,+
4741,BT_RS24075,6257802,6258647,+
4742,BT_RS24080,6258660,6259064,+


In [13]:
gen = pd.read_csv('gene_names_Lucia_2.tsv', sep='\t')
gen

Unnamed: 0,index,gene
0,BT_RS00005,WP_011107050.1
1,BT_RS00010,WP_009039994.1
2,BT_RS00015,WP_009039993.1
3,BT_RS00020,WP_008647129.1
4,BT_RS00025,rpsQ
...,...,...
4747,BT_RS24070,WP_009040011.1
4748,BT_RS24075,WP_011109426.1
4749,BT_RS24080,traM
4750,BT_RS24085,traN


In [33]:
gen_borders = pd.merge(data, gen, on='index', how='left')
gen_borders

Unnamed: 0,index,start,end,strand,gene
0,BT_RS00005,93,710,+,WP_011107050.1
1,BT_RS00010,783,1778,+,WP_009039994.1
2,BT_RS00015,1872,2648,+,WP_009039993.1
3,BT_RS00020,2783,3451,+,WP_008647129.1
4,BT_RS00025,3700,3996,+,rpsQ
...,...,...,...,...,...
4739,BT_RS24060,6255871,6256884,+,traJ
4740,BT_RS24070,6257528,6257818,+,WP_009040011.1
4741,BT_RS24075,6257802,6258647,+,WP_011109426.1
4742,BT_RS24080,6258660,6259064,+,traM


In [24]:
gene_list = ['recA', 'dinB', 'recN', 'nusA', 'dnaK', 'ruvA', 'uvrB', 'uvrA', 'polA']

In [45]:
gen_borders[gen_borders.gene.isin(gene_list)]

Unnamed: 0,index,start,end,strand,gene
551,BT_RS02795,705641,707674,-,uvrB
559,BT_RS02835,714379,717204,+,uvrA
1354,BT_RS06890,1688236,1689903,-,recN
1736,BT_RS08815,2144331,2147102,+,uvrA
1970,BT_RS10020,2488641,2489246,-,ruvA
3252,BT_RS16515,4162078,4164927,-,polA
3393,BT_RS17225,4381917,4383236,+,nusA
4381,BT_RS22240,5809713,5810804,+,dinB
4580,BT_RS23235,6035657,6036649,-,recA
4585,BT_RS23260,6041039,6042955,+,dnaK


In [26]:
intervals = pd.read_csv('intergenic/b_theta_intergenic.bed', sep='\t', names=['contig', 'start', 'end'])
intervals.replace(to_replace = 0, value = 1, inplace=True)
intervals = intervals[intervals.end-intervals.start >= 20]

In [27]:
intervals

Unnamed: 0,contig,start,end
0,NC_004663.1,1,92
1,NC_004663.1,710,782
2,NC_004663.1,1778,1871
3,NC_004663.1,2648,2782
4,NC_004663.1,3451,3699
...,...,...,...
4416,NC_004663.1,6254778,6254800
4417,NC_004663.1,6255208,6255239
4418,NC_004663.1,6255842,6255870
4419,NC_004663.1,6256884,6256914


In [70]:
def fragment_extract(fasta, contig, start, end):
    """
    extracts genome fragment for each pair of coordinates given
    """
    with open(fasta, 'r', encoding='utf-8') as inp:
        records = list(SeqIO.parse(inp, "fasta"))
    for rec in records:
        if rec.id == contig:
            gen_slice = rec.seq[start-1:end]
    return(gen_slice)

In [80]:
for _,row in gen_borders[gen_borders.gene.isin(gene_list)].iterrows():
    if row.strand == '+':
        coord = int(row.start)-1
        sel_data = intervals[intervals.end == coord]
        while sel_data.empty:
            coord -=1
            sel_data = intervals[intervals.end == coord]
        fin_sequence = fragment_extract('/home/nastya/Bacteroides_uniformis_annotations/genome_foot/b_theta.fasta', sel_data.contig.values[0], sel_data.start.values[0], sel_data.end.values[0])
    if row.strand == '-':
        coord = int(row.end)
        sel_data = intervals[intervals.start == coord]
        while sel_data.empty:
            coord +=1
            sel_data = intervals[intervals.end == coord] 
        sequence = fragment_extract('/home/nastya/Bacteroides_uniformis_annotations/genome_foot/b_theta.fasta', sel_data.contig.values[0], sel_data.start.values[0], sel_data.end.values[0])
        my_dna = Seq(sequence)
        fin_sequence = my_dna.reverse_complement()
    print(f'>{row.gene}\n{fin_sequence}')

>uvrB
GTTCTCTTGGTATTATTAGTTGTATGAATCTGTTTCATGATATTAATATGTTGCAAAGTAAAGGATTTATTTGGAAAAATGATTTGTTATCTCAATTATTTTGCCGAATATTGCTCCACTTTTTAGGATTTAACTA
>uvrA
TCAATTTATAACTATTGGTTCTTGAAATCAAGGAACAAAGGTACGAATAATGCCCTATATCTTCATAAGATTTGAGTTAAGAAAATAATATTGTGCTTTTTTAGGAGCATATTTCAGTCGGAAACATTAATTTTGTACCGTTTATGAAGGATATGAAT
>recN
TCTTTATTTTTTAGGTTTAATTTTTAATTTACAGAAGG
>uvrA
TCCGGATGTTTTTCACGATTCAAGTCAATTTGTTATTATTTTGAAATGTTTGTTGTAAATTCCGGACCAAGATACGCCAATTATCCCGAAGCAATTTCTTTTTATTACCAATTATAACATATCTTAACCCGACCAAAAACCTTTCTTCTATCAATAAAATCGCTAACTTTGCCGACTATCATAAAAAAGGAGTATTTT
>ruvA
TCTTTTATTGGTTTTATATGAACATTTTTCTTTAATTCTGCCACAAAAGTAAACAAATTACTAGGAAGTTTTGTTTACTGCCTGCATTATTTTTAATTTTTTCACCTCTGACTGCGAGGCGGAAATGTTGCGAAAGTCTTTTCATCTGTGGTGAATTTGTCTTATCTTTGCAAGGTATA
>polA
TCACAACACTATTTGAGCACAAAAGTAATAATAAAACGCTTAATACGGTATTTTTTTGTACTTTTACCATGAAAAATTAATTAAATCTA
>nusA
GATATTGGTGTTTGTGCCTGATATACGACTATTAATTCATACTTTTTTTCTTTAATGTTTTGTAGTCGAATAGATTATTTATATCTTTGCAACCGAAATGGATGAAAGGTGGAGGGGCGATTAAGCTCCTTTTTTTGTTCTTATATAGTTAATA

In [None]:
#Combining into 1 function

In [1]:
def gene_upstream_extract(fasta, gff, intergenic_bed, gene_list):
    '''
    '''
    def gene_names(gff):
        '''
        '''
        data = pd.read_csv(gff, comment='#', sep='\t', names=['ref', 'prodigal', 'CDS', 'start', 'end', 'row', 'strand', 'row2', 'annot'], engine='python')
        data = data[data.CDS == 'CDS'][['annot']]
        transc_dict = {}
        for _,row in data.iterrows():
            text=row.annot
            ID = re.compile(r'ID=([A-Za-z0-9_.-]+)') 
            name = re.compile(r'gene=([A-Za-z0-9_-]+)') 
            BT = re.compile(r'Parent=([A-Za-z0-9_-]+)') 
            findid = ID.findall(text.strip())[0]
            findbt = BT.findall(text.strip())[0]
            try:
                findname = name.findall(text.strip())[0]
            except:
                findname = findid
            transc_dict.update({findbt:findname})
        id_name = pd.DataFrame.from_dict(transc_dict, orient='index', columns=['gene'])
        id_name = id_name.reset_index()
        id_name.gene = id_name.gene.str.replace('cds-', '')
        id_name['index'] = id_name['index'].str.replace('gene-', '')
        return id_name
    
    def fragment_extract(fasta, contig, start, end):
        '''
        extracts genome fragment for each pair of coordinates given
        '''
        with open(fasta, 'r', encoding='utf-8') as inp:
            records = list(SeqIO.parse(inp, "fasta"))
        for rec in records:
            if rec.id == contig:
                gen_slice = rec.seq[start-1:end]
        return(gen_slice)

    data = pd.read_csv(gff, comment='#', sep='\t', names=['ref', 'prodigal', 'CDS', 'start', 'end', 'row', 'strand', 'row2', 'annot'], engine='python')
    if not data[data.CDS == 'gene'].empty:
        data = data[data.CDS == 'gene']
    data = data[['start', 'end', 'strand','annot']]
    data = data.reset_index()[['start', 'end', 'strand','annot']]
    col_num = data.annot.str.split(";", expand = True).shape[1]
    cols = ['index'] + [i for i in range(1, col_num)]
    data[cols] = data.annot.str.split(";", expand = True)
    data = data[['index','start', 'end', 'strand']]
    data['index'] = data['index'].str.replace('ID=gene-', '')
    
    gen = gene_names(gff)
    gen_borders = pd.merge(data, gen, on='index', how='left')

    intervals = pd.read_csv(intergenic_bed, sep='\t', names=['contig', 'start', 'end'])
    intervals.replace(to_replace = 0, value = 1, inplace=True)
    intervals = intervals[intervals.end-intervals.start >= 20]
    
    species = str(fasta).split('/')[-1].split('.')[0]
    gene_count = list()
    for _,row in gen_borders[gen_borders.gene.isin(gene_list)].iterrows():
        if row.strand == '+':
            coord = int(row.start)-1
            sel_data = intervals[intervals.end == coord]
            while sel_data.empty:
                coord -=1
                sel_data = intervals[intervals.end == coord]
            fin_sequence = fragment_extract(fasta, sel_data.contig.values[0], sel_data.start.values[0], sel_data.end.values[0])
        if row.strand == '-':
            coord = int(row.end)
            sel_data = intervals[intervals.start == coord]
            while sel_data.empty:
                coord +=1
                sel_data = intervals[intervals.end == coord] 
            sequence = fragment_extract(fasta, sel_data.contig.values[0], sel_data.start.values[0], sel_data.end.values[0])
            my_dna = Seq(sequence)
            fin_sequence = my_dna.reverse_complement()
        gene_count.append(row.gene)
        count_num = gene_count.count(row.gene)
        if count_num > 1:
            gene_name = row.gene + str(count_num)
        else:
            gene_name = row.gene
        print(f'>{gene_name}_{species}\n{fin_sequence}')

In [139]:
gene_upstream_extract('/home/nastya/Bacteroides_uniformis_annotations/genome_foot/b_theta.fasta', 'b_theta.gff3', 'intergenic/b_theta_intergenic.bed', gene_list)

>uvrB_b_theta
GTTCTCTTGGTATTATTAGTTGTATGAATCTGTTTCATGATATTAATATGTTGCAAAGTAAAGGATTTATTTGGAAAAATGATTTGTTATCTCAATTATTTTGCCGAATATTGCTCCACTTTTTAGGATTTAACTA
>uvrA_b_theta
TCAATTTATAACTATTGGTTCTTGAAATCAAGGAACAAAGGTACGAATAATGCCCTATATCTTCATAAGATTTGAGTTAAGAAAATAATATTGTGCTTTTTTAGGAGCATATTTCAGTCGGAAACATTAATTTTGTACCGTTTATGAAGGATATGAAT
>recN_b_theta
TCTTTATTTTTTAGGTTTAATTTTTAATTTACAGAAGG
>uvrA_b_theta
TCCGGATGTTTTTCACGATTCAAGTCAATTTGTTATTATTTTGAAATGTTTGTTGTAAATTCCGGACCAAGATACGCCAATTATCCCGAAGCAATTTCTTTTTATTACCAATTATAACATATCTTAACCCGACCAAAAACCTTTCTTCTATCAATAAAATCGCTAACTTTGCCGACTATCATAAAAAAGGAGTATTTT
>ruvA_b_theta
TCTTTTATTGGTTTTATATGAACATTTTTCTTTAATTCTGCCACAAAAGTAAACAAATTACTAGGAAGTTTTGTTTACTGCCTGCATTATTTTTAATTTTTTCACCTCTGACTGCGAGGCGGAAATGTTGCGAAAGTCTTTTCATCTGTGGTGAATTTGTCTTATCTTTGCAAGGTATA
>polA_b_theta
TCACAACACTATTTGAGCACAAAAGTAATAATAAAACGCTTAATACGGTATTTTTTTGTACTTTTACCATGAAAAATTAATTAAATCTA
>nusA_b_theta
GATATTGGTGTTTGTGCCTGATATACGACTATTAATTCATACTTTTTTTCTTTAATGTTTTGTAGTCGAATAGATTATTTATATCTTTGCAACCGAAA

In [141]:
gene_upstream_extract('faecis.fasta','faecis.gff3', 'intergenic/faecis_intergenic.bed', gene_list)

>polA_faecis
TCATAACACTATTTGAGCACAAAAGTAATAATAAAACGCTTAATACGGTATTTTTTTGTACTTTTACCATGAAAAATTAATTAAATCTA
>nusA_faecis
TGGTTCCTGTTTTTTTAATTTGCGGTGTAAAGATACGGATATACGACTGTTAATTCATACTTTTTTTCTTTAATGTTTTGTAGTCGAATAGATTATTTATATCTTTGCAACCGAAATGGATGAAAGGTGGAGGGGCGATTAAGCTCCTTTTTTTGTTCTTATATAGTTAATAA
>dinB_faecis
TAATACTAAATTTGTTTTATGGTTTGTACCTCAAATGTTTTCCTGTTGACAGATTATGGATAACAATATACAAATATACTATTATTAGAAATGGGATCGCAATGATATCGAATCTTTTTTATGACCTGTGTCTCTAAAAAAATAAATGCATATTATTCTAGAGATCTATTTATTCGGAAAATTGTAGTAAGTTTGCTCTGATAATCGAAGATA
>recA_faecis
ATCCATCAATTAGAAGAATAAGGTA
>dnaK_faecis
GACAAAAAAGAGGAACGACTATTTTATATTAAAAAAAGAAGGGGATGTGCCTTAGTTTTGACACAGCCCCATATTTTTTTATTCCTTTCTGATGAATTCGTTTTCTTTATTGAACCAGAGCCGTTTCCATTTGTTTAAAGAAGCAGTTGCTGCCCGGCTTCTGTATGACTCTGCCAGCAGTAATTTAGATTTGAAACTGTCGTTTTGGCAGAAAATATGTCATGTATCCGTCTGATTTCTTTTTGGCACATGGTTTGTTTTTTAATAAGCGTCCGTTTGAAAGCTCTTACAAAGAGAAAGAAAACAACCGGACAAAAAGAAATTGAATAATAATAAATAAAAAGAATAACGATC
>uvrA_faecis
CAATTTATAACTATTGGTTCTTGAAATCAAGGAACAAAGGTACGAATAATGCCCTATATCTTC

In [142]:
gene_upstream_extract('b_uni_CL.fasta','b_uni_CL.gff3', 'intergenic/b_uni_CL_intergenic.bed', gene_list)

>uvrA_b_uni_CL
GTTCGGACATTAAAAGGAGTCGGATGCGAATTTAGTGCTTTTATTTAATGTAAATTAGCAGTCATCCTAAAGTATAACAAATCTTAACCTTGCCTATCTGATACGATTTTTCCAGCAGGTTCGTTTTATTCTACCCCGGTTTGCATTATCTTTGCAAATCCTAAATCACATAAGTGACTTTTATA
>uvrB_b_uni_CL
GTTCTTCGTTGTATTACAATTATTAATGGTTGTTATCAATATAGCTGCAAAGTAAACAAAAAAATTTTGTTTATCTCAAAGATTTTACCGAATATTGCCGCTTCTAAGAAGAGAAAA
>uvrA_b_uni_CL
TTCAGTAATCGTTGGTTTATGTTTACAATTTCCCAAAGTAGCTGCAAAGGTACGAATATTGTGGCATTTCTGCCAAAGGATTGCGTTAAGAAATAATATTGTGCATTTCAAAACTGAAAAATAGCGCTTTTCGGCTGATATTCGGGGGGAAACAGTTATTTTTGCAGTTAAAACGATAGGATAAATCAAC
>recN_b_uni_CL
TAAATCGTCAAATCGTCAATATCCTTA
>dnaK_b_uni_CL
ACTCATCTTATTTATTTCTTTAATATGTGAAAACCAATCATTTCCGTTGCTACGATTAAGCCCAATGTTAACTAATCCTTAATCGTTCCACCATGGTGGAACGATATTTCCCCCGTGGTGGAATGATATTTTCTCCACGCTGGAACAATATTTCCTCCATGGAGGAACGGTTAAAGATATGCAAGGAAAATGATTAATGTTGTTCATGTTTTATTATCTCTGATTCGGGCGACAAAAGTAGAGCTTTTGAATGGAATGGAAATTAACAAATGGTAATTTCCGACAGAAAAAATGATTTTAATTCTGCCAAACTGACAGACAAACTGCCACAAGTATAAATCTGCGGCTGCTCTGAACAATTTGGCACAACGTTTGTCTTTTTATTGGCGTCTGCCACTGAGG

In [143]:
gene_upstream_extract('p_actiniarum.fasta','p_actiniarum.gff3', 'intergenic/p_actiniarum_intergenic.bed', gene_list)

>uvrA_p_actiniarum
TGGTAAAAGTTATTATTTGGCGCAATATATTAAATAAGAAATACAGCAATAGTGTTATTTATTAAAATTTAAACGCCCTTGCTCCCGGCCTCTCAACCATGGGCACACGCACCGGAAACTCACGCCAAAGCGTTCCTCGAACCGGCTACAGCCCAAGCCCTGTTTTATACTTGGTATGAAGCGGTTAGAGGAAGGCTGTTCCTGCCAGGCGGTCCTGAGCGGCGGAGTGTTGCAGCGGTAGCCGGCCTGCCGCAGGTGCTTGCAATTTACCCTCAGAACTCGCACAGGAGGGAGATTTGCCCTTGCCTGTTCCGGCGGGCCGCATAAATCTGCTTATATTCGTGCA
>dinB_p_actiniarum
TCTTATTGTCTTTTTGGTGTGAGGTCGAATATACGCAGCAAGCGGCAGGAAGTACAGCCCTGTGGCAGCATTGATAGAGGCAAAGGCTATACTTCCTGCTGCAGTTTGCCTACCTTTGCACCTCAACCTAAGCCATT
>dnaK_p_actiniarum
GCAGCACGGCAGTGCGGCTGGCAGAGCCCCGCGACCGGCTCAGAAAATATGTCGCAGTGCGAGCGCGTACTTTCCGACAAGCCGTTCCGTGTCCGGCCACTGGTGCCCCTCTCCTGACATAATGACATAATCTCCCCTATATTTCCCCCTTGGCACATACCTTGTTAAAACAGTAGTAACCATAAAAAGAATTTAAATCAGCAACTATAAATAAAACTATAGA
>recN_p_actiniarum
CGCCTACTGGTGCAGCCGCCGGCCAACCTTACCCCAGAATTACCTGTAAATCCCATACGTCAAAAAAAGAACAGAATTACCGAACTTTAGCGTAGGCCAAAAGTCTTCTTATGTCTATAGTCTTGTGTCATAGTATA
>nusA_p_actiniarum
CAAAAGCAAACTTACGAATTATGCACCTATCTCTACCATGTCCGGAGGCAGAATATCT

In [144]:
gene_upstream_extract('b_cellulosilyticus.fasta','b_cellulosilyticus.gff3', 'intergenic/b_cellulosilyticus_intergenic.bed', gene_list)

>polA_b_cellulosilyticus
GCATCGCTATTTTCCCACAAAAATAGTAATAAAACACTTAATACGGTATTTTTTTGTACTTTTACCATGAAAAAAAGGATAAATATA
>nusA_b_cellulosilyticus
AAAATATATAAAAGTTCAGGTACATTTTACCTCCTATGGATAGAGCCAATAGTTCCCTTATTTCGAACTATTGGCTTTCTAGTATGCAGAAAGATAGTAAAAAGCCTGTATTACTTACGCTCCGTTTTTATAAAAGAGTGGAAATAGCATCTTGTTCTGTCTTCTAAAGGAATATAAATACGGAATATTGAATAATATACTGTCGTTATACAGACTTTATGCAAGATAACTATGCATCTTACGCCTCAAAACTATATATCTAATGTCTGCTATCTATATATCTAACGCTTACGATCTATATATCTAATACTTGCCATCTATATATGGAACGCCTACGATCCATATATCCAATGCAAACCAACAGTTTATGGTTGCCAAACCGCGTCTGTTTTATTTTTCAGCATTGTCCGTATTTATGTATCCCTCTGTCAGTCTGCCGGATACACTTTTTTACTTGCGTATTTCCATTCAGACCGGCGGAAGGACGGATATACGGCATTCTCCTTATGCATAAAATACAAAATGTCAGAATCGACCTTGCCATATCGAAGCGGTTTTACAGAAGTCATTTGCATTACTTTCTTCTTTCAGGAGCGTTAAATCATTCTTTTTTTCTCTAATATTTTGTAGTCGAATAGATTATTTATATCTTTGCAACCGCAATGGATGAAAGGTGGAGGGGCAATTAAGCTCCTTTTTTTGTTCTTATATAGTTAATCA
>ruvA_b_cellulosilyticus
TCTTTTATTGAATTTGTTTATCAGTTTTCTATTTTATCGGCACAAAAGTAATCATTTTCATTCAATTTAACCACAAAACGATTCAGTGTTCCACAGAGTTTTTTCTTTATTAAGAC

In [None]:
# lexA time
CL WP_005825301.1
theta1 WP_008767434.1
theta2 WP_011108539.1
faecis1 WP_055270926.1
faecis2 WP_010535945.1
ponti WP_025606010.1
B cellulosilyticus 

In [145]:
gene_upstream_extract('/home/nastya/Bacteroides_uniformis_annotations/genome_foot/b_theta.fasta', 'b_theta.gff3', 'intergenic/b_theta_intergenic.bed', ['WP_008767434.1','WP_011108539.1'])

>WP_011108539.1_b_theta
TTCCGGTTATTCTTTTTGATATACTTGATATGTTTTCAGATAAAAGGTAAGGCAGCCTGCGCACAGGAGGTTCACATCCTACACAATGATCATTGGCAACTTGCCCGTCAGATGTTAACAGCCATCACGTCAGCTGTTAATACCCAATATAACAACTGTTAACTCCTGACAAATCAACTATTAACAGCTGACAGCTCTCTTTTCGGAGGGGGAGAATGCGGTTTTTGAAAGTATTTCGCATCTTTTTCTTGTTTTCTCATGGTTTATCTAATTTTTATTTCTAATTTTGAAATCAAATATTCATTTCGAATACCACAAAGGTATAGTAAGTCGCATATATAAAACAAATAATTTTAGATATTAACGCTGAATTATTTTACAAAAGACTGTATATGAGGCGATAAGGCAAGAATAAATATTAACATATAAAAAAGTAAGCAAAAAC
>WP_008767434.1_b_theta
ATTGATTTTGCGTTTTAAAGTTTTCATGCTTAAATATTTGTTTATGTGATTATCGGACATATAGTTCCTGCATATTGGGGATATATGGCCAGCGTATTGGATCTATAGCTCCTGTATGCAGGATATATAGATGCTGTAAACCGACTCAATATCACTGGTGTTCATCCGATACAGAATCTTCCGGAAGGGGGTGGATGAGGCATTTTACAACCTTTCATAAACTTTTTGCCGTTTTTCATGGTTATTCTATTTTTAATTTATATTTTTGAAACATATTTCTAAGTTTGCTGCTACAAAGATAAAGCAGTTTTCTCACATAAAACAAACGGTTTTAGAAATTAACGCTAAAAAATTATATAACTGACTATCAATGAACGAATAAGAAGTTCCAACATCTTTACACTAATTTAAAACAAGAATAAAA


In [146]:
gene_upstream_extract('faecis.fasta','faecis.gff3', 'intergenic/faecis_intergenic.bed', ['WP_055270926.1', 'WP_010535945.1'])

>WP_055270926.1_faecis
TTTTGGTTATTCTTTTGATGTAATTGATATGTTTTCAGATGAAAGGCAAGGGAGCCTGCATGCAAGAAGTTCACATCCTGCACAATGACTACCGGCAATCCGGCTGCCAGATGTTAACAATTATCATGTCAGTTATTAATATCCAATATGACAACTATTAACTCCTGACAAGTCAGCTATTAACAGCTGACAGACTCCTTTTTACATGGAGGAGAATGCGATTTTGGAGAGTCTTTCGCATCTTTTTCTCGTTTTCTCATGGTTTATCTAATTTTTATTCCTACTTTTGAAACAAAATACTCATTTTGAATACCACAAAGATAAAGAAAGTCGCATATATAAAACAATTAATTTTAGATATTAACGCTGAATTATTTTGCAAAAGACTGTATATGAGGCAGTAACATAAGAATAAATATTAACAAATAAAATAATAAACAAAAAC
>WP_010535945.1_faecis
GTTATCCATAATCAAAATATTACATAATGTTGACGGTGTGTATGTTTGTATGTTCCCCTATAACAGCTATTTAGAATGATACCGGTCAATTTGCCGGCAAGTACCGGTCAGGCTATTGGCCGGTACCGGTCAAGCTGCCGAATGGTACCCTCCGACAGGTTGGCAAGTACCATTCATCCAGGCGATGTGTACCTTGTCTAAGAGCTATTGAAAACAGCCATTTTACAATTTTCTATAAAATTCTCCTTCATATTCATAGTTATTACCTAATTTATGCCTATTTTTGAAACATATTAAATTTTAAATCGTCACAAATATAATACATGTTTTATATATAAAACAAATAGTTTTTAAAATTAAGAATGTTAAAACAATAAATAAACTGAGGATCAATATAATATAAACATTATTCACACCACTAAAAATATAACAAGAATAATA


In [147]:
gene_upstream_extract('b_uni_CL.fasta','b_uni_CL.gff3', 'intergenic/b_uni_CL_intergenic.bed', ['WP_005825301.1'])

>WP_005825301.1_b_uni_CL
TAAATCCCTCCCTTTTATTTGCATTTATAAAACATGATATGTATTTTTGTCCCAACAAAGATAATGCTGTTTTCATTTATAAAACAAACAAATCGGGAATTAAATTCTTAAATAAGACGTAAGACAGTATAAATGAGGAGATAATAGTTTCTGAATAATTTACACGTAAAAACTGAAGAAT


In [183]:
gene_upstream_extract('p_actiniarum.fasta','p_actiniarum.gff3', 'intergenic/p_actiniarum_intergenic.bed', ['WP_025606010.1'])

>WP_025606010.1_p_actiniarum
AGCTGAAATTAATTTTAGGGGGTTTAAAGTTTTTAGTGTATCGCGTAACTTTTTCTTAAAGAATACGTAACCGGGTGCTTATTCAAAAGGTTCACACGCACCTGCTTATTTTCAAAAATTTTGGCGCTTCGACTGTGAAATGTACAAACCACAGGCTTTGTAAGCTGTAGGGCATACTTTTCCCCTGTAATCCGGAGGAAAAACGTCTGTAATTGCACAAAATGCCAGATAGTGAAATTAAAGGAGAGTGTAGCTGTTTTGCATATATTCGTATTCGTTAAAGTAGTATGGCGTAAAAGTGTGCTTATGTGAGCATAAATGCAAGAAAAGGTTCGGTGTTGTGCAAGTATAATGTTGTACTTAATAAAGAATATTGAATTTTATTGGAAGGGTTAAATTTTACTACCGTCTAAGTGATATAGGAATATACTGTGGTTTTAAAATATTGCTAATAATATTAGTATAAAAGCTAAAGTTTAATTAGTTTTACAGGCGTGCAGCTATACTGTGGCTGCAGCATCAGAAAACCTAAAGAAGCAGATA


In [176]:
gene_list = ['ahpF',
 'BACUNI_03743',
 'BACUNI_03652',
 'BACUNI_03643',
 'BACUNI_03512',
 'BACUNI_03481',
 'BACUNI_03391',
 'BACUNI_03387',
 'BACUNI_03182',
 'BACUNI_03757',
 'BACUNI_03083',
 'BACUNI_02788',
 'BACUNI_02748',
 'BACUNI_02713',
 'BACUNI_02648',
 'BACUNI_02645',
 'BACUNI_02644',
 'BACUNI_02643',
 'BACUNI_02460',
 'BACUNI_02789',
 'BACUNI_03818',
 'BACUNI_03819',
 'BACUNI_03859',
 'otsB',
 'mraZ',
 'grpE',
 'groS',
 'groL',
 'dnaK',
 'clpB',
 'bc2018--bc2018___80_02675',
 'bc2018--bc2018___80_02478',
 'bc2018--bc2018___80_01083',
 'bc2018--bc2018___80_00697',
 'BACUNI_04692',
 'BACUNI_04593',
 'BACUNI_04543',
 'BACUNI_04535',
 'BACUNI_04263',
 'BACUNI_04043',
 'BACUNI_04040',
 'BACUNI_03980',
 'BACUNI_02455',
 'BACUNI_02454',
 'BACUNI_03319',
 'BACUNI_02222',
 'BACUNI_00850',
 'BACUNI_00770',
 'BACUNI_00769',
 'BACUNI_00754']

In [177]:
fasta = '/home/nastya/Bacteroides_uniformis_annotations/metatranscriptomics/b_uni_type_4only.fa'
gff = '/home/nastya/Bacteroides_uniformis_annotations/metatranscriptomics/b_uni_type_4only.gff3'
intergenic_bed = 'intergenic/b_uni_type_intergenic.bed'
gen= pd.read_csv('egg_gene_names.tsv', sep='\t')

In [194]:
data = pd.read_csv(gff, comment='#', sep='\t', names=['ref', 'prodigal', 'CDS', 'start', 'end', 'row', 'strand', 'row2', 'annot'], engine='python')
if not data[data.CDS == 'gene'].empty:
    data = data[data.CDS == 'gene']
data = data[['start', 'end', 'strand','annot']]
data = data.reset_index()[['start', 'end', 'strand','annot']]
col_num = data.annot.str.split(";", expand = True).shape[1]
cols = ['index'] + [i for i in range(1, col_num)]
data[cols] = data.annot.str.split(";", expand = True)
data = data[['index','start', 'end', 'strand']]
data['index'] = data['index'].str.replace('ID=gene-', '')

In [195]:
data

Unnamed: 0,index,start,end,strand
0,ID=contig_4_ncRNA1,1642844,1645710,+
1,ID=contig_4_ncRNA2,3830278,3833144,-
2,ID=contig_4_ncRNA3,490503,493369,+
3,ID=contig_4_ncRNA4,3136801,3139667,-
4,ID=contig_4_ncRNA5,1640830,1642352,+
...,...,...,...,...
3877,ID=bc2018--bc2018___80_03908,4678480,4681635,+
3878,ID=bc2018--bc2018___80_03909,4681632,4683050,+
3879,ID=bc2018--bc2018___80_03910,4683091,4684353,+
3880,ID=bc2018--bc2018___80_03911,4684812,4687811,-


In [196]:
data['index'] = data['index'].str.replace('ID=', '')

In [197]:
data

Unnamed: 0,index,start,end,strand
0,contig_4_ncRNA1,1642844,1645710,+
1,contig_4_ncRNA2,3830278,3833144,-
2,contig_4_ncRNA3,490503,493369,+
3,contig_4_ncRNA4,3136801,3139667,-
4,contig_4_ncRNA5,1640830,1642352,+
...,...,...,...,...
3877,bc2018--bc2018___80_03908,4678480,4681635,+
3878,bc2018--bc2018___80_03909,4681632,4683050,+
3879,bc2018--bc2018___80_03910,4683091,4684353,+
3880,bc2018--bc2018___80_03911,4684812,4687811,-


In [193]:
gen

Unnamed: 0,index,Gene
0,bc2018--bc2018___80_00001,bc2018--bc2018___80_00001
1,bc2018--bc2018___80_00002,BACUNI_00026
2,bc2018--bc2018___80_00003,BACUNI_00025
3,bc2018--bc2018___80_00004,BACUNI_00020
4,bc2018--bc2018___80_00005,BACUNI_00054
...,...,...
3890,bc2018--bc2018___80_03908,BACUNI_04008
3891,bc2018--bc2018___80_03909,BACUNI_04007
3892,bc2018--bc2018___80_03910,BACUNI_04006
3893,bc2018--bc2018___80_03911,secD


In [198]:
gen_borders = pd.merge(data, gen, on='index', how='left')
gen_borders

Unnamed: 0,index,start,end,strand,Gene
0,contig_4_ncRNA1,1642844,1645710,+,contig_4_ncRNA1
1,contig_4_ncRNA2,3830278,3833144,-,contig_4_ncRNA2
2,contig_4_ncRNA3,490503,493369,+,contig_4_ncRNA3
3,contig_4_ncRNA4,3136801,3139667,-,contig_4_ncRNA4
4,contig_4_ncRNA5,1640830,1642352,+,contig_4_ncRNA5
...,...,...,...,...,...
3877,bc2018--bc2018___80_03908,4678480,4681635,+,BACUNI_04008
3878,bc2018--bc2018___80_03909,4681632,4683050,+,BACUNI_04007
3879,bc2018--bc2018___80_03910,4683091,4684353,+,BACUNI_04006
3880,bc2018--bc2018___80_03911,4684812,4687811,-,secD


In [199]:
intervals = pd.read_csv(intergenic_bed, sep='\t', names=['contig', 'start', 'end'])
intervals.replace(to_replace = 0, value = 1, inplace=True)
intervals = intervals[intervals.end-intervals.start >= 20]

In [200]:
intervals = intervals[intervals['contig']=='contig_4']
intervals

Unnamed: 0,contig,start,end
12,contig_4,1,276
13,contig_4,978,1037
14,contig_4,2138,2230
15,contig_4,3785,4031
16,contig_4,4298,4453
...,...,...,...
3434,contig_4,4677270,4677418
3435,contig_4,4683050,4683090
3436,contig_4,4684353,4684811
3437,contig_4,4687811,4687894


In [202]:
def fragment_extract(fasta, contig, start, end):
    '''
    extracts genome fragment for each pair of coordinates given
    '''
    with open(fasta, 'r', encoding='utf-8') as inp:
        records = list(SeqIO.parse(inp, "fasta"))
    for rec in records:
        if rec.id == contig:
            gen_slice = rec.seq[start-1:end]
    return(gen_slice)

In [205]:
species = str(fasta).split('/')[-1].split('.')[0]
for _,row in gen_borders[gen_borders.Gene.isin(gene_list)].iterrows():
    if row.strand == '+':
        coord = int(row.start)-1
        sel_data = intervals[intervals.end == coord]
        while sel_data.empty:
            coord -=1
            sel_data = intervals[intervals.end == coord]
        fin_sequence = fragment_extract(fasta, sel_data.contig.values[0], sel_data.start.values[0], sel_data.end.values[0])
    if row.strand == '-':
        coord = int(row.end)
        sel_data = intervals[intervals.start == coord]
        while sel_data.empty:
            coord +=1
            sel_data = intervals[intervals.end == coord] 
        sequence = fragment_extract(fasta, sel_data.contig.values[0], sel_data.start.values[0], sel_data.end.values[0])
        my_dna = Seq(sequence)
        fin_sequence = my_dna.reverse_complement()
    print(f'>{row.Gene}_{species}\n{fin_sequence}')

>BACUNI_03980_b_uni_type_4only
AGAGGAAGAGCGAAAAGGTTCGTCAATTAGGCGAACCTTTTTTGTTTTCCCACATTATCCCACACAAAAGACAAACATTTCTATTTCATATTCAAAGATTTAACATACATTTGCCAAATGTAAAGTAAGGAGTTTCATAAAAGCACCTGTCTTTATTTATTAGAAAAACATATTTTAATCCAAACACCAAACTACAA
>mraZ_b_uni_type_4only
AAAAACATTTATCCCCCCTGGATTCTGTGGATGAATGTAATAATCAACCACTCCAGGGGGAATATTATCACTCCTCCCTCCCATCATCTTCCGGTAAGTAGTAACGTTCACAAAATAGACGGGAAGTTAACAAAAAAAGTTTCAGAAGTTAATACTAAAACAAGAAAGAGTTAAGAAAGAGCTTTATCCCAAACTTTGGTTTTTCCAAACCAAGCCTGGGGATAGAGAAACCAAGTCTTGGTTTAGATATCCGCAAAGCTGGGAAAGAAGTTTCCCACCCAAGACACCATTCCACCAACAAGCGACTTTCAAACCAAAGACCGTCATCCTCAACCGAATCTTTTCAAAAGAGACTACACGAATCCCCCACCAGCAAAGCGTTTCAAAAGCAGCTTCATTATCAAATGAATAAGCTATCAACAAGGTGTTGAAAACTTCTTCCTAAGATTTTAGTTAAAAGATTGTGGGGAATTGTGGGGAAATGTGTATTTTTACCGTCGCTTATTATAATAAGGTGAGAAA
>BACUNI_03859_b_uni_type_4only
GAGAATACGGCAACCTAAGAATTGTTTATTAATTGGCCGGATAAAAAAGTAAATATGCTTATCTTTGGGCAATCATGGGACACAAACCAAGATACCTAAAGAAAAACCGAGTAATTAATAAACAACA
>BACUNI_03819_b_uni_type_4only
AGCGGCTACAAATCATTCAATCATAATAAACTGAT

>groL_b_uni_type_4only
TTTGTAAATTGTAAATCGTTAAAATGTAAATATAGTTA
>groS_b_uni_type_4only
AAGTCAAATCATTTTTGAGGCTGCAAAGATAGCAAATATTGTTGTATACTTTGCCATTTTGTCAGCCTTTGTCTGCCAAACAATCAGCTTGTTCGTTTGGCACGGTTTTCGTAAGTTGTTTATCGCTTGTTTAAAGCAAAATTACCATTATAACAAATGTATAACATATAAAACAAAAAGATTA
>bc2018--bc2018___80_02675_b_uni_type_4only
CCACCAAAAAGCTTTTCAATTTTAGAAAACCCTTGAATTAGAGATAGTTCAAGGGTTCTTTTTTTACCCTCGCTACGCATTTCATTACATATTTGTGAAGTTGGATTTTGCCAGTTCAGTGGCTTTTTTTAAGGCTTTTGAAAAAGCCATAAATATGTATCATACTTCATTGTTTTTCAACTGTTTGACAATTTCAATCTCCGGTGTGAAAAAGTAATTTTGCACACATGACCGGATGGAGTTGAAAATGTGAAATAAATTTAAAAATTCGGTATTTCCCATCGGGATAAAAAAAGGAATCGAACCCGGGATACAAAAGGCAATTCGAACCTGATAAAAATGGGATTTTTATGGCTATTCCACAAGTCCCTCAAAAAAGCCACCGAATTTAACATTTCGTTTCACGTCCTCATGCCATCCGGTTTTGACGAAACAATTGGATGTAACACCTCTAAAAAAACAAAAGTG
>BACUNI_02643_b_uni_type_4only
AGAATAGTAATAATATAGTAAGAAAAGGGTTGCCTCGAAAATAAGTTGATATATCATTTTGTTATTTACTATCTGTATTATAGGTGTTTTTCTTGAATCAGGTGGAAATAATTATTTACTTCGTTAGTAATATTGAGCAAAATGATATATCAACTTGTTTTCGGGGTAACTCTTTTTTTTAGGATT

In [225]:
gene_list = pd.read_csv('/home/nastya/Downloads/table1.csv').wp_gene.to_list()

In [234]:
gene_upstream_extract('/home/nastya/Bacteroides_uniformis_annotations/genome_foot/b_theta.fasta', 'b_theta.gff3', 'intergenic/b_theta_intergenic.bed', gene_list)

>WP_011107050.1_b_theta
TAGGTGCAGGTACAACTTTTGACCACTGTCAGGCATACTACGGTGAAGACGATGCATTCGAATTCTTCGGTGGTACAATAAATGCCAAGTAT
>traM_b_theta
TCAATTAACAATTTAAAATTCATTCAGTA
>traJ_b_theta
TCAATTAACAATTTAAAATTCATTCAGTA
>WP_011107110.1_b_theta
TATTGCTATCTTTCTTAAATCCGGGACAAATAAACATAGAAAAACAACCCCTTTCGAGATGTGTCCTCAAATGTCCCCGTTTGTCCTCATTTGTCCTATAACCCGCTATTTCACAGCATTTTTAATACCCGTTACTTTGCAACCGAAAGGTATCGGTCAGGCTAACCAAGCCCCCGCCGCCGGGAGCGTTCCAATATCCGCTAACTCCAATCCGTCCGTAGGCGGATTTTTATTTGCACTGGCAAATAGCAAGGTGTGTTCTAAGCAGCTTGAAATTCTTTCAGCAGCTTTGAACGCCTTGCCCGGACGAAGCCGGGATAGGGTCACTCCGAAGTCGTAACCCTTTAAAGAAATGAAAC
>WP_008766240.1_b_theta
ATTTTCTTTACTTTTGGGCACAAAGGTATTGTTATGATGTTACAAACGTATGTATTTTTTCTGAAATAAAAAAATAACCTTCTTTCTTATTGAGTTTCCGGCCGAGAACTTTGTGTACTCTTTAATTGTTGTCCTATTATAAAAGTGCGAACAAAGCACACAACAAAGTTCTATAAAAAAGAAAGGAAGATTA
>WP_008760514.1_b_theta
AAAATGTTAGATTACTAATATAATAATGTGTAAGCCTATTTGGTTTTGAAATATCCCTGATTCATAAAGTGCTACTAATTTGATACTTATGAATAAATATAAATATTATTTAAGAAAATGAAGTATTGGTTGATTTATGATTTATACTTTTGCTTTTGGCATTTCAGAA

>WP_011107545.1_b_theta
CATTACTTGTATCTTTCATAAAAGCCTTATGAAATGGTGATGCAAAGGTAAACATAATTTTTATATCTCAAATAAATTTCTTCAAATTTTAGTAGAGGAAAATCTATATTATAATTTCTATTCCAGATGAAAAAAAGAATCGGATATGAAAAATCGTATATTCGATTGCTGTATAATTTAAAATATACAGATAGAAATAAAATGATTCGATAAAATCCTCTGGATTTCTAATCAGATATTTTCAGAACTTAATATTATTGGATTTACTTCTTGTCAATATGTGCGATTTTATCAGTTTATGCTAACGATAAGTTAATTGTATGCTTACTAACTGATTCAATCTGAAAGATTAAAATGATTGAGATATTAGTTGTATCCGGAACAAATAATCTGTTTCACTTGTGTTTTATTCATTCCTTTTAAGGGAGCAGAACGTTCCTCAACGACCAATGTCCCGTTGGTTTAAAACCAAGGAGTCGTTCCTTTAAAAGGAAATAGCTCAAATAACGTTCTTTTTCTGTATAATCCGATAATATATATGCAAAAATATATTGAAATTCTGTATGTTATTACGCGTTTTTATGCATAATTTCTGCTGTTTGATGAAATATGTATGGGGCTATTCTTTGTGTAATCTTAAATATAGAACATTTTTTTAAGATTCTTTCTCTGTTTGTTAAAGTGAACAAACCCGTCCTCTCATCTGTTAGAAAGTCTCAATTTTAGATTTAACCTAACCAGC
>WP_011107546.1_b_theta
CATTACTTGTATCTTTCATAAAAGCCTTATGAAATGGTGATGCAAAGGTAAACATAATTTTTATATCTCAAATAAATTTCTTCAAATTTTAGTAGAGGAAAATCTATATTATAATTTCTATTCCAGATGAAAAAAAGAATCGGATATGAAAAATCGTATATTCGATTGCTGTATAATTTAAAATATACAGATAGAAATAAAATGATTCG

>WP_011107881.1_b_theta
ACAACCTTACATGTCACAATA
>ccsA2_b_theta
ATAAAAAGGTCCTTAACTTCGAAAATATCTTAGGTTTAGGAAGTAGCTATACACTTCTGAAGAGTAGGTATGTCTATATACAAACTTACCCAAGGATACGTTAATGTTCGTAAACTTGGGTAAGATTTTATAGATATTGTTTGGAGAGGACTTAGAATTCACCTTCTTTATTTTTGGTATTACAGATTTCACTTTCCTATATCAATCAGCTTTTTATCAGACAGCTTCAATTTTATAAATCTCCACTTTCCACTACCTCTTGTATCGACTACTCCCGCTATGGTAAGTCCAATATAAAACACTGCGTTTTACAAGACATAAGGCTATGCTTTAGGGAGTGAAAAGCATAGCTTTATACCTTGTAAAGCGTAGCTTTAGCTTCCATATCCTCTCTCCTTTCACCAACAACTACACTGAAGGATTACCCTAAACTCATATTAATATCATTATCTACTTGTAACGATTACAATTTTATAAAGATTTTATGTGAATATTCATTTTTATTTCTTTACTTTGCAAAAGATAACAAAACAGAAGCACGAAT
>WP_008767913.1_b_theta
GAATATTGAAGGACAGTGAAATGTATTTCACTGTCCTTTTCCTTCCCTTAATCTGACAGTTATGAAAACAATTACGATTCTTCCTCTCTTATTATGTCCGTTTCCTGCCTAAGTAACAATCATTCTCGAAAAGACATTATAATCCTCTTGAACGT
>WP_008762107.1_b_theta
AAGAAATGCTTAGAGAGTAGCAGTGGGACTGGCGGTCTTGATTTCTAGTAAATCGGACGCCAGTCTTTTTTTTAAACCAAACGTAGGTTTTCACTGTTATCATTCTATATAAATGATATATAAAGAAAGAACTTA
>WP_011107916.1_b_theta
AATAATGGGAAACGATGAAATTATTCATTA

>kdpB_b_theta
TTAGAGAAGCCCTTCCGGTAGAAATACCAGAAGGGCTTTATTGTGTTTGATACTTTCGGAATTTAAGTACACATCATCCACATACTTCAAATCTCCTGTTCTGCTAAAAAATAAACGACCGGACAATCTTTTATAATACTATTTTTCAGTATCCTTTTCAGCCTCTTTTTCCTTCAATTTCATTCTTTTTCCTTCTGTTTTATTTAAAATCCCATATTCTGCCAAGACTAAATACCCTTTCAAAATTGAACTACTTTTTCAAAATGGAAAGGTCGACTCTGTTATTATATATTTCCTTCTTCTTTATATATGTTTGATAATCAATTTATTGCCATTTCTTTTAGAGTTTTGGCACGCATTTGGCATAATACATTGCAAGTATTATAATAATAAATATTTAAAAGATTAAAGACATGGAGATAACACTATCATTTGTTTTTTGCCTCTTGAGCGGCTTTGCCTGCTTCTGGCTATTTTGGAAATGCGTAGACTGGTTTGAAAAGATCTAGAAGAGGAAAGGAGAATTAATTATGTACACAACACTATTTGTATTAGGTATTGCAATCTTCGGTTATTTGATGTATGTACTCATCAGACCCGAAAAGTTTTAAAGTTATACAGTTTAAGGTAAAAGTA
>kdpA_b_theta
TTAGAGAAGCCCTTCCGGTAGAAATACCAGAAGGGCTTTATTGTGTTTGATACTTTCGGAATTTAAGTACACATCATCCACATACTTCAAATCTCCTGTTCTGCTAAAAAATAAACGACCGGACAATCTTTTATAATACTATTTTTCAGTATCCTTTTCAGCCTCTTTTTCCTTCAATTTCATTCTTTTTCCTTCTGTTTTATTTAAAATCCCATATTCTGCCAAGACTAAATACCCTTTCAAAATTGAACTACTTTTTCAAAATGGAAAGGTCGACTCTGTTATTATATATTTCCTTCTTCTTTATATATGTTTGATAATCAATTTATTGCCAT

>WP_008767187.1_b_theta
TTTGTTAAACGTTAAATCTGAGAACTTA
>WP_008767188.1_b_theta
TTTGTTAAACGTTAAATCTGAGAACTTA
>WP_011108530.1_b_theta
TGATAGTGTTCTGGCAATTATCGAAAGTCTTTCTCTTTTAAGATGACTAAGTTTGTATCATAATTAAAAAATGAATAATGCTA
>WP_225011830.1_b_theta
TGCGAAGCAACAACTGAAGAAGATTTTTAGTGAGGGTATATTAGATTTTGCAATATTCCTATAAATCTTTAATTTATATCATAAA
>WP_011108534.1_b_theta
GGCAGCTACTATCCGTTTGCCGGTATCGATTAGCGATCCGGCAAACGGAAAAACTAAACTAGAGATATTAAATACAAATGATTATGAAAGAAAAACTATATACATTATTATTTTTTTATTGG
>WP_162303086.1_b_theta
GGCAGCTACTATCCGTTTGCCGGTATCGATTAGCGATCCGGCAAACGGAAAAACTAAACTAGAGATATTAAATACAAATGATTATGAAAGAAAAACTATATACATTATTATTTTTTTATTGG
>WP_162303087.1_b_theta
GGCAGCTACTATCCGTTTGCCGGTATCGATTAGCGATCCGGCAAACGGAAAAACTAAACTAGAGATATTAAATACAAATGATTATGAAAGAAAAACTATATACATTATTATTTTTTTATTGG
>WP_008767215.1_b_theta
TTTGTTTCTTGCTTCGCGGGAAGAAATATAAGACTAACGACTGAACTGTAACTGTAAAACAGAAAAATAAAGAAACAAGAAACAAACTAGAGCTAACTAAGGATAACCAGCAAAAGACTAACTAATGATAATTGGTTAAAGACTAGGACTTTAATAAGACAATCAACTAAACAGAGAATTATCAATTAACA
>WP_255309861.1_b_t

>WP_008767442.1_b_theta
AAATTGCATAAAATAATATTACTTCTAATTTTGCTATCAGGATTTTCGATAGAAATACCTGCGCAGAATTCTTTGCAGAATGAGATAGGTAAGCTCGAATATGACACGTTGAAGAGGGATATGGTTAAAACAAAACATCTGGACCATATAAACATTTCTGATACATCGTCAGTGAGTTGCGCAAACTACTTGTTTCGT
>WP_224200664.1_b_theta
GGATGATTATGTTATCAGCAGATCAGGAAGATTATTTAATGAGACTCCTATTGATAAGAGAGGTAAAGGCTCAACAGATAATTTATATCTTTCCAGTGATCGTTCTATTTCAGTAACTGTAAATCAGGGGTTACTTGGGGAAATGCATAGT
>WP_011108706.1_b_theta
TGTTTTTTCTTTTTTAGTTAATATCAATTCGTAGTTGCAACTATCTTTATGCTATTTTTTATGACTCAATTATTTAATAGCTTTTTCTATCCTTCAATAATCCTGTACTCAAAATACCCTTACCAAAACTTTGACCAGATTATCTGCTACAAAAGTAGAGATAATGGAGGTATCGACCTAGATATTCAATTCAGTAATAATTCAGAAATAAAAAATGCCATTTGATCTAAAAACTGAATTATTTCTGAACTCATCCATACTAAACAATCAGACAATTAGCCTTTAATTTGCAAAAAAAATAAAAGAACGATCA
>WP_008767454.1_b_theta
TGATCGTTCTTTTATTTTTTTTGCAAATTAAAGGCTAATTGTCTGATTGTTTAGTATGGATGAGTTCAGAAATAATTCAGTTTTTAGATCAAATGGCATTTTTTATTTCTGAATTATTACTGAATTGAATATCTAGGTCGATACCTCCATTATCTCTACTTTTGTAGCAGATAATCTGGTCAAAGTTTTGGTAAGGGTATTTTGAGTACAGGATTATTGAAGGATAGAAAAAGCTATTA

>WP_011108973.1_b_theta
ATAATGTATAAAAAGATATATTC
>WP_011108974.1_b_theta
ATAATGTATAAAAAGATATATTC
>WP_011108975.1_b_theta
AGCCATGGTTGCTGGAATGAAAATATCAGTGGCTATTCTAACTGTTTGTTTAAACCCTAAATTTTGTATAT
>WP_011108976.1_b_theta
AGCCATGGTTGCTGGAATGAAAATATCAGTGGCTATTCTAACTGTTTGTTTAAACCCTAAATTTTGTATAT
>WP_008762757.1_b_theta
ATTGGCATTTTTCTATCTTGTACACTTATATATTACAGGAGAAATCTCATTTCAACGCTTGCATACCATGCATATATAAAATCACTGTTAAATCACTGCTAACTCTCAAACAAAAACATCTCCCGTCTGTTACATACCCGACTTGAATATAGTAACCTAAACTTTTAAATTA
>WP_008760779.1_b_theta
AAACAAATAAAATCATCATTTCTTAAAACCTTTTAGTTCAGTTCGTGTTATTTAAATAGTTCAACAAAGAAAAGATTATAATAACCACTTAACTATTAGGTAT
>WP_008764161.1_b_theta
TATAGTATGTTTATCATGTTTCAAAATGCAAGTATAGCATATTTTCCAAAGAGAAGCATTTAAAAAAAGCTAATTCATTTTGCGTTTTCACAGAAAGCACATACATTTGCATCGTTGTTTATGCAATTATTGTTTAAGTCAACAAGTATAAAAAAATGAACTATCTCAATAAACAATTTATATGATAACGGGGTTATTTAAAAAAGAAGTATCACCTTATTAATATGTAAGATGCGA
>WP_008764197.1_b_theta
ATGCTATCCCCACTATCTTATTCTTTACTAATTGAGTTTTATGGATGTATTACCCGAATCCTTAGATCAGTAATTCCGGATAATTAAGTCTTTCGAGGTCC

>WP_022471043.1_b_theta
AGAACTTTTCGGCTTACGATAAAAATACCGGAATATCATCCGTTGAGGGGTTGGCTTTTTAGTCAACCCCTTATTTGTGCGTACTCTCTAACAACAAAACTGCTGTTTTAAGAAGAAATCTTTATTTACTCTAATAAGATTCTAATAGGATTCTAACTATTCTCTAACAGGCTCTCCGAAGGGAGCCGTCTATCTTTGTGCCCGGAAATCCGGTTACTAACTTAACTTATGTTTAATT
>WP_011109387.1_b_theta
AGAACTTTTCGGCTTACGATAAAAATACCGGAATATCATCCGTTGAGGGGTTGGCTTTTTAGTCAACCCCTTATTTGTGCGTACTCTCTAACAACAAAACTGCTGTTTTAAGAAGAAATCTTTATTTACTCTAATAAGATTCTAATAGGATTCTAACTATTCTCTAACAGGCTCTCCGAAGGGAGCCGTCTATCTTTGTGCCCGGAAATCCGGTTACTAACTTAACTTATGTTTAATT
>WP_011109389.1_b_theta
ATTTTGGCTTTTTCGATAAACAATTTTCCGTTTTATCGGTTTATTCTTATGATAACTAAATATAAACGGAA
>WP_032841205.1_b_theta
TTTTATGGTTTGGTGGTTTTATTCAAAAATGCTGCAAAGATAGTAATAATTAGGCACATACATTTTGAATCTAGGTATTAATTCCTTATATTTGGAGAATCAACCATTTTTTATTTAGTAAT
>WP_011109406.1_b_theta
ACGACAGACAGGGAATTGTTTCCCCAAGTCCGACTGCTGCATGAACTGTTGCGTGAGTATTCCAAAGACGGGAAAGAATACAAGGCATACGTCCAAAGCCTCAAAAACTCCGCTCTCACAGCCTTTTACACACCGGAACCGGTGGTAACGGCCA
>traJ4_b_theta
ATCATTCATCCTAAAAAAACAGATAAACT
>traM4_

In [3]:
gene_list = pd.read_csv('/home/nastya/Downloads/table2.csv').wp_gene.to_list()

In [4]:
gene_upstream_extract('/home/nastya/Bacteroides_uniformis_annotations/genome_foot/b_theta.fasta', 'b_theta.gff3', 'intergenic/b_theta_intergenic.bed', gene_list)

>WP_009039976.1_b_theta
TAACAAAAAACTTTATGTCTAAAGATATAGAAACTTTGGATAATAACAAAGCCCGGGCTTGCGAAGTCTGGGCTTTGCGCATAAAAAAGGGTGTGCCAACGTTGAAGTGCACCTCAAAAGTTAGATAAAAAACTTTTGGGGTCATTTCACATTTGACACAATCTCTTTCTTTTCAACTTTCTTATACTTTTATGACCAAAGAGAAAGAAAAAGATTATGTGCGTTTGAATATGTTAAAATATGACCCGATTTGTCACGTTTAGGTGGTATTTCTTTTTTGTCGAACTAGGAATTTTGTTTTTGCGACCTACAGATTTTCCATTTCCTGTTTTCAGACTGCAAATTTACGAATAATCCATATTTTTATTTCTATATATGTAATAATAACAAATTAAAGAATTTAATAATTCCTTAATATTCCCGAAAGAAATGTGAAATAAAATATATGGAATTTTGCAGCGAGAAAAAATAGTAAGT
>WP_009039975.1_b_theta
AGTTTGGAAATATATACAAGAAAAAATGTATAGAACACAAAAATAATGAAATCAATCAATAACTAATCATTATCTT
>WP_011107068.1_b_theta
AGTTTGGAAATATATACAAGAAAAAATGTATAGAACACAAAAATAATGAAATCAATCAATAACTAATCATTATCTT
>WP_162303141.1_b_theta
ATAATCTTAAATTCTTCATATAAAACA
>WP_011107070.1_b_theta
ATCAAACTGAGATAATGAAAAAGTCAATAAATATCCTATTTTTTACGTTGATGCTTGCGGTTTTTTCTGCTTGCATCAACGATAATGAGCCGATAATGGCAAAAGCTGTACTCTCAAGTACAGAAAATCTTTCTTTCGAGTCAACTGATGCTGAGAGCCTGATAATCACCATATATGCTGATGCTGAATGGAGAGTCGATACGCCTGACTAGATTACTGT

>WP_008764996.1_b_theta
AAATTGATATTGTGGTATATATTCGAACTGCAAAGATAGTAAGGAAACACATAAGTACGATAAAAGGCTCAAATTTTCATCTTTTCTTTTTCCGTTTTAGCTCTTTCTTTTTCCGCTTCGGAAATTAAATTCCGGTTCTTTTTTCTTCTTTCCGTTACTTTCGTGCCGTTGACAACGTAACTAATGTAGAATAGAA
>WP_008764887.1_b_theta
ATCAAATTGTTTTTTTATGTTTATACCTATATTTTTATTTCACTGCAAAGATACTGCCGGATTTGAAATCTGTTTGTATATGAATTACGGATTCTTTTACCTATATTACAGTTCCGCTAAAAATGGGCTTGATAAAACATAAAATATGACTTCCTCTATATCAACCATATGCTCTTTTGCAAGCTTGATTCTTTTTACCTGAATATACTGTTGTAATAGGTATCGCGCACGCGTAATCATTATATAATAGCAAAATAGGATGCTATAATAATATCTGTTCTTGTGTGATAGAAAGATGGAAACCTATAATCTGTAATTGGGGTAATTATACCCGTAATCTGTCTACAATCCCTTCACTTAATTCTTCCGACCTTTGTACTGTCGAATCACCTTGAGTATATGGAACTAATTATATAAAGAATA
>WP_011107761.1_b_theta
GCATATACATAACTACGTAATAAAATAATAAAACCGATATACAA
>ccsA_b_theta
GCATATACATAACTACGTAATAAAATAATAAAACCGATATACAA
>WP_011107763.1_b_theta
GCATATACATAACTACGTAATAAAATAATAAAACCGATATACAA
>nrfA_b_theta
GCATATACATAACTACGTAATAAAATAATAAAACCGATATACAA
>nrfH_b_theta
ATAAAATAGCCAGCCTGCCATGCAAAACGGGGGCGGGATGGAGTATCTTTTTTCTCTTATT

>WP_005681510.1_b_theta
ACCTTATACCTTAACCAGAGTACCCGTAA
>WP_004289235.1_b_theta
ACCTTATACCTTAACCAGAGTACCCGTAA
>WP_004311292.1_b_theta
ACCTTATACCTTAACCAGAGTACCCGTAA
>WP_004289233.1_b_theta
ACCTTATACCTTAACCAGAGTACCCGTAA
>WP_004311295.1_b_theta
ACCTTATACCTTAACCAGAGTACCCGTAA
>WP_004293828.1_b_theta
ACCTTATACCTTAACCAGAGTACCCGTAA
>WP_004311297.1_b_theta
AAAGAATACAAGGATATCCCTTCTGTCGGGGGTATCCCTGTTATTTTTCTAAAATCCACACATCA
>WP_004293827.1_b_theta
AAAGAATACAAGGATATCCCTTCTGTCGGGGGTATCCCTGTTATTTTTCTAAAATCCACACATCA
>WP_005681511.1_b_theta
ACACCTGCGGGATATACGGGTCGTAATCAAATAAGGAATAGCTCATCGGCCATTCAATATTTGAAGACAGACATATTCCCGTTCGTGTAAGTCCGCACGAACGGGAATTTTTTGTGTTTGTTAATAATATGTTTTACATTAAAATGAGTGACCTA
>WP_008766350.1_b_theta
AATAGGTAATAGGTAATAGGTAATAGGTAATAGGGCGTGGAGAAACTAAAACAAGCAACAGAATAATAA
>WP_008766368.1_b_theta
ATAGGTTATAGGTAATAATGGTGGGGGAGGTTTTGTATTAGTTTAGTGCGGGGGGAGCTGCTGCGAATTGTTGTGGGCGTGGTCTACTATGAACTGCTGCGGGCGTTGTCTCCTGCGCTAAAAAAACTTATATTTCAAAGAAGATTGAAACACGTACTTCTTTGTTTTGTTTACATTTACAGACAAAATAAGGTTCTA

>WP_008762717.1_b_theta
TCTCTACTCCATCTTTACTTATAAGATAACTGCTTCATACGGTGAAACCGATAGTTCCATGCTTTGGAACCAATCGTTTCACTGTATGAAACAAGAGTTTCCATGCCTTTGGAACTAAAGTTTCAAGTCACTGGAACTAGAGTTTCAAGCTGCTGAAACTAAAGTTTCTCACTGATGAAACCACAGTTTCTCACCGATGAAACTTTGGTTTCTCGCCGATGAAACTTTAGTATCAAGGCGCAGAAACCGACCTTCTTCGCCGTAAGCGTCTCCGCGATGCCACAGTGACAGCAAAACTACTGTCACTCTGCTATCACACCTTGCCGTCACCTCATAACAATCACATAATCATCTATTTGCCCCCATTTATGTGACAGATGACAGCATATTTTTATTTTTTAATTTATATGTAGAGAGTATATATTACTCCCCCAATAATAATTATATAATA
>WP_008762721.1_b_theta
GGCATAGAATCCTTAGATTTGCAATATGACAAAGGTAAACAAAAAAACTTTCCAGCACTTTTTTGGCGCAGAATATATGTAAGAATGCGTGCCAGTTTGGTTAATTATAGCGCCACAGGGCGATTTAGTGTTTCGACTTCCGGTAGCGGCATTCCTTTTCTCTCTATTTTCGTCACAGAAACTAATTTTATTCATAAATTCATTTGTTCA
>WP_011108971.1_b_theta
ATTATTGGAGTGGTATGGCAACAATTAGAATTTTTTCGAATTTTCATTCGAAAAAAATAATACTGAGTAATTGGAGGATTTTTCTCTTTATCCCTACCAGTAGCAGTTTTAATTTATGAAATAGAAAGATATGATAGTGTCGCATTTAGTTTATATATGAGTAAATGTCTATGCCAAAGAAGTAAAAAAGGAAGTTCGTTCTCCAGTTATTGTTTTCTTGATTTTTATCTTGATTTTAATAGCGGGGAATTTAACCTTTAAGGAG

In [5]:
gene_list = pd.read_csv('/home/nastya/Downloads/table3.csv').wp_gene.to_list()

In [6]:
gene_upstream_extract('/home/nastya/Bacteroides_uniformis_annotations/genome_foot/b_theta.fasta', 'b_theta.gff3', 'intergenic/b_theta_intergenic.bed', gene_list)

>WP_011107050.1_b_theta
TAGGTGCAGGTACAACTTTTGACCACTGTCAGGCATACTACGGTGAAGACGATGCATTCGAATTCTTCGGTGGTACAATAAATGCCAAGTAT
>WP_011107058.1_b_theta
AACAATCCTTGTTGATTGACAGAGATTGGCCAGTGTAAGTTTAAACTAAAAAAGATTGTG
>WP_055235382.1_b_theta
AGATTATAGAAATCTGTATAAATTTAACAAA
>WP_011107089.1_b_theta
GGTATATAGGGGAACTTATCACAATTTGTGATAAGTCCCTTTATGCTTCCAATTACTCACCATTAAATAATAACATT
>traN_b_theta
TTTATTTATCAACCATAAAAATTTCAATCCAAAA
>traM_b_theta
TCAATTAACAATTTAAAATTCATTCAGTA
>traK_b_theta
TCAATTAACAATTTAAAATTCATTCAGTA
>traJ_b_theta
TCAATTAACAATTTAAAATTCATTCAGTA
>WP_008766240.1_b_theta
ATTTTCTTTACTTTTGGGCACAAAGGTATTGTTATGATGTTACAAACGTATGTATTTTTTCTGAAATAAAAAAATAACCTTCTTTCTTATTGAGTTTCCGGCCGAGAACTTTGTGTACTCTTTAATTGTTGTCCTATTATAAAAGTGCGAACAAAGCACACAACAAAGTTCTATAAAAAAGAAAGGAAGATTA
>WP_008766208.1_b_theta
AGAAATAAAAAATGGATTGTCCGGGTAAATGTTAAATTGATAATTAATAATATGTTGATTAATAGGTTGTTACTTTTCATTAACCTCTTGTAATTTGTCCGGGTTGATTTTTTTGTTTTTTCGAATACATACCGTACTTTTGGCGAAACAAAAAAGCAATATAATT
>WP_008766207.1_b_theta
AATAAGGAACTAGACCTTAACTTTTA

>WP_011107544.1_b_theta
CATTACTTGTATCTTTCATAAAAGCCTTATGAAATGGTGATGCAAAGGTAAACATAATTTTTATATCTCAAATAAATTTCTTCAAATTTTAGTAGAGGAAAATCTATATTATAATTTCTATTCCAGATGAAAAAAAGAATCGGATATGAAAAATCGTATATTCGATTGCTGTATAATTTAAAATATACAGATAGAAATAAAATGATTCGATAAAATCCTCTGGATTTCTAATCAGATATTTTCAGAACTTAATATTATTGGATTTACTTCTTGTCAATATGTGCGATTTTATCAGTTTATGCTAACGATAAGTTAATTGTATGCTTACTAACTGATTCAATCTGAAAGATTAAAATGATTGAGATATTAGTTGTATCCGGAACAAATAATCTGTTTCACTTGTGTTTTATTCATTCCTTTTAAGGGAGCAGAACGTTCCTCAACGACCAATGTCCCGTTGGTTTAAAACCAAGGAGTCGTTCCTTTAAAAGGAAATAGCTCAAATAACGTTCTTTTTCTGTATAATCCGATAATATATATGCAAAAATATATTGAAATTCTGTATGTTATTACGCGTTTTTATGCATAATTTCTGCTGTTTGATGAAATATGTATGGGGCTATTCTTTGTGTAATCTTAAATATAGAACATTTTTTTAAGATTCTTTCTCTGTTTGTTAAAGTGAACAAACCCGTCCTCTCATCTGTTAGAAAGTCTCAATTTTAGATTTAACCTAACCAGC
>mgtA_b_theta
GAAACTCTAATAGTTTTCTAATACCACTCTAATCATTCTCTAACGGCTTCCTCCCGAAGTCCTCCTTACTTTTGCATCGAATTAATAAATGCAGTTAGTAACTTAAAAAAAATAGAGAGTA
>WP_162303160.1_b_theta
TGAACGATTGCTGTAAGCCGCTATTTAGTTTCAAGGCTTGAAACTATTAGTTTCATGCCTTGAAACCATTTGT

>ccsA2_b_theta
ATAAAAAGGTCCTTAACTTCGAAAATATCTTAGGTTTAGGAAGTAGCTATACACTTCTGAAGAGTAGGTATGTCTATATACAAACTTACCCAAGGATACGTTAATGTTCGTAAACTTGGGTAAGATTTTATAGATATTGTTTGGAGAGGACTTAGAATTCACCTTCTTTATTTTTGGTATTACAGATTTCACTTTCCTATATCAATCAGCTTTTTATCAGACAGCTTCAATTTTATAAATCTCCACTTTCCACTACCTCTTGTATCGACTACTCCCGCTATGGTAAGTCCAATATAAAACACTGCGTTTTACAAGACATAAGGCTATGCTTTAGGGAGTGAAAAGCATAGCTTTATACCTTGTAAAGCGTAGCTTTAGCTTCCATATCCTCTCTCCTTTCACCAACAACTACACTGAAGGATTACCCTAAACTCATATTAATATCATTATCTACTTGTAACGATTACAATTTTATAAAGATTTTATGTGAATATTCATTTTTATTTCTTTACTTTGCAAAAGATAACAAAACAGAAGCACGAAT
>WP_162838073.1_b_theta
AACATTTATCGTTAATTCTTTTAATAATGCGGTTCTGTTTTTGAAACGCAGCGCAAAATTAGGACAAAGTGATATAACGGCAGTTCGGACATATCCGAACTAATGTGAATATTTTTATAACAACTAAATTATTATTAACGTTTAAAACAAGAAAAAA
>WP_008767888.1_b_theta
TTTTTTCTTGTTTTAAACGTTAATAATAATTTAGTTGTTATAAAAATATTCACATTAGTTCGGATATGTCCGAACTGCCGTTATATCACTTTGTCCTAATTTTGCGCTGCGTTTCAAAAACAGAACCGCATTATTAAAAGAATTAACGATAAATGTT
>WP_011107913.1_b_theta
TCTTTAACCAAACTATAATTAATAATTTA
>WP_011107914.1_b_thet

>WP_008766471.1_b_theta
ATGTAACGATTAAAAACGAAGTATTA
>icd_b_theta
TTTGTTAAACATTCCTCGAACGCCGTTCGTTATTTTTATAGATAGACTAAAAAAAATAAAAGCAATA
>WP_008766479.1_b_theta
TACCCTATAAGGCTTTAGTTTCCACAAAAGTAGAAAATTCTTTTTTATCAAACATCCATTAGTGAACAAATAAAATTGCAGATTGTTATAAAATCTTCCAAGAACCGAACTTTAACAGTTAGAATAGTAGAAAA
>WP_008761029.1_b_theta
ATTCCATTCGACCAATGATCTGATAGCAGCTATGGAAGGCAGAACAGAGCAACTATAATGAAATAACTAATGATTTAATAATAATGAAAAG
>WP_011108127.1_b_theta
AGTTGTGGCTTTTTTAAAATTACTCTATGCAAATATAACAAAGTTTTTGATAAGAAGGTTGTGAATTCAAAATATCTTCGGAAAAGGAAGCAAATTGAACCTTTTTTCCTCTTTTTAGTTTATAAATGACGATTATTTAATAACTAAACCCGAAAATTA
>WP_008761025.1_b_theta
TTGCCCTCCTTAGTTTTTATGATAGGTTATGTACAAAAATAATGAAATCCCATTTGAAAAGCAAATAATGTGTAATGTTTCCTTCCATCCGCAGGCTTTTTTAACTCAAATAAGAGCGGATAAGATTTTTTCTGAATAAATTCCCTTTTTTCCCTTAAATTTTTCTCTCTCTCAGGCAATATACGTTGTATATTTGTTATTATAGATATAAGAGTATCAACTATTTAATTTGGAAGTAATA
>WP_008766603.1_b_theta
TTCCTCTTTTCTTTTGGGGTGGTTTCTGTTGCAAAGAAACGTATAAATAAATACAAATAGCAAAGTATTTGAGGATTTTAAGTTGAAAAAACTGCAAATCGGACAATTTATAGCTTAAA

>WP_008765180.1_b_theta
TTTATACTACATCCGTTAATTCTTTATTCATATTCTTCATTCCCTTTCCATCGCCTAGCCGGTCGTTCTCCGGACATCCCGGCTCCTGAGTCACCGGAGCCACGTTGGGTTCTTGAATCTACAAAGGTATGGATATGATATGAAAAGGACAATCTTCAATCGGAAAAAGATGAAAGGAGAAAAACAAAAAAACATCTGACCATTTCTTTAGTTGAAGAAACTTATTAACGCACTTTGAGTTTTTTGCTTGGCAGTTCATTTGGAAACACATACATTTGCATCGTGATCACAAAACAAAAGAAACAGATTTACTAACTATTAAAAAACAAAAAAGATT
>WP_011108356.1_b_theta
ATACGTTGCTTTTTAAATTGTGAATATTCGTTGTTAAATTTCGATGATGCAAGTTTACGGGATTTTTTTAAGTACGGCGGGGGTGAAACTGACGGAATTTAGCCTAAAAACTGTTTGCGGATAGTTTTTAGGCCATAATCTGATAGTTTTTGGGTGTTTTTCACCCGTTTTTTGAATATTCACTGTTGCCTCATTTGAACTTTATAAGGTAATCATTGTTTTCATCAGACCTAAACTTTTAACGAACCGGTATA
>WP_011108358.1_b_theta
AACCTTAACAACACAATTCAATAGGTATCAGATAACTATAAGCTGCCAGTTGTCAGCGAATAGCCAAAGGCAAGAGGGAAACAACCCCTTCTTGCCTTTTTTATTTCCGTTCCCCCCACCTTACGCCTGCCCCAAAGCCGCCCCTTAATAAACGCCAAATAATGTAAAAGCCGAAAACTTTCAAAACCATCTGCTGTTGTTGAGGGAAATTCGACGGGGAAAATCCCCATCGCCCTAAACCTTTAAAAAAACAAATTAGT
>glsA_b_theta
AACCTTAACAACACAATTCAATAGGTATCAGATAACTATAAGCTGCCAGTTGTCAGCGAA

>WP_011108422.1_b_theta
AACCCTTAAACTACAAATGAA
>WP_011108425.1_b_theta
ACGGGGAGAATTCTCCCCGTTACCCCCGATTACTGTATGGAGATCCGCGCGGGAGAGAAAATCAATCTTACGACAAGGGAGAATTTTGATTTCCTGTACCGATCGGCACTCAGATACAGCGCACTTGTCGGCCTCAGGCTTCCTTTCCGTCCGACCGGGAAA
>WP_011108426.1_b_theta
ACGGGGAGAATTCTCCCCGTTACCCCCGATTACTGTATGGAGATCCGCGCGGGAGAGAAAATCAATCTTACGACAAGGGAGAATTTTGATTTCCTGTACCGATCGGCACTCAGATACAGCGCACTTGTCGGCCTCAGGCTTCCTTTCCGTCCGACCGGGAAA
>WP_004323231.1_b_theta
AAGCTTTTGATTTTTTGTCCGATGTAAAGGTATACTGCCGGCATGTTTGTCTCCAAGTCTCTGACTACAGGTGTCAGCAGATGTCGTCAGATTTCACGGATAGCTGGTCGAGTTGTAAGTCGTATTTACAAATTGTACCCGGACTTTACAACATCCGGGATTACATTTTACAGTGCTTTTTGTAATTTTTTATTTTATATTTCTTTGTGAATTAGAATGTTATGTAAAAAGATGGATTCGTTTTTTTGAATCCGGCATGTAATTTTCATTATAGTGTAATATACTCTTGTAATGCGCACAGGAAGAGTATTCAGGTAATCACTTTTTAAAACAATGGAGGATATGCCA
>WP_011108431.1_b_theta
AAAAAGAAAGGAAAGATTTCCTTAAATAACTTTATACAGGAAAATA
>WP_011108432.1_b_theta
CAGGATCTAAATATAAAAACATAGAAATTATTAATTCATTCAAAGTTGCAGATTA
>WP_011108433.1_b_theta
CAGGATCTAAATATAAAAACATAGAAATTATT

>WP_008765256.1_b_theta
ATCTTTCCCTTTTTGATGGCAACAAAGCGATTATACTCCATTATGCTTCATAATTGATTCAGTATGGTCTAATATAATCACTTTATTACCAATGTATTATAATGAATAAAATTAAAGCCTGTTATGTATATGCAGGGTATGAAACATAATATATAGTATATGGTTGTACATAACCCTCTATCAATCAATTGTATATATACAAAATAGACATTATTACTCTCTATTACAAAATAGAGAGTAAAGAGTATAGGGGAATTATGTATAGTTAATGTGTTGTATGTTAGATGATTATGCGACTAATTTGCGCATAGAATAGTATGGCTGTAAAATCAGGCCGGATGATTCTTGTTTAATGAAAAGTAGAAAACGACAGCACTCGTTTGTTGGAAAAAGAGAATTATGGGA
>WP_008765255.1_b_theta
ATCTTTCCCTTTTTGATGGCAACAAAGCGATTATACTCCATTATGCTTCATAATTGATTCAGTATGGTCTAATATAATCACTTTATTACCAATGTATTATAATGAATAAAATTAAAGCCTGTTATGTATATGCAGGGTATGAAACATAATATATAGTATATGGTTGTACATAACCCTCTATCAATCAATTGTATATATACAAAATAGACATTATTACTCTCTATTACAAAATAGAGAGTAAAGAGTATAGGGGAATTATGTATAGTTAATGTGTTGTATGTTAGATGATTATGCGACTAATTTGCGCATAGAATAGTATGGCTGTAAAATCAGGCCGGATGATTCTTGTTTAATGAAAAGTAGAAAACGACAGCACTCGTTTGTTGGAAAAAGAGAATTATGGGA
>WP_008765254.1_b_theta
TCCCATAATTCTCTTTTTCCAACAAACGAGTGCTGTCGTTTTCTACTTTTCATTAAACAAGAATCATCCGGCCTGATTTTACAGCCATACTATTCTATGCGCAAATTAGTCGCATA

>WP_008767254.1_b_theta
TCGGAGACGTTTCTGTATGTAATCTGAAAAATGTTGAATCTAAAATATAAAAGCCTATGAATGAAATGAGAAAGAATCTCTTTTAAATGGAAAACAAGTATTTATCATTAACTAATCTAATAATTTA
>WP_008767255.1_b_theta
TCGGAGACGTTTCTGTATGTAATCTGAAAAATGTTGAATCTAAAATATAAAAGCCTATGAATGAAATGAGAAAGAATCTCTTTTAAATGGAAAACAAGTATTTATCATTAACTAATCTAATAATTTA
>WP_008767256.1_b_theta
TTGTTTCATTACCTGACTTCTCAAGTATTAATATCCGGCATATCAGCTATTAATATCCGACAATTCAGTTGTTAATACCTGATATATCAGCTGTTAATACCTGACAAACCAGGTATTAATCCCCCCAATGTCTGCCTATAGGTTGCTATACGTTGGATAACATCATTTGTGCAGAAAAAGAAGGGCGATGTATCCAAATCTTCCATTATAGGTGACCAAATCTTCCATGTGGATGCAGAAATGTTCTTTTTGCAAAATTTGTGTTTATGGATGTCATATAGGTCTCCTATTTTTGCAATGCTTTCATTCTGTTTTAATACATAGAATCAAAGACAATAAAATAAGTATGAAAAACACAATAATTTATCGCA
>WP_011108548.1_b_theta
TTTTACACATAATTATATTGCGGGTAGCAAAAATACAAAAAAAACATAATTATTGGTTAAGAATCTCTTTAATTTATACCCTTCCAGCCGAAATGCAGTTTATTCATTTGACGATTCTTGCTAAAATACAGTCCCACAGTATTTTTTCAAGGAAAAATCAATGGTGTTTTTAGGTTAAAACCATATCTTTGCCACAAACTAAAAAGAATAATTGGATGTTAATCACATTTACATTCCGCAAGGATCTCGTTCTGTAAATAATCTGTAAGTCTTCCT

>WP_011108657.1_b_theta
AACAAGGCTTTCAACTTTTCTTTTCACAGTTCAACCACATTATTATATTATAAGGACAACAACT
>WP_011108662.1_b_theta
AGAATTAAGGTCTGCGACCTTGGGCGTGGATAGCTGAAAGGCAGTAGTTCCAACTAACGTGTTCTGAGAGGGAGCGAGGTTATGTTTTGGGAACCCCAAAACGCCTCGCTCCACCATGAGGGCGGAGGAATTTTGCTCCCGACGGTCGCAGAAAGTGAGTGTACCAACTGTAAACAGTGTATCGA
>WP_162303095.1_b_theta
TAATAAATTAATTTAAAGGAGTACGTGCAAAAGTACTCCTTTTCTTCCTTATCCATGTTCTGACTCTTTTTCAATTTTGTTCAATAGGCTTTCAAAAATCGATAAAAATGCCTTGTTAGCTCCTTCTTGACTCGGAAATGAATAAAATTGTGTAGTAAATGAGTAGTTGACCATTAGCAGGTTAGTACAGACACTTCGATTCTCACCGAAATGAGAAAAAATTCTCACTGCAATGAGAAAATATTCTCATAGGCATGAGTATTTTTTCTCACTGTGGTGAGAATTTTTTCTCATCACGATGAGAATACTTTGAGAATATAGTAACCTGTATTGTGCTGTCTATTAACTTGATAATGATTGATAACGTTCTGTCTTATGCCTGCAATTGATTGGCTGCAAAACGTCCTGTTTCTAAAAAATGCGGGTTTTTGCCTAAAAAACAGTTTGTTTGTCACTGCACGGGAAGTGAATCACCATCTATTGGCATTTGTTTTTGATCACTTCTCTATCAATAATGATCATTTAGCGGATTGCACAGGGTTCTTTTGATCATTATTGAATTTTAATTGATGATTTTTGAATGTTTCCTTTTATGCTGTGGTTGTATTTTTGTCGAAGATTGAAACTTATTGTTAGCTAGTGAAAGTATGAATCTTTATTATTAAATGTAGTATT
>

>WP_008766981.1_b_theta
AACAGATTCTAGTAAACCAATTAAGAACACTAGAGTGTTTCTGAATAAACCTAAATTATTAACTAAAACTTTTGAATTATGAACAAGAAAACTGAAAAGAAACAGGAAGAGGCAAAGGGCACTAAAAAGACCGCCCAAGCTAAAGAGTGCAAGTCTTCGACTTCAAGCAAAAAAACTGAAAAGAAATAGGCATTCAGAATCGTTAAGTTTGTGTCATTCTTCAGGCAGACAGGAATCTGCCTGAAGAATCCACCAATTTATTCTCAAAGTGACTTTACAATTTTAAATTAAAAGAATA
>WP_008766979.1_b_theta
TCCTTTCACAAAACACTCAAAAATATATTTGCAAAAATATGAAAATATTATTTAAACAGCAACTGAATAAATTAATAATTATATGTTAAAAACAGAAACTTAAAAAACACAACTATCTCATCTATCGGAAATACCGCATAATACCCCAAATACTCCCCAATAAAAACATCGGAAATAGATAAGTGTTCTTTCTAAATTATATTAGTCATTTTTTGTACGATTTTCCAGCATCAGATCATAAATATACACGAACAGGAATCATCTTATTATTAGGACAATTTCAATCATATATTTATTCCTTCGGCTATGCATTTAAAATCCATAAGCCTCTTTTTCAATTTTCCATAAAATAAAAAATGATGTTAATTCCATTTTCAAACTTCGAACTATTTTTTTTAATCCCGTGTTTTTTTTCATACAGTAGGAGCTTAAGTACATTTCGGATGATATAACAAAAGTAACATAATAAATCGGGGAAGTGTAATAGAGTAAACCTACATATTTTTCTAAAAAAAATAAGTTTTAAACATTGTTTCATCAATAAAATCTACAGCCATGAAAAGTATAAGTTCAAATTTATGGATCAGATTAATAGTCAGTTCTATCATAGGAACAATAATTTATTGTTTATCAAGTATGAGACAGCAAACAAG

>WP_011109179.1_b_theta
AGAGATTTTCTTGATTTATATAAAATTCCAGTATCGCAAGTTACTGGAATTTTTTTTATGTAATATACCCTTTACTTCTTATTACGGCCTATTCGCCAGAGATTGTTGGTGAAGGAGAAAATATTATTATATATGGTAGAAATTAAGGTGTGCATTTTATTTCTTATGCATTTATTCTTTTGGGGATCTACTACCTTTTTTATTTAGTATATTTGAAAATAAATAGTTGTGCTTGGCTGTTATGATTCATTTTTATATTTAACAGATTGGGATAAGTTGCTCTAGAACTTGGTAGTTTTATACCAATAAGTGTATTTTTTCCTGTTTTCATAGGCTATTCAAAGGTTATTAAGTTTGTGTGCTACTTAAGACGGTGTAAAGTTATTAATAAAATTGGTGAACGCAATGTTTTTTTTATTTTAATTATCGGCAAACGATGTTTTTTTTAATTTTTCATTCTACTACTCAGATCGTAATAAGATAGATATAATCTTTTAATTATTGTATATTGTTGATTATTAGTATTTTATCGCTTTGGATACCAAGTTCTAGAGCGACTTATTCCAATCGGTCAATTACGACTATATTTCTTAATAATTATTTATGATAATGCGAAATTATAGGCCGCCTCCGGGCTGTGATTAAATTTTTATTATCGTTTCTTCAGCATAGATCAGAACAGTGCTGAAGAGAGATTGAATTCATTAAAAATTTCAATACCCCCAAGAAAAACAAATGAATGATAACAGTCAAACACAACTATATATATTCCAAAACTTACTAAATATTAAGATAAAGCATACACACCCAAAAACAAACGTATTAGAGTAAAAATGCACATCTTATTTCTACCACATTATCATAATGCTAGATATAGAATAGTAAGTTTTCTACTTATCCATATTAACCCAACTAATTATTAACCTTTAATCACAGAAAAACA
>WP_011109180.1_b_theta
ATCACATA

In [7]:
gene_list = pd.read_csv('/home/nastya/Downloads/table4.csv').wp_gene.to_list()

In [8]:
gene_upstream_extract('/home/nastya/Bacteroides_uniformis_annotations/genome_foot/b_theta.fasta', 'b_theta.gff3', 'intergenic/b_theta_intergenic.bed', gene_list)

>WP_011107050.1_b_theta
TAGGTGCAGGTACAACTTTTGACCACTGTCAGGCATACTACGGTGAAGACGATGCATTCGAATTCTTCGGTGGTACAATAAATGCCAAGTAT
>WP_009039979.1_b_theta
GAATTTATGGAAAAAAAGAAAATGCAGGCGGTGGAAAGCGGTAGCCTGTACATGACACTGGACGCTTTTTACAAACCCGTGTATCTGAGTATACGCGGGCAGGGGGAAGATGGATTCTCTCGTTACATCGGCAACAACGTGAGGTTTCATACGGCGGGACGGAGATTTGTTCCCGACCACTGA
>WP_055235382.1_b_theta
AGATTATAGAAATCTGTATAAATTTAACAAA
>WP_009039948.1_b_theta
AATGCTTACACTGAAAGGACAAAGGCATCTTGTGATCCCTTACCGGAGTGCCTGCTTTAGAAAACGGATTCC
>WP_009039946.1_b_theta
AACGATGATTGATAAACCCTAAATAAAGCAAAGAAAGCA
>WP_009039944.1_b_theta
AGGCAGCTTATCACTAAATAAGGATACATTAACTTTTTAAAACATTAAAGTAT
>WP_011107089.1_b_theta
GGTATATAGGGGAACTTATCACAATTTGTGATAAGTCCCTTTATGCTTCCAATTACTCACCATTAAATAATAACATT
>traJ_b_theta
TCAATTAACAATTTAAAATTCATTCAGTA
>WP_008766747.1_b_theta
ACTTCCATTTCTTATTTACGTTTGCTACGTATTTAATACGAAAATAAGGAATAATTTACCTAACTGGTATCAGAAAACCCGTTTTTATTTTCACCGGTACGGACTGCCTTCATACTCCGGACATTCAAGTATACTCAAAAAGATAGCCTGCAAGCATACCACTTACAGGCTATCACAATCCCCTTTTTTCCCTTTGTATCAGGGAGGCAA

>WP_008765563.1_b_theta
ATAAACCGACCAATATGAAACGATTAAAGATCACAATCCATAAATTATCAAAGACT
>WP_008765565.1_b_theta
TAAATCTACATTGTTTAGATGCAAAGATACTTTTTCTTTGCATCCGAAGAAAGAAAAACGGCATACATTTAAGTGTTTCTCCTTTCTGAACTGTATTATATATAAACAGACGATTATACT
>atpD_b_theta
AAAGGTTTTTTTTACCATAATTTTTAGGATTAATAATGCAGCAAATTTCTTCTTTTTTTATCAGAAGAGTGTTTTTTGCTTCTTATTTTTTCTGCTGACATTTAACAAAGTGGCAAAATTAAGGAATTTATAATTTTCTTGCCGAAAACTTTTCCGCTTCAGGCAAATGGGGAGTGTTTAATAGGATGTACGTGCCCGGATATAAAACTAGTTGCATATGCTACTATTTTATCGCTCGTTTAAGTGTTTGAATATTACCGTTGAATGGGAAAACTTGATGTTAAGTCATAGTTTTAATGATATTTTATAATTAACAGGCTTGTTTTACTATACAATCGTAATTGGAATGTTCCGATATTGGTCTGTTTGTAGTCTTTGTTGGCCTAACAAGTGTTATTGAACGACAAATAATGAAAAAGGAAGAACAACGATGCTAATTTGTTTCTACATTTGCGCGGTTGTTTATTTAACTATTTTGTTAAAAAGAAATCGGGCATGATTGAACAAAAGCTAAATTTAACTGTTTACTATAGTAGATTAAAGTAGATAAAACGCGAAACGTTCCTTGTCATGCTACTAAAATTCGATAGGTAAAGAAAGAATAATATAAATAAAGATTGCTT
>atpC_b_theta
AAAGGTTTTTTTTACCATAATTTTTAGGATTAATAATGCAGCAAATTTCTTCTTTTTTTATCAGAAGAGTGTTTTTTGCTTCTTATTTTTTCTGCTGACATTTAACAAAGTGGCAAAATTAA

>WP_011107544.1_b_theta
CATTACTTGTATCTTTCATAAAAGCCTTATGAAATGGTGATGCAAAGGTAAACATAATTTTTATATCTCAAATAAATTTCTTCAAATTTTAGTAGAGGAAAATCTATATTATAATTTCTATTCCAGATGAAAAAAAGAATCGGATATGAAAAATCGTATATTCGATTGCTGTATAATTTAAAATATACAGATAGAAATAAAATGATTCGATAAAATCCTCTGGATTTCTAATCAGATATTTTCAGAACTTAATATTATTGGATTTACTTCTTGTCAATATGTGCGATTTTATCAGTTTATGCTAACGATAAGTTAATTGTATGCTTACTAACTGATTCAATCTGAAAGATTAAAATGATTGAGATATTAGTTGTATCCGGAACAAATAATCTGTTTCACTTGTGTTTTATTCATTCCTTTTAAGGGAGCAGAACGTTCCTCAACGACCAATGTCCCGTTGGTTTAAAACCAAGGAGTCGTTCCTTTAAAAGGAAATAGCTCAAATAACGTTCTTTTTCTGTATAATCCGATAATATATATGCAAAAATATATTGAAATTCTGTATGTTATTACGCGTTTTTATGCATAATTTCTGCTGTTTGATGAAATATGTATGGGGCTATTCTTTGTGTAATCTTAAATATAGAACATTTTTTTAAGATTCTTTCTCTGTTTGTTAAAGTGAACAAACCCGTCCTCTCATCTGTTAGAAAGTCTCAATTTTAGATTTAACCTAACCAGC
>WP_011107545.1_b_theta
CATTACTTGTATCTTTCATAAAAGCCTTATGAAATGGTGATGCAAAGGTAAACATAATTTTTATATCTCAAATAAATTTCTTCAAATTTTAGTAGAGGAAAATCTATATTATAATTTCTATTCCAGATGAAAAAAAGAATCGGATATGAAAAATCGTATATTCGATTGCTGTATAATTTAAAATATACAGATAGAAATAAAATGATTCG

>WP_008764996.1_b_theta
AAATTGATATTGTGGTATATATTCGAACTGCAAAGATAGTAAGGAAACACATAAGTACGATAAAAGGCTCAAATTTTCATCTTTTCTTTTTCCGTTTTAGCTCTTTCTTTTTCCGCTTCGGAAATTAAATTCCGGTTCTTTTTTCTTCTTTCCGTTACTTTCGTGCCGTTGACAACGTAACTAATGTAGAATAGAA
>WP_008764995.1_b_theta
AAATTGATATTGTGGTATATATTCGAACTGCAAAGATAGTAAGGAAACACATAAGTACGATAAAAGGCTCAAATTTTCATCTTTTCTTTTTCCGTTTTAGCTCTTTCTTTTTCCGCTTCGGAAATTAAATTCCGGTTCTTTTTTCTTCTTTCCGTTACTTTCGTGCCGTTGACAACGTAACTAATGTAGAATAGAA
>WP_010538536.1_b_theta
TCTTACTGAATGAATGTCAGACTGTTAAAGCCATCTGACATACAACGAATGTAAATTTGTCAGTTGCTGTCGTTAAGCGCTGTTAAGCCCTAAATATTCTCTGTTAAAAGAGGACAAAAAAGAGTGACAAGTAGAATAAGTGAACGATATTTGTTATTATTTTAACGTCGTAAAAGTTAAAACAATGTCGAATATTAACATTTTAAAGAATTATAAATCA
>WP_010538562.1_b_theta
TATTGCATTTACATTTTAAAATTACTTTATAATAAAAACAACTTATCCCCGGAATATGTTCACACGAAAACCTTTTCTAAAATCAATTTGTTATTCCTCCACAAATTCACAGAGTATTCATTTAAAATTTAAAACGT
>WP_172461650.1_b_theta
GAATTGTAACTATGTAGTTGGGAGGGCTGATGGATGCCGCATGGTGAAAATAGGCGCGGAGTTCAGCAAGGAAGAACGCAACGTGAGTCGCTGGCTGGTGGAGGAAGGATAAATATGATGTATGTTC

>groL_b_theta
TTCAAAATTCTAAATTTATCATATTAAATAAATAAGATATCATTA
>WP_010539043.1_b_theta
CTTGGGGTTCGTTTACGAATATACGTTATTATAAGGAATAATCCGCTTCTATTGTTTAAAAAGTGTGGAACAAATATGCTTTTTTGCCATTAATAACCTGTTGTCAGTCATTTTGTCAGTCAATTTCTGCCAAACAATCATTATTTTGCTTTGGCATACTTTTCGCAACTGATTTAGCGTCTTCTTCAAGAAGAGGACAAATCATGAATATAATGTATAACAAATAAAATCAAAAAAAAGAACTA
>WP_008767711.1_b_theta
GAAATGTTATTCTATTCCTAGAATAACTTTGTTCTATAACTTAATGAGAGGCGGAGAAATGGAAATGACTCCGCCTTTTTGTTTTCTCTTTACCCCCAATATTAACTATAACGCATTA
>cysK_b_theta
ATCTATAAACTTAAAAATATTGAACGGAGACAAAGGTACAATTCTTTTCTGTATCTTTGCGCATCAACCTGATAAAACGAGGTGAATGA
>WP_008765470.1_b_theta
GGTCATTCCCTTACATTATTCATTAAAATATAGTATAAATTAAATATAAGTTCTA
>WP_011108036.1_b_theta
TATCGGAGACCTTGCATATTCAGTATTGAATGTGTTGTGCTCTCTTACTATATATAAAATTAACCGCTGTGCTGAAACAGCAAAAATAACATTTA
>WP_016269667.1_b_theta
TATCAGAATAAATAAAAGCAGAAGATAATGTCTTCTGCTTTTATACAGACTATTACCGTTTCATGATTCAACGGAGGAATTTCTGTTATACGGAGAAAACGAAAAATCATTTTACATTTAAGTTCTGCTAATTCTGCCTCTTTTATGCGCAATATCAGGAGTATATAATCCTGCACTTTCCATTCTGACAATAGATTAT

>WP_225011966.1_b_theta
TTAATGGTTAACTAACAAATGAAAGAAATAATTA
>WP_011108167.1_b_theta
CCTTAATAAACAGACGAAGAGTA
>WP_011108168.1_b_theta
CCTTAATAAACAGACGAAGAGTA
>WP_172461666.1_b_theta
AAACATATCGGTTAATGATATTTATTCGCTTATTAATCCGTTATTAACCCATCACTAATAAGTTGTTTAACCTTGCTACATTATGTTTGCAGCGATGTTATTTACTTATTGTTTTATTTAAAAACCTAATACGTA
>WP_007667366.1_b_theta
ACGGTTAAAATCAATTTATTAATCAAGAAAAAACAACAAGCA
>traJ2_b_theta
GAAAAAGGAGAACTCCGGTTCTCCTGTAAATGGAAAGAAAACTATCCGATTTATTTAACGTTAAAATTTTAAAAGACA
>WP_007485384.1_b_theta
AACCCGTATTACATAAGTATCAAGATGATAACTTTTTAAAATTCAGTATT
>WP_007485354.1_b_theta
TTATTCTCTTCCAAAGTACGACCATGCAAAAAGCTATGAATCAGCGAGAAAGATGCAAATTCGTGGGACTTTTTTCGATTAAATTTTAGTCCCACGATTTATCCACCGATAATATGCTTGAAACGGCTATTTTCTGCTACCTGCCAAAAAAAAGAAATCCCCGATAATACTTGATTATCAGGGATTTGCTTGATTTTTCTTTTGTTTTCAGTGATCCAGCTGGGGCTTTCACCGTAGCGGTCTATATCTTTGATTTATAAGCTCTTGTAAACCTTAAAAAATTGTACCGTAACGAATTAGTAACGATTTATAGGAACGCTTGTTGTCAGTGAGAGGCATTCGTTTGTCAGCCGTTCTCTGATGCCGCAAAGATAGAGCATTACTTTTTATTTTCCAAACATTTGATTATTAAATCTTTAAAAAAA

>WP_008765276.1_b_theta
GATCCGAGCTTTTGTCTGCTTTCGGATTCAGATTAATTTAATAAAAGAGACGTAGAA
>WP_162303085.1_b_theta
TGCAAAGAACGCATATTTGAAAACGCAGCAGGTACTTGACAAACGCTTTTAAATGTTGTATATTTGTAGATCAATAGCATAAACCTATAAAAAACAGATGAAAGAAAAAAGTAAAATCATATTCCTAAATGGTATAATATAGCAACAAGTATGGACTATCTTCCTGAGAAACCTTAGGTGATAGAGTACCAGATATAGGCTAAAGGCTGCAGACTTTATGAAGTTTGCAGCCTTTTTTGTCTTAAAAACTTAATGATTTTATTGCGAATTTCATTTGGGGAGTTTATAATAAACGGTGGGGACGTTTATTATAAACGATCATGGAATTTATTATAAACGTATAGGCTGGTTTATATAAACTTGTTGCTATTTTTTATATAACTACAATTGTCTTTCTTTGATAATGATTGTTGTTGGTAATTATCGCAGACATACTGCGCGGTAATCTAAGTAAACAAAATCTTAGAACTGCTATAGGGGCTATAAACCGGGTGCCTGTTTAATTCACTCAAGTCTCTCTTTTTTCGAAAAAAGAGGAATTAAATCTGAGCAGTTTTGATTTTGGATACATATATTGTGGCAGTTTTGATATTCGTATTTTGGTTTTCCGATCTATTGGTTTTAGAGAAATACATGTTAAATGTAGTTCGAAAAATAGAACATTAAGAAGGTAGGAAACATGCAAACGATTTAGAAAAACTATGTATTTTTGTGTTATAACTTTCAAAACAGTATAAGAAGCTTTACTCATTTTAATGCAGAATTGACTTGTTTGCGAGTTTAAGCACAAATAGTTATTTTTATTATATTGACGAATTAATGCTTGATTTAGGATAAAACAAACAATTTATTGTTGGAATTCTTTAATTATTTATATAAGGGCAACTAAAGAAT

>WP_008762898.1_b_theta
ACTATAAAAACACATTTCGAA
>WP_008767524.1_b_theta
ATCGATGTTGTAACCTATCCACTTTTTAGGCCTCTTTGGTTGATATTAACTTGCTGTTTATTAGTGGTTTGTATGTTTGTATAATCCACTTATTATACCCTCATTGTCAAATCACTTCAGTTTCTTTATACTTTTGACCTGTGCTTCTGAAATAGTGGCGAAAGTCCTGTCAAGCACATACCTTAATTATAAACTTTAAAATTAATACGC
>WP_011108760.1_b_theta
ATAGAGCCGGAGATAACATCACTTATAATGTATTTAAATAGAAAAAGAAACTAACGATT
>WP_011108761.1_b_theta
AGAACCTTAAAAGTATAAAGACA
>WP_008767539.1_b_theta
CAATAAACGAATAGAATAATAACTAATACATTATATCTA
>WP_011108771.1_b_theta
CAATAAACGAATAGAATAATAACTAATACATTATATCTA
>WP_011108772.1_b_theta
TCACAATATTCATCGATAAAAGCATCATAGATTA
>WP_011108773.1_b_theta
ACGGATAATGATAGTAACAGTTAGAAAAAAGAATTCGATTA
>WP_011108774.1_b_theta
CCAATGAGAATAGCAGATTTGTATTATTTTGTATCAATCTTGAATAATTCTATCCATATGAGAACAATCAATGCGTATATCTTTGCCATCGTAAATTCAACAAAATCACTTAATTTTTAATATATGAAAAAACATCTATCCAACCAGACAAGAAAAAGGA
>WP_032841405.1_b_theta
AGAACTGCCTTGCTTGTACTTTAGCCGGATATGTGCATATATTAGTCTCTTCCGGTGCATTAACTGTTATTCTTTATATCATG
>WP_011108783.1_b_theta
ATACCGTATACTAATAC

>WP_008767007.1_b_theta
TTAACTTTAAAAAGAACGATTCATCA
>WP_008767006.1_b_theta
CCAAGAGTTCATCCTTATATAAAAGAATGCCATTA
>susD_b_theta
CATTCATTAAATCAATTTATCA
>WP_011108938.1_b_theta
TAAGGAAAAGAAATAACAAAGATAGAGAATATACATTTTGGTAAAAAAGAGAGTGTTTTCGAGCACTAAGAACAATGACGAAGAGAAGGTTTTCAATAGGTATTTTTAAGAGAGAATGATAAGAGAAATTACAATTAACTTTAAAGACAAAAACA
>WP_008767003.1_b_theta
TAAGTTCACTTCAACTTATATTTATCAGAATACAGAAAAGAAATAAACTACTGTAAAACAAAACATTAACATTAATACAAACTTGTATTTTTATTCTAGACAACCCAAAGTCTCAAGATGGTTGCTATATTTGCAATATCCGGAACAGAGCGGAGTAAAAATATTAGTTAGCTTATAATTTAATACCATAAATAGAA
>WP_008767002.1_b_theta
AAGGTGCTTTTTTAAGATGTGACAAATATAATGATTATAGTAATAAAAACGAAAAATATCGCCATAACAATTTTCATTTTCACTTGTATTTTTCGACAGAGGCAATAACGATAAAAAGCTTATTATCAACAGCCTATTCAAAACAAGCATTTATATTTTCTTTCTAACTGCCATACGGCATACCCGATTCACCTACATTTGCATCATCGGTTCCCCCGAACAAATTTTAAAAATCTAGTTACCA
>WP_011108952.1_b_theta
ACGAGTATTCCTGATTAAAAGCTCTGTAAATCATTGATTTACGGAGCTTTTTGTGTGTTTCTACCGCCATATGCCCTGCTGATAAAGGGGCAATTACAACCTCACCGAATATATTTGATAATTATTGTCATTGAATTTATTTTTAATGTTAAAAAAG

>WP_011109009.1_b_theta
ATCAGTCATTTTTTTAGTTTGCCTCCCTCCTCCCTCCGGGCTGCATAGTTAATAGCTTCATTATCGAGTACGGGGAATCTTTCAAGCTTTGCTTTGTGTTGAAACGCACTCTCCTTTTATTAACATACGTTCTATTATACTCTACAAAAGTATTTCATTTATTT
>WP_011109012.1_b_theta
TGGACAATGGACAATTGACAATTAAAGACTGAGGTATTCAATCAATTTATTATTCGTAATTTAAAATTCATCAATAATA
>WP_008760823.1_b_theta
TGGACAATGGACAATTGACAATTAAAGACTGAGGTATTCAATCAATTTATTATTCGTAATTTAAAATTCATCAATAATA
>WP_011109013.1_b_theta
TGGACAATGGACAATTGACAATTAAAGACTGAGGTATTCAATCAATTTATTATTCGTAATTTAAAATTCATCAATAATA
>WP_008764161.1_b_theta
TATAGTATGTTTATCATGTTTCAAAATGCAAGTATAGCATATTTTCCAAAGAGAAGCATTTAAAAAAAGCTAATTCATTTTGCGTTTTCACAGAAAGCACATACATTTGCATCGTTGTTTATGCAATTATTGTTTAAGTCAACAAGTATAAAAAAATGAACTATCTCAATAAACAATTTATATGATAACGGGGTTATTTAAAAAAGAAGTATCACCTTATTAATATGTAAGATGCGA
>WP_008764197.1_b_theta
ATGCTATCCCCACTATCTTATTCTTTACTAATTGAGTTTTATGGATGTATTACCCGAATCCTTAGATCAGTAATTCCGGATAATTAAGTCTTTCGAGGTCCTTATTTATTCGTTCCACTATAGTTGGGCGCAAGAGGAACTATAGTGCTTCTTGCGTCCAACTATAGTGGAACGAAAGGGAAACTACAGTAGAACGAATGAATTTGGACTGAT

>WP_005818825.1_b_theta
ATTCCGATAAGGGAACAGATAATAAACAATAAGCAAAAGCGTGCAGCTCAATGAGTTGCACGCTTTTTGCATTTATGGGGTTCATGGTTGCTATGTGTTTTCTATGGGCTTTTTTATCTCTTATTTGCGACATGTTTGCGACACGTCTTATTTGGCGATAAGGTACAACTTAAATTGATATAAACCATTTAAAACGAGAAGAAT
>WP_004313935.1_b_theta
AGTGGCAAGTTTGTGTTTTAGTGTCACTAAAACCTATGCGGCAAGCGGATAGAACAGCCACTCTCAAAAAATCAATTTATGAACATAAAAGTTTAGATAAGT
>WP_008760377.1_b_theta
TAGTGAGACATACAGTATAGAGTCTTTTATAGCAATCGTGTTTTTTGTCTTTTTCCGAAAGAAAAATGCACCTGCCTTGTACCATATACCACATTTGGTATCGCACTATTATACCTCAAAACACGTTTTTTTCTTCTTTTATTGAATGATGACGGATTTTAAAATCATCCATCCAAACAACCTAAATTGTACAGAACCTATTCCATATCATATTACAATGTCATAACATTATGACACATAAAAAACTAGGATACAGGAGATAAACAGCTTTTTTAAACAAAAACAACATATGTCTCAACAAATTTGATATCAAAAAAATCATTTTTGAACAAAGTATGAACAACCTAAATTAAAGAATCTAATTAATGAGGTTACATTTGCAACAGTAAAATAACACTTTTAATAATTATAAAACCTAATAAAGTA
>WP_008760381.1_b_theta
TGTTATTGTGGTTTATTAGTTTTCTGCTGCCAAAAGTAGGCAAAATTTTTAATATTCGTGTAATTTTGTCAACAAAGAATCGAATTCGATTGTTATTTTAGTTAAGTTTGCACGGCATAGCAAATACGACAATAAAAAAATGTGAT
>WP_165450812.1_b_thet

In [4]:
gene_list = ['recA', 'dinB', 'recN', 'lexA', 'nusA', 'uvrA', 'uvrB', 'ruvA', 'polA', 'dnaK']

In [5]:
gene_upstream_extract('/home/nastya/Bacteroides_uniformis_annotations/e_coli.fasta', 'e_coli.gff', 'intergenic/e_coli.fasta_intergenic.bed', gene_list)

>dnaK_e_coli
TAAGGATTCTCTTAGTGGGAAGAGGTAGGGGGATGAATACCCACTAGTTTACTGCTGATAAAGAGAAGATTCAGGCACGTAATCTTTTCTTTTTATTACAATTTTTTGATGAATGCCTTGGCTGCGATTCATTCTTTATATGAATAAAATTGCTGTCAATTTTACGTCTTGTCCTGCCATATCGCGAAATTTCTGCGCAAAAGCACAAAAAATTTTTGCATCTCCCCCTTGATGACGTGGTTTACGACCCCATTTAGTAGTCAACCGCAGTGAGTGAGTCTGCAAAAAAATGAAATTGGGCAGTTGAAACCAGACGTTTCGCCCCTATTACAGACTCACAACCACATGATGACCGAATATATAGTGGAGACGTTTAG
>dinB_e_coli
AATGCTGAATCTTTACGCATTTCTCAAACCCTGAAATCACTGTATACTTTACCAGTGTTGAGAGGTGAGCA
>uvrB_e_coli
GCCATTCTGTATTTGGTTAAATTGCGAGCGAGATCGCGTCTTCGATTGACTGCAATTTAACCAATTAAATTCTAAAATAATCACGAAAAAAATTTTACTTCCGCCTCATGCGGCGAATGTGGGAATTGCCCAGGCGGCGGGGGATAGGGGCTGGAGACAGTTATCCACTATTCCTGTGGATAACCATGTGTATTAGAGTTAGAAAACACGAGGCAAGCGAGAGAATACGCGGCTTGCACGCGAATTGGCGTTAAAGACGGCTCAAAGAAATATCTTTTATTTTTTAACTGGTTAGATAAATGCAATGGCAGTCACTGAACAGGCATCTCTTGCCATAAAACTGTCATCACTCATCTTGACAAATGTTAAAAAAGCCGTTGCTTTGGGGATAACCCGGTAAGGCCGGAGTTTTATCTCGCCACAGAGTAAATTTTGCTCATGATTGACAGCGGAGTTTACGCTGTATCAGAAATATTATGGTGATGAACTGTTTTTTTATCCAGTATAATTT

In [6]:
gene_upstream_extract('/home/nastya/Bacteroides_uniformis_annotations/E_ludwigii.fasta', 'E_ludwigii.gff', 'intergenic/E_ludwigii.fasta_intergenic.bed', gene_list)

>dinB_E_ludwigii
GTGAGTCCGATGTTTCCCCTCTTCCAGGAGGAGAGGGGAAACACACTTCTCAATCCTCATTTCTTTACTGTATACTTATCCAGTAATGAGTGAGGGCTTACT
>uvrB_E_ludwigii
TCTTTCTCTCCTTAAAAATCTGTGTGGATGAGGCTCATTGTGGGATGTGCAAAACTCCTGCCAGAAACCCTAAATTGATTAAAAAAGAGGCTTTTTTAGGAATTTGTCCTGGATAAGCGGTGTGTGATTACCCCCTTTGCACATCCGCGGTGCCGGACGTGATGAGTTGTGGTGCAGGGTTAAAATGGTGGAAATGTGAGCAAGATCGCATCGGTTGGTTATGAATGACACAACTGTTTATCAGCAAAAAATAGCTATAAAAAATTTTTGCTGCCGTTTCGTGGGAAAAGTCGGGAATTTGCTGAGCGGCGGTCTGCACGGGCTGGAAGCAGTTATCCACTATTCCTGTGGATAACCATGTGCATTAGAGTTAGAAAACACGCGATAAGCGAGAGAAGACGCGGGTTACGACCAAATTGGCGCGAAACGCGGCTTTAGAAAATATCGCTTAATATTTAATTGGTTAGATAAAAGCAATTGCCGTAATGAAAGTGTCACCTGTTGCCATAAAACTATCATTACCTGGCTTGACAAATGTTAAAAAAGAAAAAGTTCTGGGGATAACCCGTCCTGACTGGTGTTTTATGTTGTCACGGTCCAAATTTTGACCACTATCGCGCTAATCACCCTGCCGGTACTCTAAATATCCTGACATGCTACTGGTTTTTCATCCAGTGTTTTTTACTGGCATTCCTGGCAGCAACGAGTAAAATTACTCACCTGCCGCTTATAACGTCATCAGGTTGTCGCCC
>ruvA_E_ludwigii
TGACAAATCAGGCTGGATGTTTATCCAGCCTTTTTTTATTATGTCGGCAGTTAATTTCTTCCAGAACGCAGGAGCGTCACG
>polA_E_lud