### This notebook aims to retrieve the sequences of R. pomeroyi genes from the BioCyc API and produce an Excel sheet compiling the composition of A, U, G, C nucleotides in each mRNA

In [1]:
import pandas as pd
import numpy as np

# Create functions used in this notebook

In [2]:
def calculate_nucleotide_composition(sequence):
    """Count the A, U, G and C nucleotides in a nucleotide sequence.

    The sequence may be written in the DNA alphabet (with 'T') or the RNA
    alphabet (with 'U'); both letters are reported under the 'U' key, so a
    coding DNA sequence and its transcript give the same composition.
    Matching is case-insensitive and any other character is ignored.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence. An empty string yields all-zero counts.

    Returns
    -------
    dict
        Counts keyed by 'A', 'U', 'G', 'C' (in that order).
    """
    sequence = sequence.upper()  # Convert to uppercase for consistency

    return {
        'A': sequence.count('A'),
        # The input is usually a DNA coding sequence, so thymine stands in
        # for uracil; also accept 'U' so genuine RNA strings are not
        # silently reported as containing no uracil.
        'U': sequence.count('T') + sequence.count('U'),
        'G': sequence.count('G'),
        'C': sequence.count('C'),
    }

def calculate_mass(composition):
    """Return the mass (Da) of an RNA given its nucleotide counts.

    Parameters
    ----------
    composition : dict
        Nucleotide counts keyed by 'A', 'U', 'G' and/or 'C'.

    Returns
    -------
    float
        Sum of count * per-nucleotide mass over every key in `composition`,
        plus a fixed 177.8 Da term for the 5' triphosphate.
    """
    # Per-nucleotide average masses in Da, as used throughout this notebook.
    nucleotide_masses = {'A': 329.2, 'U': 306.2, 'G': 345.2, 'C': 305.2}
    five_prime_triphosphate = 177.8  # consider a 5' triphosphate

    chain_mass = sum(
        count * nucleotide_masses[base] for base, count in composition.items()
    )
    return chain_mass + five_prime_triphosphate

def verify_rna_length(sequence):
    """Check whether the sequence length is a whole number of codons.

    Returns True when the length is divisible by 3; otherwise returns a
    string stating how many nucleotides are needed to reach the next
    multiple of 3.
    """
    remainder = len(sequence.upper()) % 3
    if remainder == 0:
        return True
    if remainder == 2:
        return "Missing 1 nucleotide"
    return "Missing 2 nucleotides"

# Load gene sequence

In [3]:
df_ECOCYC_all = pd.read_csv('All-genes-of-R.-pomeroyi-DSS-3.txt', delimiter='\t', encoding='windows-1252')
df_MORAN_all = pd.read_csv('RPOM_genome_Moran-Lab.csv')
df_RNA   = pd.read_csv('rna-abs.csv')

In [4]:
# Create a column with gene names matching those used by Moran-Lab.
# We can generally retrieve the name from the Mini review column (SPOXXXX).
df_ECOCYC_all['SPO_ID'] = None

# Resolve the SPO ID row by row. Rows without a mini review keep SPO_ID = None.
# Priority: (1) the gene name itself, (2) the text after the last 'Name ' in
# the mini review, (3) the REF_tigr accession, (4) 'SPO_' + gene name.
for row_label, gene_name, mini_review in zip(
    df_ECOCYC_all.index,
    df_ECOCYC_all['Gene Name'],
    df_ECOCYC_all['Mini review - summary of genes/proteins'],
):
    if pd.isna(mini_review):
        continue

    # 'SPO' is a substring of 'SPOA', so one check covers both prefixes.
    mini_review_id = mini_review.split('Name ')[-1]
    if 'SPO' in gene_name:
        spo_id = gene_name
    elif 'SPO' in mini_review_id:
        spo_id = mini_review_id
    else:
        # Explicit length check instead of a bare `except:` so that unrelated
        # errors are no longer silently swallowed.
        tigr_parts = mini_review.split('REF_tigr:')
        if len(tigr_parts) > 1:
            spo_id = tigr_parts[1].split(',')[0]
        else:
            spo_id = 'SPO_' + gene_name  # naming convention following Moran-Lab SPO_ID, verified

    df_ECOCYC_all.at[row_label, 'SPO_ID'] = spo_id
df_ECOCYC_all.head()

Unnamed: 0,Gene Name,Product,Mini review - summary of genes/proteins,Sequence - DNA sequence,SPO_ID
0,SPO2282,ribosomal protein S18,"(*protein_id: REF_tigr:SPO2282,YP_167506.1) *N...",ATGGCCGCTAAACCGTTTTTCCGCCGTCGCAAGGTGTGCCCCTTCT...,SPO2282
1,G1RHL-3012,"branched-chain amino acid ABC transporter, per...","(*protein_id: REF_tigr:SPOA0098,YP_164928.1) *...",ATGAGCATCGTCATCGACATTCTGATCAACGGGCTGTTTCTGGGCG...,SPOA0098
2,SPO3583,transcriptional regulator PecS,"(*protein_id: REF_tigr:SPO3583,YP_168778.1) *N...",ATGGACCATGTAGACGTTATCATCCGGCAATGGGCGGCCGAGCGCC...,SPO3583
3,G1RHL-354,"diguanylate cyclase, putative","(*protein_id: REF_tigr:SPO0533,YP_165795.1) *N...",ATGCGGATAAAACCGTCCCTGACCCGAATTCGAAACGCTGTCGTGC...,SPO0533
4,G1RHL-2983,TetR family transcriptional regulator,"(*protein_id: REF_tigr:SPOA0067,YP_164898.1) *...",TTGGAAAGTTCACGCCGCAGCTTTACCCGCGAATCCGCCGAACATC...,SPOA0067


# Create a Dataframe with all genes in Moran Lab (because these are the genes in the transcriptome) and their sequences

In [5]:
# Work on a copy of the three Moran-Lab columns needed downstream.
df_MORAN = df_MORAN_all[['SPO_ID (ACCESSION)', 'Gene_ID (ACCESSION)', 'dna_sequence']].copy()

# Build the SPO_ID lookups once instead of re-filtering the full frames for
# every row. Only the first row per ID is kept, matching `.values[0]`.
_ecocyc_first = df_ECOCYC_all.dropna(subset=['SPO_ID']).drop_duplicates(subset='SPO_ID', keep='first')
_gene_name_by_spo = dict(zip(_ecocyc_first['SPO_ID'], _ecocyc_first['Gene Name']))
_ecocyc_seq_by_spo = dict(zip(_ecocyc_first['SPO_ID'], _ecocyc_first['Sequence - DNA sequence']))

_moran_first = df_MORAN_all.drop_duplicates(subset='SPO_ID (ACCESSION)', keep='first')
_moran_seq_by_spo = dict(zip(_moran_first['SPO_ID (ACCESSION)'], _moran_first['dna_sequence']))

# EcoCyc gene name for each Moran-Lab SPO ID (None when EcoCyc has no entry).
df_MORAN['Gene Name'] = df_MORAN['SPO_ID (ACCESSION)'].apply(_gene_name_by_spo.get)

# Prefer the EcoCyc sequence; fall back to the Moran-Lab sequence only when
# the SPO ID is absent from EcoCyc.
df_MORAN['Corrected Sequence'] = df_MORAN['SPO_ID (ACCESSION)'].apply(
    lambda spo_id: _ecocyc_seq_by_spo[spo_id] if spo_id in _ecocyc_seq_by_spo
    else _moran_seq_by_spo[spo_id]
)

# Create New Dataframe Analyzing % AUGC in each mRNA

In [6]:
# Keep only the SPO ID, gene name and DNA sequence columns; .copy() so that
# columns added to df_ECOCYC later do not write into df_ECOCYC_all.
df_ECOCYC = df_ECOCYC_all[['SPO_ID', 'Gene Name','Sequence - DNA sequence']].copy()

In [7]:
# Add composition, mass and length-validity columns to both frames, each
# computed from that frame's own sequence column.
for frame, sequence_column in (
    (df_ECOCYC, 'Sequence - DNA sequence'),
    (df_MORAN, 'Corrected Sequence'),
):
    frame['Composition'] = frame[sequence_column].apply(calculate_nucleotide_composition)
    frame['mass (Da)'] = frame['Composition'].apply(calculate_mass)
    frame['is_valid'] = frame[sequence_column].apply(verify_rna_length)

In [8]:
# show only the invalid sequences
# Rows of df_ECOCYC whose length check did not return True (i.e. it returned
# one of the "Missing N nucleotide(s)" strings).
invalid_sequences = df_ECOCYC[df_ECOCYC['is_valid'] != True]
# NOTE(review): the next expression is evaluated and discarded. It is not the
# last expression of the cell, so nothing is displayed or stored.
df_MORAN[df_MORAN['is_valid'] != True]
# Index df_MORAN by SPO ID so it can later be joined with df_RNA on the index.
df_MORAN.index = df_MORAN['SPO_ID (ACCESSION)']
df_MORAN

Unnamed: 0_level_0,SPO_ID (ACCESSION),Gene_ID (ACCESSION),dna_sequence,Gene Name,Corrected Sequence,Composition,mass (Da),is_valid
SPO_ID (ACCESSION),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SPO0001,SPO0001,gidA,GTGAAACATTCGGATTTCGATATTGTCGTGATCGGGGCCGGACATG...,SPO0001,GTGAAACATTCGGATTTCGATATTGTCGTGATCGGGGCCGGACATG...,"{'A': 367, 'U': 393, 'G': 587, 'C': 522}",603277.6,True
SPO0002,SPO0002,gidB,ATGATGGTTCCCGATGCGAACACGCTCAATGTTTCACGTGAAACAT...,SPO0002,ATGATGGTTCCCGATGCGAACACGCTCAATGTTTCACGTGAAACAT...,"{'A': 136, 'U': 144, 'G': 174, 'C': 161}",198243.8,True
SPO0003,SPO0003,parA,GTGTCTGATCTTTCCCGTCCTGCCGGACCCCGGATCATTGCGGTCG...,SPO0003,GTGTCTGATCTTTCCCGTCCTGCCGGACCCCGGATCATTGCGGTCG...,"{'A': 169, 'U': 171, 'G': 239, 'C': 225}",259345.6,True
SPO0004,SPO0004,parB,ATGGTTTCGAACAAGCCCCGGGGATTGGGACGCGGATTGTCCGCGT...,SPO0004,ATGGTTTCGAACAAGCCCCGGGGATTGGGACGCGGATTGTCCGCGT...,"{'A': 194, 'U': 154, 'G': 287, 'C': 256}",288401.0,True
SPO0005,SPO0005,SPO0005,TGCAGTTTATCTGGGCAGCTCTTGGACTGGTCTGCGTTGCCCTCGC...,G1RHL-1,ATGCAGTTTATCTGGGCAGCTCTTGGACTGGTCTGCGTTGCCCTCG...,"{'A': 45, 'U': 95, 'G': 101, 'C': 113}",113433.6,True
...,...,...,...,...,...,...,...,...
SPOA0442,SPOA0442,SPOA0442,ATGCTCGACTCTGTGGGGCGTTTTTGCTATCATGCCGCCACTTTTC...,G1RHL-3294,ATGCTCGACTCTGTGGGGCGTTTTTGCTATCATGCCGCCACTTTTC...,"{'A': 14, 'U': 34, 'G': 26, 'C': 22}",30887.0,True
SPOA0443,SPOA0443,SPOA0443,ATGCCAGAAGACGGAATCGGTTTTGAATTCTTCAGAGACGATCAAA...,G1RHL-3295,ATGCCAGAAGACGGAATCGGTTTTGAATTCTTCAGAGACGATCAAA...,"{'A': 66, 'U': 45, 'G': 78, 'C': 75}",85499.6,True
SPOA0444,SPOA0444,fabA,TGGCCCAATACCCGAGCAGCTTTGACAAGGAAGACCTGCTGAAATG...,SPOA0444,ATGGCCCAATACCCGAGCAGCTTTGACAAGGAAGACCTGCTGAAAT...,"{'A': 96, 'U': 93, 'G': 172, 'C': 149}",165106.8,True
SPOA0445,SPOA0445,SPOA0445,ATGACGCCAAATTCTCAGGAAATCGCCACCGATTGGCTGGTCGATG...,G1RHL-3296,ATGACGCCAAATTCTCAGGAAATCGCCACCGATTGGCTGGTCGATG...,"{'A': 71, 'U': 72, 'G': 140, 'C': 137}",135737.8,True


# Create Dataframe for biomass calculation

In [9]:
# Index the abundance table by SPO ID and keep only the two mean-abundance
# columns (acetate and glucose conditions).
df_RNA = (
    df_RNA
    .set_index('SPO_ID (ACCESSION)')
    [['DSS3_ac_mean_abund', 'DSS3_glc_mean_abund']]
    .copy()
)

In [10]:
# Normalise each condition's abundances so that every column sums to 1.
for abundance_column, relative_column in (
    ('DSS3_ac_mean_abund', 'DSS3_ac_mean_rel_abund'),
    ('DSS3_glc_mean_abund', 'DSS3_glc_mean_rel_abund'),
):
    column_total = df_RNA[abundance_column].sum()
    df_RNA[relative_column] = df_RNA[abundance_column] / column_total
df_RNA

Unnamed: 0_level_0,DSS3_ac_mean_abund,DSS3_glc_mean_abund,DSS3_ac_mean_rel_abund,DSS3_glc_mean_rel_abund
SPO_ID (ACCESSION),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SPO1061,2.251205e+06,6.402749e+07,5.593185e-06,1.061941e-04
SPO3842,2.725419e+07,2.014081e+07,6.771382e-05,3.340496e-05
SPO3778,2.708394e+08,1.872681e+07,6.729082e-04,3.105972e-05
SPO1059,2.938362e+06,1.655547e+08,7.300447e-06,2.745841e-04
SPO3774,1.463127e+09,2.449963e+07,3.635182e-03,4.063437e-05
...,...,...,...,...
SPO0614,1.601365e+08,1.654388e+08,3.978639e-04,2.743919e-04
SPO2306,1.511623e+08,1.197327e+08,3.755671e-04,1.985851e-04
SPO_Sp16SC,3.706996e+04,6.360184e+04,9.210140e-08,1.054881e-07
SPO1999,5.201761e+06,5.145175e+06,1.292393e-05,8.533634e-06


In [25]:
# TODO: calculate the mass of each RNA in the dataframe
# TODO: need weight of A, U, G, C in the biomass
def composition_mass(relative_abundance, composition):
    """Return the abundance-weighted mass contributed by each nucleotide.

    Parameters
    ----------
    relative_abundance : float
        Weight applied to every nucleotide of this RNA.
    composition : dict
        Nucleotide counts keyed by 'A', 'U', 'G' and/or 'C'.

    Returns
    -------
    dict
        For every key in `composition`:
        per-nucleotide mass * count * relative_abundance.
    """
    # Per-nucleotide average masses in Da (same values as calculate_mass).
    nucleotide_masses = {'A': 329.2, 'U': 306.2, 'G': 345.2, 'C': 305.2}

    return {
        base: nucleotide_masses[base] * count * relative_abundance
        for base, count in composition.items()
    }

def calculate_biomass_per_composition(rel_mass_per_nucleotide):
    """Sum per-nucleotide masses over a collection of per-RNA dictionaries.

    Parameters
    ----------
    rel_mass_per_nucleotide : iterable of dict
        Each dict maps 'A', 'U', 'G' and/or 'C' to a mass contribution.

    Returns
    -------
    dict
        Accumulated mass per nucleotide, keyed 'A', 'U', 'G', 'C'.
    """
    totals = dict.fromkeys(('A', 'U', 'G', 'C'), 0)
    for per_rna_masses in rel_mass_per_nucleotide:
        for base in per_rna_masses:
            totals[base] = totals[base] + per_rna_masses[base]
    return totals


In [12]:
# Join df_RNA and df_Moran by index
# Outer join on the SPO ID index: keeps IDs present in either frame, so rows
# found in only one of them get NaN in the other frame's columns.
# NOTE(review): re-running this cell without re-running the cells above it
# presumably fails, because df_RNA would already contain these columns and no
# suffixes are given. Confirm against the pandas join documentation.
df_RNA = df_RNA.join(df_MORAN[['Composition', 'mass (Da)', 'is_valid']], how='outer')

In [13]:
# Relative-abundance column for each growth condition.
relative_abundance_columns = {
    'acetate': 'DSS3_ac_mean_rel_abund',
    'glucose': 'DSS3_glc_mean_rel_abund',
}

# Abundance-weighted mass of each whole RNA.
for condition, abundance_column in relative_abundance_columns.items():
    df_RNA[f'rel mass RNA {condition}'] = df_RNA[abundance_column] * df_RNA['mass (Da)']

# Abundance-weighted mass split by nucleotide (one dict per row).
for condition, abundance_column in relative_abundance_columns.items():
    df_RNA[f'rel mass per nucleotide {condition}'] = df_RNA.apply(
        lambda row, column=abundance_column: composition_mass(row[column], row['Composition']),
        axis=1,
    )
df_RNA

Unnamed: 0_level_0,DSS3_ac_mean_abund,DSS3_glc_mean_abund,DSS3_ac_mean_rel_abund,DSS3_glc_mean_rel_abund,Composition,mass (Da),is_valid,rel mass RNA acetate,rel mass RNA glucose,rel mass per nucleotide acetate,rel mass per nucleotide glucose
SPO_ID (ACCESSION),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SPO0001,2.619493e+07,3.707165e+07,0.000065,0.000061,"{'A': 367, 'U': 393, 'G': 587, 'C': 522}",603277.6,True,39.262547,37.093083,"{'A': 7.862979760773626, 'U': 7.83175339010524...","{'A': 7.428508438201976, 'U': 7.39900749008028..."
SPO0002,1.769851e+07,2.372913e+07,0.000044,0.000039,"{'A': 136, 'U': 144, 'G': 174, 'C': 161}",198243.8,True,8.717266,7.802169,"{'A': 1.9686994676709026, 'U': 1.9388685558600...","{'A': 1.762034696692197, 'U': 1.73533529309711..."
SPO0003,4.731299e+07,6.088552e+07,0.000118,0.000101,"{'A': 169, 'U': 171, 'G': 239, 'C': 225}",259345.6,True,30.486204,26.189472,"{'A': 6.539898247486959, 'U': 6.15496739842808...","{'A': 5.618163739289428, 'U': 5.28748511762318..."
SPO0004,9.820402e+07,1.154940e+08,0.000244,0.000192,"{'A': 194, 'U': 154, 'G': 287, 'C': 256}",288401.0,True,70.367172,55.244639,"{'A': 15.582419498655666, 'U': 11.505334315228...","{'A': 12.23361843652406, 'U': 9.03273525714642..."
SPO0005,5.729765e+07,1.114025e+08,0.000142,0.000185,"{'A': 45, 'U': 95, 'G': 101, 'C': 113}",113433.6,True,16.148142,20.959000,"{'A': 2.108886394537551, 'U': 4.14104200963297...","{'A': 2.737166348526108, 'U': 5.37474226490319..."
...,...,...,...,...,...,...,...,...,...,...,...
SPO_tRNA-Thr-3,8.724674e+08,9.167270e+08,0.002168,0.001520,"{'A': 13, 'U': 17, 'G': 26, 'C': 20}",24742.0,Missing 2 nucleotides,53.632503,37.619124,"{'A': 9.27676265296583, 'U': 11.28359199779146...","{'A': 6.506943807038927, 'U': 7.91458203877942..."
SPO_tRNA-Trp-2,1.552103e+07,3.918915e+07,0.000039,0.000065,"{'A': 15, 'U': 15, 'G': 24, 'C': 22}",24708.0,Missing 2 nucleotides,0.952801,1.605969,"{'A': 0.19042141206117558, 'U': 0.177117364438...","{'A': 0.32095989134805625, 'U': 0.298535597602..."
SPO_tRNA-Val-2,3.705814e+07,3.014255e+07,0.000092,0.000050,"{'A': 16, 'U': 20, 'G': 21, 'C': 19}",24617.0,Missing 2 nucleotides,2.266537,1.230691,"{'A': 0.484961759605797, 'U': 0.56384906892198...","{'A': 0.2633259812232391, 'U': 0.3061604474884..."
SPO_tRNA-Val-3,3.366227e+08,3.650615e+08,0.000836,0.000605,"{'A': 19, 'U': 20, 'G': 18, 'C': 18}",24263.8,True,20.292999,14.691249,"{'A': 5.231194094010626, 'U': 5.12179967892196...","{'A': 3.787157093612742, 'U': 3.70796029310040..."


In [15]:
# Output the dataframe to a CSV file ('utf-8-sig' writes a BOM, which helps
# spreadsheet programs such as Excel detect the encoding).
# Only the relative abundances, composition, relative masses and validity flag
# are exported; the SPO ID index is written as the first column.
df_out = df_RNA[['DSS3_ac_mean_rel_abund', 'DSS3_glc_mean_rel_abund','Composition','rel mass RNA acetate','rel mass RNA glucose', 'rel mass per nucleotide acetate', 'rel mass per nucleotide glucose','is_valid']].copy()
df_out.to_csv('glc_ac_RNA_mass.csv', index=True, encoding='utf-8-sig')

In [28]:
# Total abundance-weighted mass per nucleotide over every RNA, per condition.
acetate_masses_all = df_RNA['rel mass per nucleotide acetate'].values
glucose_masses_all = df_RNA['rel mass per nucleotide glucose'].values

rel_biomass_acetate = calculate_biomass_per_composition(acetate_masses_all)
rel_biomass_glucose = calculate_biomass_per_composition(glucose_masses_all)

print(
    'The relative biomass of each nucleotide in R. pomeroyi DSS-3 on acetate is:',
    rel_biomass_acetate,
)
print(
    'The relative biomass of each nucleotide in R. pomeroyi DSS-3 on glucose is:',
    rel_biomass_glucose,
)

The relative biomass of each nucleotide in R. pomeroyi DSS-3 on acetate is: {'A': 47165.67070060239, 'U': 41158.74435912194, 'G': 85027.76816331533, 'C': 71026.30844197425}
The relative biomass of each nucleotide in R. pomeroyi DSS-3 on glucose is: {'A': 42836.204408709615, 'U': 36825.57407874009, 'G': 77626.86999287529, 'C': 64395.00021819996}


In [33]:
# Calculate the biomass per composition for rRNA
rRNA_IDs = [
    'SPO_Sp5SD', 'SPO_Sp5SF', 'SPO_Sp5SE',
    'SPO_Sp16SA', 'SPO_Sp16SC', 'SPO_Sp16SB',
    'SPO_Sp23SB', 'SPO_Sp23SA', 'SPO_Sp23SC',
]
# Keep only the rows whose index label is one of the rRNA IDs.
is_rRNA = df_RNA.index.isin(rRNA_IDs)
df_rRNA = df_RNA.loc[is_rRNA].copy()

rel_biomass_acetate_rRNA = calculate_biomass_per_composition(
    df_rRNA['rel mass per nucleotide acetate'].values
)
rel_biomass_glucose_rRNA = calculate_biomass_per_composition(
    df_rRNA['rel mass per nucleotide glucose'].values
)
print(
    'The relative rRNA biomass of each nucleotide in R. pomeroyi DSS-3 on acetate is:',
    rel_biomass_acetate_rRNA,
)
print(
    'The relative rRNA biomass of each nucleotide in R. pomeroyi DSS-3 on glucose is:',
    rel_biomass_glucose_rRNA,
)

The relative rRNA biomass of each nucleotide in R. pomeroyi DSS-3 on acetate is: {'A': 2518.29475008572, 'U': 1818.4277443706992, 'G': 3269.0254224366827, 'C': 2317.168363918941}
The relative rRNA biomass of each nucleotide in R. pomeroyi DSS-3 on glucose is: {'A': 2994.508568902258, 'U': 2164.1113314538734, 'G': 3895.2464138265104, 'C': 2784.5781988480894}


In [37]:
# Everything that is not in rRNA_IDs is treated as tRNA/mRNA.
is_not_rRNA = ~df_RNA.index.isin(rRNA_IDs)
df_tmRNA = df_RNA.loc[is_not_rRNA].copy()

rel_biomass_acetate_tmRNA = calculate_biomass_per_composition(
    df_tmRNA['rel mass per nucleotide acetate'].values
)
rel_biomass_glucose_tmRNA = calculate_biomass_per_composition(
    df_tmRNA['rel mass per nucleotide glucose'].values
)
print(
    'The relative tRNA and mRNA biomass of each nucleotide in R. pomeroyi DSS-3 on acetate is:',
    rel_biomass_acetate_tmRNA,
)
print(
    'The relative tRNA and mRNA biomass of each nucleotide in R. pomeroyi DSS-3 on glucose is:',
    rel_biomass_glucose_tmRNA,
)

The relative tRNA and mRNA biomass of each nucleotide in R. pomeroyi DSS-3 on acetate is: {'A': 44647.375950516674, 'U': 39340.31661475123, 'G': 81758.74274087865, 'C': 68709.1400780553}
The relative tRNA and mRNA biomass of each nucleotide in R. pomeroyi DSS-3 on glucose is: {'A': 39841.69583980736, 'U': 34661.462747286205, 'G': 73731.62357904877, 'C': 61610.42201935188}


# % difference between Moran Lab and ECOCYC Sequence

In [None]:
# Index both frames by SPO ID so their sequences can be aligned row by row.
df_ECOCYC.index = df_ECOCYC['SPO_ID']
df_MORAN.index = df_MORAN['SPO_ID (ACCESSION)']

# Outer join keeps IDs that appear in only one of the two sources.
ecocyc_sequences = df_ECOCYC[['Sequence - DNA sequence']]
moran_sequences = df_MORAN[['dna_sequence']]
df_diff = ecocyc_sequences.join(
    moran_sequences,
    how='outer',
    lsuffix='_ECOCYC',
    rsuffix='_MORAN',
)
df_diff.columns = ['ECOCYC Sequence', 'MORAN Sequence']

# Flag rows where both sources give exactly the same sequence.
df_diff['is_same'] = df_diff['ECOCYC Sequence'] == df_diff['MORAN Sequence']
df_diff

In [None]:
def _describe_sequence_difference(ecocyc_seq, moran_seq):
    """Classify how the Moran-Lab sequence differs from the EcoCyc sequence.

    Returns
    -------
    None
        When both sequences are present and identical.
    'missing first nucleotides of Ecocyc sequence'
        When the Moran sequence equals the EcoCyc sequence without its first
        nucleotide.
    'missing last nucleotides of Ecocyc sequence'
        When the Moran sequence equals the EcoCyc sequence without its last
        nucleotide.
    'Sequence does not match'
        In every other case, including a sequence missing from either source.
    """
    if pd.isna(ecocyc_seq) or pd.isna(moran_seq):
        return 'Sequence does not match'
    # Identical sequences were previously mislabelled as not matching because
    # the fallback `else` was attached to the length check.
    if ecocyc_seq == moran_seq:
        return None
    if len(ecocyc_seq) - len(moran_seq) == 1:
        if ecocyc_seq[1:] == moran_seq:
            return 'missing first nucleotides of Ecocyc sequence'
        if ecocyc_seq[:-1] == moran_seq:
            return 'missing last nucleotides of Ecocyc sequence'
    # Also covers a one-nucleotide length difference where neither trimmed
    # form matches (previously left without any comment).
    return 'Sequence does not match'


# Build the column positionally so duplicated index labels cannot break the
# row lookups.
df_diff['Comment'] = [
    _describe_sequence_difference(ecocyc_seq, moran_seq)
    for ecocyc_seq, moran_seq in zip(df_diff['ECOCYC Sequence'], df_diff['MORAN Sequence'])
]
df_diff

In [None]:
# Show only the rows labelled 'Sequence does not match'.
df_diff[df_diff['Comment'] == 'Sequence does not match']

In [None]:
# Save the comparison table (SPO ID index included) as a CSV with a UTF-8 BOM.
df_diff.to_csv("Moran Ecocyc Sequence Comparison.csv", index=True, encoding='utf-8-sig')

In [None]:
t1 = 'GTGAAACATTCGGATTTCGATATTGTCGTGATCGGGGCCGGACATGCCGGCGCCGAGGCTGCACATGCTGCGGCACGCATGGGAATGCGTACTGCCTTGGTTTCCCTGTCCGAACGCGACATTGGCGTGATGTCCTGTAACCCGGCTATTGGCGGTTTGGGTAAGGGGCATCTGGTTCGCGAGATCGACGCACTTGACGGGGTCATGGGGCGGGTGGCCGACAAGGCCGGGATTCAATTTCGTCTGCTCAATCGGCGCAAGGGTCCTGCGGTCCAGGGGCCGCGCGCTCAGGCCGATCGCAAGCTCTATCGCCTGGCGATGCAGGAAGAAATGCGCAATCGCCCCGGACTGACCATCGTCGAGGGCGAGGTTACTGATTTTCGAATGCAAGGTGATCGCGTTGCCGGCGTCGTTCTGGCCGATGGGTCCGAGATTGCATCGCAAGCCGTGATCTTGACCTCGGGCACGTTCCTGCGCGGGATCATTCATATTGGCGATGTTTCGCGCCCCGGTGGGCGGATGGGTGACCGACCTTCAGTTCCGCTTGCCGAAAGGCTGGATGGATTTGCGTTACCGATGGGGCGACTAAAGACCGGAACGCCGCCGCGACTGGACGGACGCACCATAGACTGGTCTATCCTCGAGCGTCAGGACGGGGATGACGATCCGGTGCTGTTCTCGTTCCTGTCAAAAGGGGCCTATGCGCGTCAGATCGCCTGCGGTATCACGCATACCAACGCCCAGACGCATGAGATCATTCGCAAAAACCTGTCCCGTTCCGCGATGTATGGCGGACATATCGAAGGCGTCGGCCCTCGTTACTGCCCCTCGATCGAGGACAAAATCGTGCGTTTCGCTGACAAGGATTCGCATCAGATCTTTCTTGAGCCCGAGGGGCTGGAAGATCACACTGTCTATCCGAACGGTATCTCGACCTCGTTGCCGGTCGACGTGCAAGAGGACTATGTCCGTTCGATCCGGGGGCTGGAACAGGTTGAGATCTTGCAGCCCGGTTACGCGATCGAATACGACTATGTTGATCCCCGCGCGCTCACGTCGCAACTGTCTCTGCCGAATGTTCCGGGCTTGTACCTAGCCGGTCAGATCAATGGAACAACCGGGTATGAAGAGGCTGCGGCGCAGGGAATGGTTGCCGGGTTGAACGCAGCCACCGCGATTTTGGGTCACGAACCGGTCCCCTTCAGTCGTGCCAACAGCTATATCGGTGTGATGATCGACGATCTGACCACACGCGGTGTGGCAGAGCCCTATCGCATGTTCACCTCGCGGGCCGAATTCCGCCTGTCCTTGCGTGCGGATAACGCGGATCAGCGGCTGACGCCCTTGGGATTGGAATTGGGTTGTGTCGGTGACGAACGGCGCGACGTCTTTGCGCGTAAGGCAGAGAAACTGGCGACAGCCTCGGCGCTGCTCGATCAGAGCAGCTTTAGCCCCAAAGAGATTGCCACAGCTGGTATCACCATCAGTCAGGACGGCAATCGGAGAAACGGCTTCGCTGTCCTCGCCTTTCCGGACGTCAGGTTTGACGACCTTGTGCCACTAATCCCGGAACTTGCAGACACCGACGCCGAAACCCGCGCGCAGGTAGAGCGTGATGCGCTTTATGCCAATTACATCGCGCGGCAGGAACGGGATGTTGAGGCGATGAAACGGGATGAGGCGCTGGTCATTCCGATCGACTTCAACTTTTCTGCCCTCGATGGGTTGTCAAATGAGTTGAAGCAAAAACTCACATCCGCGCGACCGGAAAATATCGCGCAGGCCGGGCGTGTTGAGGGGATGACGCCAGCGGCGCTTGCGCTCATCCTGGCCCGCCTTCGCCGTGGCGACCGAGCACGAAGCGCATGA'
t2 = 'GTGAAACATTCGGATTTCGATATTGTCGTGATCGGGGCCGGACATGCCGGCGCCGAGGCTGCACATGCTGCGGCACGCATGGGAATGCGTACTGCCTTGGTTTCCCTGTCCGAACGCGACATTGGCGTGATGTCCTGTAACCCGGCTATTGGCGGTTTGGGTAAGGGGCATCTGGTTCGCGAGATCGACGCACTTGACGGGGTCATGGGGCGGGTGGCCGACAAGGCCGGGATTCAATTTCGTCTGCTCAATCGGCGCAAGGGTCCTGCGGTCCAGGGGCCGCGCGCTCAGGCCGATCGCAAGCTCTATCGCCTGGCGATGCAGGAAGAAATGCGCAATCGCCCCGGACTGACCATCGTCGAGGGCGAGGTTACTGATTTTCGAATGCAAGGTGATCGCGTTGCCGGCGTCGTTCTGGCCGATGGGTCCGAGATTGCATCGCAAGCCGTGATCTTGACCTCGGGCACGTTCCTGCGCGGGATCATTCATATTGGCGATGTTTCGCGCCCCGGTGGGCGGATGGGTGACCGACCTTCAGTTCCGCTTGCCGAAAGGCTGGATGGATTTGCGTTACCGATGGGGCGACTAAAGACCGGAACGCCGCCGCGACTGGACGGACGCACCATAGACTGGTCTATCCTCGAGCGTCAGGACGGGGATGACGATCCGGTGCTGTTCTCGTTCCTGTCAAAAGGGGCCTATGCGCGTCAGATCGCCTGCGGTATCACGCATACCAACGCCCAGACGCATGAGATCATTCGCAAAAACCTGTCCCGTTCCGCGATGTATGGCGGACATATCGAAGGCGTCGGCCCTCGTTACTGCCCCTCGATCGAGGACAAAATCGTGCGTTTCGCTGACAAGGATTCGCATCAGATCTTTCTTGAGCCCGAGGGGCTGGAAGATCACACTGTCTATCCGAACGGTATCTCGACCTCGTTGCCGGTCGACGTGCAAGAGGACTATGTCCGTTCGATCCGGGGGCTGGAACAGGTTGAGATCTTGCAGCCCGGTTACGCGATCGAATACGACTATGTTGATCCCCGCGCGCTCACGTCGCAACTGTCTCTGCCGAATGTTCCGGGCTTGTACCTAGCCGGTCAGATCAATGGAACAACCGGGTATGAAGAGGCTGCGGCGCAGGGAATGGTTGCCGGGTTGAACGCAGCCACCGCGATTTTGGGTCACGAACCGGTCCCCTTCAGTCGTGCCAACAGCTATATCGGTGTGATGATCGACGATCTGACCACACGCGGTGTGGCAGAGCCCTATCGCATGTTCACCTCGCGGGCCGAATTCCGCCTGTCCTTGCGTGCGGATAACGCGGATCAGCGGCTGACGCCCTTGGGATTGGAATTGGGTTGTGTCGGTGACGAACGGCGCGACGTCTTTGCGCGTAAGGCAGAGAAACTGGCGACAGCCTCGGCGCTGCTCGATCAGAGCAGCTTTAGCCCCAAAGAGATTGCCACAGCTGGTATCACCATCAGTCAGGACGGCAATCGGAGAAACGGCTTCGCTGTCCTCGCCTTTCCGGACGTCAGGTTTGACGACCTTGTGCCACTAATCCCGGAACTTGCAGACACCGACGCCGAAACCCGCGCGCAGGTAGAGCGTGATGCGCTTTATGCCAATTACATCGCGCGGCAGGAACGGGATGTTGAGGCGATGAAACGGGATGAGGCGCTGGTCATTCCGATCGACTTCAACTTTTCTGCCCTCGATGGGTTGTCAAATGAGTTGAAGCAAAAACTCACATCCGCGCGACCGGAAAATATCGCGCAGGCCGGGCGTGTTGAGGGGATGACGCCAGCGGCGCTTGCGCTCATCCTGGCCCGCCTTCGCCGTGGCGACCGAGCACGAAGCGCATG'

In [None]:
# NOTE(review): with two scalar strings, np.isin tests whether t1 equals t2 as
# a whole; it does not test substring containment. If the intent was to check
# that t2 is contained in t1, `t2 in t1` is presumably what was meant. Confirm.
np.isin(t1, t2)

In [None]:
# Inspect the EcoCyc row(s) whose resolved SPO_ID is 'SPOA0272a'.
df_ECOCYC_all[df_ECOCYC_all['SPO_ID'] == 'SPOA0272a']

In [None]:
# `in` on a pandas Series tests the index labels, not the values, so compare
# against the underlying values to check the SPO_ID column itself.
'SPOA0087a' in df_ECOCYC['SPO_ID'].values

In [None]:
# Number of distinct SPO IDs in the Moran-Lab frame.
len(set(df_MORAN['SPO_ID (ACCESSION)']))

In [None]:
# Number of distinct SPO IDs in the EcoCyc frame.
# NOTE(review): np.unique sorts its input, so this presumably raises if the
# column still holds None alongside strings (rows without a mini review).
# Verify on the actual data.
len(np.unique(df_ECOCYC['SPO_ID']))

In [None]:
# Display the resolved SPO_ID column of the EcoCyc frame.
df_ECOCYC['SPO_ID']

In [None]:
# Inspect the EcoCyc row(s) whose resolved SPO_ID is 'SPOA0411'.
df_ECOCYC_all[df_ECOCYC_all['SPO_ID'] == 'SPOA0411']

In [None]:
# Inspect the original Moran-Lab record for SPO0005.
df_MORAN_all[df_MORAN_all['SPO_ID (ACCESSION)'] == 'SPO0005']

In [None]:
# Inspect the processed Moran-Lab row for SPO0005 (with the corrected
# sequence, composition, mass and validity columns).
df_MORAN[df_MORAN['SPO_ID (ACCESSION)'] == 'SPO0005']