### This notebook retrieves the sequences of *R. pomeroyi* genes from the BioCyc API and produces an Excel sheet compiling the composition of A, U, G, and C nucleotides in each mRNA

In [1]:
import pandas as pd
import numpy as np

# Create functions used in this notebook

In [2]:
def calculate_nucleotide_composition(sequence):
    """Count the A, U, G and C nucleotides of a transcript.

    The sequence may be written either as RNA (with ``U``) or as the coding
    DNA strand (with ``T``); both letters are reported under the ``'U'`` key.
    Matching is case-insensitive and any other symbol is ignored.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence.

    Returns
    -------
    dict
        Counts keyed by ``'A'``, ``'U'``, ``'G'`` and ``'C'`` (all zero for an
        empty sequence).
    """
    sequence = sequence.upper()  # Convert to uppercase for consistency

    return {
        'A': sequence.count('A'),
        # Accept both alphabets: T (DNA template) and U (RNA) are the same position.
        'U': sequence.count('T') + sequence.count('U'),
        'G': sequence.count('G'),
        'C': sequence.count('C'),
    }

def calculate_mass(composition):
    """Return the mass (Da) of an RNA from its nucleotide counts.

    Every count in ``composition`` is multiplied by the average mass of the
    matching nucleotide, the products are summed, and a fixed 177.8 Da is
    added to account for the 5' triphosphate.
    """
    # Average per-nucleotide masses in Da.
    mass_per_nucleotide = dict(A=329.2, U=306.2, G=345.2, C=305.2)
    five_prime_triphosphate = 177.8

    body_mass = sum(
        composition[base] * mass_per_nucleotide[base]
        for base in composition
    )
    return body_mass + five_prime_triphosphate

def verify_rna_length(sequence):
    """Check whether the sequence length is a whole number of codons.

    Returns ``True`` when the length is divisible by 3; otherwise returns a
    message stating how many nucleotides would be needed to reach the next
    multiple of 3.
    """
    remainder = len(sequence.upper()) % 3
    if remainder == 0:
        return True
    # remainder 2 -> one nucleotide short; remainder 1 -> two nucleotides short
    return "Missing 1 nucleotide" if remainder == 2 else "Missing 2 nucleotides"

# Load gene sequence

In [3]:
# Read the three input tables. The gene export is tab-separated and
# Windows-1252 encoded; the other two are plain CSV files.
df_ECOCYC_all = pd.read_csv(
    'All-genes-of-R.-pomeroyi-DSS-3.txt',
    sep='\t',
    encoding='windows-1252',
)
df_MORAN_all = pd.read_csv('RPOM_genome_Moran-Lab.csv')
df_RNA = pd.read_csv('rna-abs.csv')

In [4]:
# Create a column with gene IDs matching those used in the Moran Lab table.
# The ID (SPOXXXX / SPOAXXXX) can generally be retrieved from the "Mini review" column.
MINI_REVIEW_COL = 'Mini review - summary of genes/proteins'


def _derive_spo_id(gene_name, mini_review):
    """Derive the SPO identifier of one gene row.

    Rules, applied in order:
      1. No mini review -> ``None`` (the row is left without an ID).
      2. The gene name itself contains ``'SPO'`` (this also covers ``'SPOA'``)
         -> use the gene name.
      3. The text after the last ``'Name '`` in the mini review contains
         ``'SPO'`` -> use that text.
      4. The mini review contains ``'REF_tigr:'`` -> use what follows the
         first marker, up to the next comma.
      5. Otherwise fall back to ``'SPO_' + gene_name`` (naming convention
         following the Moran Lab SPO_ID, verified).

    NOTE(review): when the mini review has no ``'Name '`` marker, rule 3 sees
    the whole review text; confirm that this cannot happen for rows whose
    review mentions an SPO ID elsewhere.
    """
    if pd.isna(mini_review):
        return None
    if 'SPO' in gene_name:
        return gene_name

    mini_review_id = mini_review.split('Name ')[-1]
    if 'SPO' in mini_review_id:
        return mini_review_id

    # Explicit check instead of a bare ``except``: the only expected failure
    # is a review that lacks the 'REF_tigr:' marker.
    tigr_parts = mini_review.split('REF_tigr:')
    if len(tigr_parts) > 1:
        return tigr_parts[1].split(',')[0]
    return 'SPO_' + gene_name


df_ECOCYC_all['SPO_ID'] = [
    _derive_spo_id(gene_name, mini_review)
    for gene_name, mini_review in zip(
        df_ECOCYC_all['Gene Name'], df_ECOCYC_all[MINI_REVIEW_COL]
    )
]
df_ECOCYC_all.head()

Unnamed: 0,Gene Name,Product,Mini review - summary of genes/proteins,Sequence - DNA sequence,SPO_ID
0,SPO2282,ribosomal protein S18,"(*protein_id: REF_tigr:SPO2282,YP_167506.1) *N...",ATGGCCGCTAAACCGTTTTTCCGCCGTCGCAAGGTGTGCCCCTTCT...,SPO2282
1,G1RHL-3012,"branched-chain amino acid ABC transporter, per...","(*protein_id: REF_tigr:SPOA0098,YP_164928.1) *...",ATGAGCATCGTCATCGACATTCTGATCAACGGGCTGTTTCTGGGCG...,SPOA0098
2,SPO3583,transcriptional regulator PecS,"(*protein_id: REF_tigr:SPO3583,YP_168778.1) *N...",ATGGACCATGTAGACGTTATCATCCGGCAATGGGCGGCCGAGCGCC...,SPO3583
3,G1RHL-354,"diguanylate cyclase, putative","(*protein_id: REF_tigr:SPO0533,YP_165795.1) *N...",ATGCGGATAAAACCGTCCCTGACCCGAATTCGAAACGCTGTCGTGC...,SPO0533
4,G1RHL-2983,TetR family transcriptional regulator,"(*protein_id: REF_tigr:SPOA0067,YP_164898.1) *...",TTGGAAAGTTCACGCCGCAGCTTTACCCGCGAATCCGCCGAACATC...,SPOA0067


# Create a DataFrame with all genes in the Moran Lab table (because these are the genes in the transcriptome) and their sequences

In [5]:
# Start from the Moran Lab identifiers and attach the gene name and sequence of each gene.
df_MORAN = df_MORAN_all[['SPO_ID (ACCESSION)', 'Gene_ID (ACCESSION)']].copy()

# Build the lookup tables once instead of filtering the whole frame for every row.
# keep='first' reproduces the "first matching row" choice of a boolean filter + .values[0];
# rows without an SPO_ID can never match an identifier, so they are left out.
_ecocyc_first = df_ECOCYC_all.drop_duplicates(subset='SPO_ID', keep='first')
_ecocyc_first = _ecocyc_first[_ecocyc_first['SPO_ID'].notna()]
_gene_name_by_spo = dict(zip(_ecocyc_first['SPO_ID'], _ecocyc_first['Gene Name']))
_ecocyc_seq_by_spo = dict(zip(_ecocyc_first['SPO_ID'], _ecocyc_first['Sequence - DNA sequence']))

_moran_first = df_MORAN_all.drop_duplicates(subset='SPO_ID (ACCESSION)', keep='first')
_moran_seq_by_spo = dict(zip(_moran_first['SPO_ID (ACCESSION)'], _moran_first['dna_sequence']))

# Gene name from the EcoCyc table, or None when the identifier is not present there.
df_MORAN['Gene Name'] = df_MORAN['SPO_ID (ACCESSION)'].apply(
    lambda spo_id: _gene_name_by_spo.get(spo_id)
)
# Prefer the EcoCyc sequence; fall back to the Moran Lab sequence for identifiers
# that the EcoCyc table does not contain.
df_MORAN['Sequence'] = df_MORAN['SPO_ID (ACCESSION)'].apply(
    lambda spo_id: _ecocyc_seq_by_spo[spo_id]
    if spo_id in _ecocyc_seq_by_spo
    else _moran_seq_by_spo[spo_id]
)

# Create New Dataframe Analyzing % AUGC in each mRNA

In [6]:
# Keep only the identifier, gene name and sequence columns; work on a copy so that
# columns added later do not end up in df_ECOCYC_all.
df_ECOCYC = df_ECOCYC_all.loc[:, ['SPO_ID', 'Gene Name', 'Sequence - DNA sequence']].copy()

In [7]:
# Add the same three derived columns to both tables: nucleotide counts, mass
# computed from those counts, and the length check of the sequence.
for frame, sequence_column in (
    (df_ECOCYC, 'Sequence - DNA sequence'),
    (df_MORAN, 'Sequence'),
):
    frame['Composition'] = frame[sequence_column].apply(calculate_nucleotide_composition)
    frame['mass (Da)'] = frame['Composition'].apply(calculate_mass)
    frame['is_valid'] = frame[sequence_column].apply(verify_rna_length)

In [8]:
# Select the rows whose length check did not return True
# ('is_valid' holds either True or a "Missing ..." message).
invalid_sequences = df_ECOCYC.loc[df_ECOCYC['is_valid'].ne(True)]
df_MORAN.loc[df_MORAN['is_valid'].ne(True)]

Unnamed: 0,SPO_ID (ACCESSION),Gene_ID (ACCESSION),Gene Name,Sequence,Composition,mass (Da),is_valid
155,SPO_tRNA-Gln-1,,G1RHL-103,TGGAAGGTAGCTCAACTGGTCGGGCAACAGCTTTTGGTGCCGCAGG...,"{'A': 14, 'U': 17, 'G': 24, 'C': 21}",24686.0,Missing 2 nucleotides
205,SPO_tRNA-Pro-3,,G1RHL-135,CGGAGTGTAGCTCAGCCTGGTAGAGCACTGTCTTCGGGAGGCAGGG...,"{'A': 14, 'U': 15, 'G': 26, 'C': 22}",25069.2,Missing 1 nucleotide
240,SPO_tRNA-Ile-1,,G1RHL-158,GGGTCGGTAGCTCAGGTGGTTAGAGCGCACGCCTGATAAGCGTGAG...,"{'A': 15, 'U': 15, 'G': 26, 'C': 21}",25093.2,Missing 1 nucleotide
241,SPO_tRNA-Ala-1,,G1RHL-159,GGGGCCTTAGCTCAGCTGGGAGAGCGCCTGATTTGCATTCAGGAGG...,"{'A': 14, 'U': 17, 'G': 24, 'C': 21}",24686.0,Missing 2 nucleotides
243,SPO_Sp5SD,rrfD,Sp5SD,GGTTTGGTGGTCATAGCGAGCGCAATACACCCGGTCCCTTCCCGAA...,"{'A': 25, 'U': 20, 'G': 34, 'C': 39}",38171.4,Missing 2 nucleotides
276,SPO_tRNA-Lys-2,,G1RHL-178,GGGCCGTTAGCTCAGTTGGTAGAGCAACTGACTTTTAATCAGTGGG...,"{'A': 16, 'U': 19, 'G': 22, 'C': 19}",24656.0,Missing 2 nucleotides
479,SPO_tRNA-Arg-2,,G1RHL-324,GGTCCCATAGCTCAACTGGATAGAGCAGCTGACTTCTAATCAGCAG...,"{'A': 16, 'U': 18, 'G': 23, 'C': 20}",25000.2,Missing 1 nucleotide
543,SPO_tRNA-Arg-3,,G1RHL-362,GGACCGATAGCTCAGTTGGATAGAGTACTTGACTACGAATCAAGGG...,"{'A': 17, 'U': 17, 'G': 24, 'C': 19}",25063.2,Missing 1 nucleotide
694,SPO_tRNA-Val-2,,G1RHL-479,GGGTGATTAGCTCAGTTGGTAGAGCGCTTCGTTTACACCGAAGATG...,"{'A': 16, 'U': 20, 'G': 21, 'C': 19}",24617.0,Missing 2 nucleotides
728,SPO_tRNA-Trp-2,,G1RHL-501,AGGGGTATAGCTCAGTTGGTAGAGCGACGGTCTCCAAAACCGTAGG...,"{'A': 15, 'U': 15, 'G': 24, 'C': 22}",24708.0,Missing 2 nucleotides


# Create Dataframe for biomass calculation

In [9]:
# Restrict the abundance table to the gene identifier and the two mean-abundance columns.
df_RNA = df_RNA.loc[:, ['SPO_ID (ACCESSION)', 'DSS3_ac_mean_abund', 'DSS3_glc_mean_abund']].copy()

In [10]:
# Relative abundance: each gene's mean abundance divided by the column total.
df_RNA['DSS3_ac_mean_rel_abund'] = df_RNA['DSS3_ac_mean_abund'].div(
    df_RNA['DSS3_ac_mean_abund'].sum()
)
df_RNA['DSS3_glc_mean_rel_abund'] = df_RNA['DSS3_glc_mean_abund'].div(
    df_RNA['DSS3_glc_mean_abund'].sum()
)
df_RNA

Unnamed: 0,SPO_ID (ACCESSION),DSS3_ac_mean_abund,DSS3_glc_mean_abund,DSS3_ac_mean_rel_abund,DSS3_glc_mean_rel_abund
0,SPO1061,2.251205e+06,6.402749e+07,5.593185e-06,1.061941e-04
1,SPO3842,2.725419e+07,2.014081e+07,6.771382e-05,3.340496e-05
2,SPO3778,2.708394e+08,1.872681e+07,6.729082e-04,3.105972e-05
3,SPO1059,2.938362e+06,1.655547e+08,7.300447e-06,2.745841e-04
4,SPO3774,1.463127e+09,2.449963e+07,3.635182e-03,4.063437e-05
...,...,...,...,...,...
4337,SPO0614,1.601365e+08,1.654388e+08,3.978639e-04,2.743919e-04
4338,SPO2306,1.511623e+08,1.197327e+08,3.755671e-04,1.985851e-04
4339,SPO_Sp16SC,3.706996e+04,6.360184e+04,9.210140e-08,1.054881e-07
4340,SPO1999,5.201761e+06,5.145175e+06,1.292393e-05,8.533634e-06


In [11]:
# TODO: calculate the mass of each RNA in the dataframe
# TODO: need weight of A, U, G, C in the biomass

# % difference between Moran Lab and ECOCYC Sequence

In [12]:
# Index both tables by their SPO identifier so the sequences line up per gene.
df_ECOCYC.index = df_ECOCYC['SPO_ID']
df_MORAN.index = df_MORAN['SPO_ID (ACCESSION)']

# Outer join keeps identifiers that occur in only one of the two tables.
# The suffixes only affect overlapping column names; the two columns selected
# here are named differently, so they keep their original names.
df_diff = df_ECOCYC.loc[:, ['Sequence - DNA sequence']].join(
    df_MORAN.loc[:, ['Sequence']],
    how='outer',
    lsuffix='_ECOCYC',
    rsuffix='_MORAN',
)
df_diff

Unnamed: 0,Sequence - DNA sequence,Sequence
SPO0001,GTGAAACATTCGGATTTCGATATTGTCGTGATCGGGGCCGGACATG...,GTGAAACATTCGGATTTCGATATTGTCGTGATCGGGGCCGGACATG...
SPO0002,ATGATGGTTCCCGATGCGAACACGCTCAATGTTTCACGTGAAACAT...,ATGATGGTTCCCGATGCGAACACGCTCAATGTTTCACGTGAAACAT...
SPO0003,GTGTCTGATCTTTCCCGTCCTGCCGGACCCCGGATCATTGCGGTCG...,GTGTCTGATCTTTCCCGTCCTGCCGGACCCCGGATCATTGCGGTCG...
SPO0004,ATGGTTTCGAACAAGCCCCGGGGATTGGGACGCGGATTGTCCGCGT...,ATGGTTTCGAACAAGCCCCGGGGATTGGGACGCGGATTGTCCGCGT...
SPO0005,ATGCAGTTTATCTGGGCAGCTCTTGGACTGGTCTGCGTTGCCCTCG...,ATGCAGTTTATCTGGGCAGCTCTTGGACTGGTCTGCGTTGCCCTCG...
...,...,...
SPO_tRNA-Thr-3,GCCGCTGTAGCTCAGCTGGTAGAGCACGTCATTCGTAATGATGGGG...,GCCGCTGTAGCTCAGCTGGTAGAGCACGTCATTCGTAATGATGGGG...
SPO_tRNA-Trp-2,AGGGGTATAGCTCAGTTGGTAGAGCGACGGTCTCCAAAACCGTAGG...,AGGGGTATAGCTCAGTTGGTAGAGCGACGGTCTCCAAAACCGTAGG...
SPO_tRNA-Val-2,GGGTGATTAGCTCAGTTGGTAGAGCGCTTCGTTTACACCGAAGATG...,GGGTGATTAGCTCAGTTGGTAGAGCGCTTCGTTTACACCGAAGATG...
SPO_tRNA-Val-3,GGGTGATTAGCTCAGTGGTAGAGCGCTTCGTTCACATCGAAGATGT...,GGGTGATTAGCTCAGTGGTAGAGCGCTTCGTTCACATCGAAGATGT...


In [13]:
# Number of rows in df_ECOCYC.
len(df_ECOCYC)

4341

In [14]:
# Number of rows in df_MORAN, for comparison with df_ECOCYC.
len(df_MORAN)

4342

In [15]:
# Identifiers present in df_ECOCYC but absent from df_MORAN
# (one direction only; the reverse difference is not computed here).
ecocyc_ids = set(df_ECOCYC['SPO_ID'])
moran_ids = set(df_MORAN['SPO_ID (ACCESSION)'])
diff = list(ecocyc_ids - moran_ids)

In [16]:
# Display the identifiers found only in df_ECOCYC.
diff

['SPOA0411']

In [17]:
# Look up the rows of df_ECOCYC_all whose derived SPO_ID is 'SPO_PG039'.
df_ECOCYC_all.loc[df_ECOCYC_all['SPO_ID'].eq('SPO_PG039')]

Unnamed: 0,Gene Name,Product,Mini review - summary of genes/proteins,Sequence - DNA sequence,SPO_ID


In [18]:
# `in` on a Series tests the index labels, not the stored values, so the check
# is made against the values explicitly and no longer depends on how df_ECOCYC
# happens to be indexed.
'SPOA0087a' in df_ECOCYC['SPO_ID'].values

False

In [19]:
# Number of distinct identifiers in df_MORAN; equal to len(df_MORAN) when there are no duplicates.
len(set(df_MORAN['SPO_ID (ACCESSION)']))

4342

In [20]:
# Number of distinct SPO_ID values in df_ECOCYC; equal to len(df_ECOCYC) when there are no duplicates.
len(np.unique(df_ECOCYC['SPO_ID']))

4341

In [21]:
# Display the SPO_ID column (df_ECOCYC is indexed by SPO_ID at this point, so index and values coincide).
df_ECOCYC['SPO_ID']

SPO_ID
SPO2282      SPO2282
SPOA0098    SPOA0098
SPO3583      SPO3583
SPO0533      SPO0533
SPOA0067    SPOA0067
              ...   
SPO2202      SPO2202
SPO1955      SPO1955
SPO1883      SPO1883
SPO3337      SPO3337
SPO1400      SPO1400
Name: SPO_ID, Length: 4341, dtype: object

In [22]:
# Look up the rows of df_ECOCYC_all whose derived SPO_ID is 'SPO0346a'.
df_ECOCYC_all.loc[df_ECOCYC_all['SPO_ID'].eq('SPO0346a')]

Unnamed: 0,Gene Name,Product,Mini review - summary of genes/proteins,Sequence - DNA sequence,SPO_ID
2446,G1RHL-226,hypothetical protein,"(*protein_id: REF_tigr:SPO0346a,YP_008877637.1...",ATGCAAAGAGCTATACTAGGAATCATAGCCAGCGCGTTTCTCGTGG...,SPO0346a
