# Finding the Total Number of Bases in a Gene List

In [159]:
### standard imports

import numpy as np
import pandas as pd
import re
import plotly.express as px
import matplotlib.pyplot as plt

In [160]:
# import dataset
codons = pd.read_csv('STGA 2018 Codons Tested.csv')
codons

Unnamed: 0,Gene,Exons (codons) tested
0,AKT1 (NM_005163),"3 (16-56), 4 (59-96), 6 (146-149), 6-7 (177-19..."
1,AKT2 (NM_001626),"3 (16-55), 4 (76-96), 6 (148-153), 7 (192-210)..."
2,AKT3 (NM_005465),"2 (16-28), 2 (48-58), 3 (66-95), 4 (123-143), ..."
3,ALK (NM_004304),20-28 (1096-1388)
4,AR (NM_000044),"1 (1-45), 1 (135-178), 1 (245-285), 1 (342-388..."
...,...,...
141,TP53 (NM_000546),2-11 (1-394)
142,TSC1 (NM_000368),3-23 (1-1165)
143,TSC2 (NM_000548),"2-35 (1-1523), 37-42 (1555-1808)"
144,U2AF1 (NM_006758),"2 (15-41), 6 (137-161)"


In [161]:
codons.dtypes

Gene                     object
Exons (codons) tested    object
dtype: object

## How to Do Regular Expressions in Python
Source: https://docs.python.org/3/howto/regex.html
- "\d" matches any decimal digit, this is equivalent ot the class [0-9]
- "*" specifies that the previous character can be matched zero or more times, instead of exactly once
- "+" specifies that the previous character can be matched ONE or more times
- "?" matches either once or zero times
- "\section" isolates text string to be matched. Regular expressions use the backslash character '\' to indicate special forms or to allow special characters to be used without invoking their special meaning
- "findall()" finds all substrings where the RE matches, and returns them as a list

In [162]:
# Creating a new column for the base length of each gene
codons['Base Length'] = None

# Loops through each gene of the gene list, finds its base length, and adds it to the dataframe
for index in range(codons.shape[0]):
    # Converts the Exons of the selected gene to a string
    current_exons = str(codons.iloc[index, 1])
    
    # Extracting start and end bases using regular expressions. 
    # Matches is a list of the substrings where the RE matches
    matches = re.findall(r' \((\d+)-(\d+)\)', current_exons)
    
    # Calculating the length of each exon and summing them for selected gene
    gene_base_length = 0

    for match in matches:
        start = int(match[0])  # Start base
        end = int(match[1])  # End base

        # Calculating the base length of each exon and adding it to the total base length for the selected gene
        exon_base_length = end - start + 1
        gene_base_length = gene_base_length + exon_base_length

    # Setting the base length for the selected gene
    codons.iloc[index,2] = gene_base_length
    
codons

Unnamed: 0,Gene,Exons (codons) tested,Base Length
0,AKT1 (NM_005163),"3 (16-56), 4 (59-96), 6 (146-149), 6-7 (177-19...",253
1,AKT2 (NM_001626),"3 (16-55), 4 (76-96), 6 (148-153), 7 (192-210)...",218
2,AKT3 (NM_005465),"2 (16-28), 2 (48-58), 3 (66-95), 4 (123-143), ...",195
3,ALK (NM_004304),20-28 (1096-1388),293
4,AR (NM_000044),"1 (1-45), 1 (135-178), 1 (245-285), 1 (342-388...",287
...,...,...,...
141,TP53 (NM_000546),2-11 (1-394),394
142,TSC1 (NM_000368),3-23 (1-1165),1165
143,TSC2 (NM_000548),"2-35 (1-1523), 37-42 (1555-1808)",1777
144,U2AF1 (NM_006758),"2 (15-41), 6 (137-161)",52


In [163]:
# Calculating the total number of bases for the gene list 
num_bases = codons['Base Length'].sum()
print('The total number of bases is ' + str(num_bases))

The total number of bases is 78669


In [164]:
# Export the DataFrame to an Excel file
codons.to_excel('Total Number of Bases.xlsx', index=False)