# PUMA: Promoter Unraveling through Machine-learning Algorithms

### _AI model to identify promoter sequences in existing databases_

### Dependencies

Here, we check if the dependencies are installed, and if they are not, ask the user if they want to install them

In [None]:
import subprocess
import sys

# List of dependencies
dependencies = ["pandas", "scikit-learn", "seaborn", "plotnine", "sbol_utilities"]

# Function to check if a module is installed
def is_module_installed(module_name):
    try:
        __import__(module_name)
        return True
    except ImportError:
        return False

# Function to install a module
def install_module(module_name):
    subprocess.check_call([sys.executable, "-m", "pip", "install", module_name])

# Check each dependency
for module in dependencies:
    if not is_module_installed(module):
        print(f"The module '{module}' is not installed.")
        answer = input(f"Do you want to install '{module}'? (yes/no): ")
        if answer.lower() == "yes":
            install_module(module)
            print(f"'{module}' has been installed.")
        else:
            print(f"'{module}' has not been installed.")
    else:
        print(f"The module '{module}' is already installed.")

We import the packages we need

In [7]:
import seaborn as sns
import getpass
import requests
import re
import sbol3
import pandas as pd

### Accessing SynBioHub to retrieve the information

Then, we login into SynBioHub to retrieve the promoters' values. We ask the user for their credentials

In [None]:
response = requests.post(
    'https://synbiohub.org/login',
    headers={
        'Accept': 'text/plain'
    },
    data={
        'email': input('SynBioHub email: '),
        'password' : getpass.getpass('Password: '),
        },
)

print(response.status_code)
print(response.content)

Here we search for all the collections in the database

In [None]:
response = requests.get(
    'https://synbiohub.org/rootCollections',
    headers={
        'Accept': 'text/plain',
        'X-authorization': response.content
        },
)

print(response.status_code)
print(response.content)

Here we look for a specific collection

In [None]:
response = requests.get(
    'https://synbiohub.org/user/carolusvitalis/iGEM_2019_Distribution_Kit_Promoters/iGEM_2019_Distribution_Kit_Promoters_collection/1/sbol',
    headers={
        'Accept': 'text/plain',
        'X-authorization': response.content
        },
)

print(response.status_code)
print(response.content)

In [None]:
import pandas as pd

# Search for DNA sequences using regular expression
dna_sequences = re.findall(r'<sbol:displayId>(.*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>', response.content.decode('utf-8'), re.DOTALL)

# Create a pandas dataframe
df = pd.DataFrame(dna_sequences, columns=['Name', 'DNA Sequence'])

# Print the dataframe
print(df)


Alternative version to run it using local SBOL files instead of SynBioHub

In [10]:
promoters_2019 = open('/Users/tyleramos/Desktop/iGEM_2019_Promoters_collection.xml').read()
print(promoters_2019)

rbs_2019 = open('/Users/tyleramos/Desktop/iGEM_2019_RBSs_collection.xml').read()
print(rbs_2019)

<?xml version="1.0" ?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:igem="http://wiki.synbiohub.org/wiki/Terms/igem#" xmlns:sbh="http://wiki.synbiohub.org/wiki/Terms/synbiohub#" xmlns:sbol="http://sbols.org/v2#" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:om="http://www.ontology-of-units-of-measure.org/resource/om-2/" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <sbol:Collection rdf:about="https://synbiohub.org/user/carolusvitalis/iGEM_2019_Promoters/iGEM_2019_Promoters_collection/1">
    <sbol:persistentIdentity rdf:resource="https://synbiohub.org/user/carolusvitalis/iGEM_2019_Promoters/iGEM_2019_Promoters_collection"/>
    <sbol:displayId>iGEM_2019_Promoters_collection</sbol:displayId>
    <sbol:version>1</sbol:version>
    <dcterms:title>iGEM 2019 Distribution Promoters</dcterms:title>
    <dcterms:description>iGEM 2019 Distribution Promoters</dcterms:description>
    <dcterms:created>2024-03-23T18:59:34.492Z</d

#### Load in 2019 promoter sequences and RBSs + df cleaning

In [23]:
# Search for DNA sequences using regular expression
p19_dna_sequences = re.findall(r'<sbol:displayId>(.*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>', promoters_2019, re.DOTALL)

# Create a pandas dataframe
p19_df = pd.DataFrame(p19_dna_sequences, columns=['Name', 'DNA Sequence'])

# Print the dataframe
#print(p19_df)

rbs19_dna_sequences = re.findall(r'<sbol:displayId>(.*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>', rbs_2019, re.DOTALL)
rbs19_df = pd.DataFrame(rbs19_dna_sequences, columns=['Name', 'DNA Sequence'])
#print(rbs19_df)

# why is the first name of the df the name of the collection? 
# for now, will remove first row so that we have realistic names
p19_df = p19_df.drop(p19_df.index[0])

# add in promoter label for these seqs
p19_df['Element'] = "Promoter"

rbs19_df = rbs19_df.drop(rbs19_df.index[0])
rbs19_df['Element'] = "RBS"

print(p19_df)
print(rbs19_df)


            Name                                       DNA Sequence   Element
1    BBa_K823010                ctgatggctagctcagtcctagggattatgctagc  Promoter
2      BBa_R0010  caatacgcaaaccgcctctccccgcgcgttggccgattcattaatg...  Promoter
3    BBa_K823005                tttacagctagctcagtcctaggtattatgctagc  Promoter
4    BBa_K823012                tttatagctagctcagtcctaggtacaatgctagc  Promoter
5    BBa_K733013  aattttgtcaaaataattttattgacaacgtcttattaacgttgat...  Promoter
..           ...                                                ...       ...
144  BBa_J202014  acattgattatttgcacggcgtcacactttgctatgccatagcatt...  Promoter
145  BBa_K105024                        tttaccggaggacagtactccgacgta  Promoter
146    BBa_R0080  gcgtaacaaaagtgtctataatcacggcagaaaagtccacattgat...  Promoter
147  BBa_K823002  agtcaatgtatgaatggatacgggatatgaatcaataagtacgtga...  Promoter
148  BBa_K864400  gagctgttgacaattaatcatcggctcgtataatgtgtggaattgt...  Promoter

[148 rows x 3 columns]
            Name                        

#### Concatenate promoter and RBS dataframes for training

In [24]:
promoters_rbs_df = pd.concat([rbs19_df, p19_df], ignore_index=True)
print(promoters_rbs_df)from sklearn.model_selection import train_test_split


             Name                                       DNA Sequence   Element
0    BBa_K1114111  tactgggcccaagttcacttaaaaaggagatcaacaatgaaagcaa...       RBS
1       BBa_B0034                                       aaagaggagaaa       RBS
2      BBa_J61100                                       aaagaggggaca       RBS
3    BBa_K1114108  tactgggcccaagttcacttaaaaaggagatcaacaatgaaagcaa...       RBS
4    BBa_K1114103                       tactagagtcacacaggactactaaatg       RBS
..            ...                                                ...       ...
165   BBa_J202014  acattgattatttgcacggcgtcacactttgctatgccatagcatt...  Promoter
166   BBa_K105024                        tttaccggaggacagtactccgacgta  Promoter
167     BBa_R0080  gcgtaacaaaagtgtctataatcacggcagaaaagtccacattgat...  Promoter
168   BBa_K823002  agtcaatgtatgaatggatacgggatatgaatcaataagtacgtga...  Promoter
169   BBa_K864400  gagctgttgacaattaatcatcggctcgtataatgtgtggaattgt...  Promoter

[170 rows x 3 columns]


## Vectorizing sequences into k-mers

In [32]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Use CountVectorizer to initialize k-mer splitting 
# using k=4 for first pass
vectorizer = CountVectorizer(analyzer='char', ngram_range=(4, 4))

# apply the vectorizer to the concat. dataframe
X = vectorizer.fit_transform(promoters_rbs_df['DNA Sequence']).toarray()
y = promoters_rbs_df['Element']

## Splitting training and test set

In [33]:
# these parameters are standard, but we can tweak if needed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Train the model

In [34]:
model = GaussianNB()
model.fit(X_train, y_train)

## Validate

In [35]:
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.8837209302325582


The following code is for reference only

In [None]:
import sklearn.cluster                            # for clustering task
import sklearn.model_selection                    # function like train_test_split
from sklearn import datasets                      # sklearn build in datasets
import sklearn.decomposition                      # PCA
import sklearn.feature_extraction                 # Define image and text datas
import sklearn.feature_selection                  # Find meaningful features


In [None]:
from sklearn.model_selection import train_test_split
X_iris, y_iris = iris.drop('species', axis=1), iris['species']
Xtrain, Xtest, ytrain, ytest = train_test_split(X_iris, y_iris,
                                                random_state=1)

In [None]:
from sklearn.naive_bayes import GaussianNB # 1. choose model class
model = GaussianNB()                       # 2. instantiate model
model.fit(Xtrain, ytrain)                  # 3. fit model to data
y_model = model.predict(Xtest)             # 4. predict on new data

In [None]:
doc = sbol.Document()
doc.read(response.content)

for component in doc.componentDefinitions:
    print(f'Display ID: {component.displayId}')
    print(f'DNA Sequence: {component.sequence.elements}')

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest, y_model)