# AI model to identify promoter sequences in existing databases

### Dependencies

Here, we check if the dependencies are installed, and if they are not, ask the user if they want to install them

In [11]:
import subprocess
import sys

# List of dependencies
dependencies = ["pandas", "scikit-learn", "seaborn", "plotnine", "sbol_utilities"]

# Function to check if a module is installed
def is_module_installed(module_name):
    try:
        __import__(module_name)
        return True
    except ImportError:
        return False

# Function to install a module
def install_module(module_name):
    subprocess.check_call([sys.executable, "-m", "pip", "install", module_name])

# Check each dependency
for module in dependencies:
    if not is_module_installed(module):
        print(f"The module '{module}' is not installed.")
        answer = input(f"Do you want to install '{module}'? (yes/no): ")
        if answer.lower() == "yes":
            install_module(module)
            print(f"'{module}' has been installed.")
        else:
            print(f"'{module}' has not been installed.")
    else:
        print(f"The module '{module}' is already installed.")

The module 'pandas' is already installed.
The module 'scikit-learn' is not installed.
'scikit-learn' has been installed.
The module 'seaborn' is already installed.
The module 'plotnine' is already installed.


We import the packages we need

In [15]:
import seaborn as sns
import getpass
import requests
#import sbol

### Accessing SynBioHub to retrieve the information

Then, we login into SynBioHub to retrieve the promoters' values. We ask the user for their credentials

In [18]:
response = requests.post(
    'https://synbiohub.org/login',
    headers={
        'Accept': 'text/plain'
    },
    data={
        'email': input('SynBioHub email: '),
        'password' : getpass.getpass('Password: '),
        },
)

print(response.status_code)
print(response.content)

200
b'11ff5429-71e7-48dc-87ba-87a8a097a45a'


Here we search for all the collections in the database

In [21]:
response = requests.get(
    'https://synbiohub.org/rootCollections',
    headers={
        'Accept': 'text/plain',
        'X-authorization': response.content
        },
)

print(response.status_code)
print(response.content)

200
b'[{"uri":"https://synbiohub.org/public/CnDatabase/CnDatabase_collection/1","name":"Cryptococcus neoformans Database in Synthetic Biology Open Language","description":"This is a collection of Biological Genetic Parts of Cryptococcus neoformans containing 23 different promoters, coding sequences and terminators, which were previously used and whose efficacy has already been proven in this organism. This work was made in order to facilitate the build and assembly of genetic circuits and plasmids for the study of this pathogen.","displayId":"CnDatabase_collection","version":"1"},{"uri":"https://synbiohub.org/public/bsu/bsu_collection/1","name":"Bacillus subtilis Collection","description":"This collection includes information about promoters, operators, CDSs and proteins from Bacillus subtilis. Functional interactions such as transcriptional activation and repression, protein production and various protein-protein interactions are also included.","displayId":"bsu_collection","version":

Here we look for a specific collection

In [22]:
response = requests.get(
    'https://synbiohub.org/public/CnDatabase/CnDatabase_collection/1/sbol',
    headers={
        'Accept': 'text/plain',
        'X-authorization': 'a75bda90-1972-4e25-8f57-4642027b33cb'
        },
)

print(response.status_code)
print(response.content)

200
b'<?xml version="1.0" ?>\n<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:sbh="http://wiki.synbiohub.org/wiki/Terms/synbiohub#" xmlns:sbol="http://sbols.org/v2#" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:ns0="http://purl.obolibrary.org/obo/" xmlns:prov="http://www.w3.org/ns/prov#" xmlns:om="http://www.ontology-of-units-of-measure.org/resource/om-2/" xmlns:dc="http://purl.org/dc/elements/1.1/">\n  <sbol:Collection rdf:about="https://synbiohub.org/public/CnDatabase/CnD_collection/1">\n    <sbol:persistentIdentity rdf:resource="https://synbiohub.org/public/CnDatabase/CnD_collection"/>\n    <sbol:displayId>CnD_collection</sbol:displayId>\n    <sbol:version>1</sbol:version>\n    <dcterms:title>Cryptococcus neoformans Database in Synthetic Biology Open Language</dcterms:title>\n    <dcterms:description>This is a collection of Biological Genetic Parts of Cryptococcus neoformans containing 23 different promoters, coding sequences and terminators, which were pre

In [None]:
response = requests.get(
    'https://synbiohub.org/user/carolusvitalis/iGEM_2019_Distribution_Kit_Promoters/iGEM_2019_Distribution_Kit_Promoters_collection/1/sbol',
    headers={
        'Accept': 'text/plain',
        'X-authorization': 'a75bda90-1972-4e25-8f57-4642027b33cb'
        },
)

print(response.status_code)
print(response.content)

In [28]:
import re

# Search for DNA sequence using regular expression
dna_sequence = re.search(r'<sbol:elements>(.*?)</sbol:elements>', response.content.decode('utf-8'), re.DOTALL)

if dna_sequence:
    dna_sequence = dna_sequence.group(1)
    print(f"{dna_sequence}")
else:
    print("DNA Sequence not found.")


gcggggagtacaggctaagcgtaggcataagcatcagtgtgatggtcagggacccttgacaatcctcacaaagagcgagtaatctatatgctcccctaacacattgactcttcccggtgctcgaacaatatagttgggttgcccgccaaactcttttacaaaagtttgaactagagcgctccacctttgcctgtgatgaacaaatacagtgtcagccggttgattctgttatatctgcgtaacaacgtacctctcccgctgaagagcttcctctgtgttataaatctcattgagattgtgaaatgcaggaacaggtcgtagctcctccggacccattttgaacatgatatacctgcggaaaacaagaccgtcgattcggctttgtgggcgatataaatagtggtttcattgtctggactaagtgggctgtctttccgatggcgctcgcaacgtgtcaacgtatacttgcaaaagttgctatgtacaatatcgaaagacatttacaaatcggcatgaaggtttgtgtggttcgctgtgaattgtccctgaataatgcacgcggacactctccggacaaacgatccggacttgagccctgctggcaagaaatagaaattattcccatttgatgtcatcatcgatgcaacagaattcccgtttatcattttctcttctacacattcccgttcgcagaattcccgttcatcgcttcctctactacacattcccgttcgcagaattgcccatcctcgaaattcccgttcgaacaaagcattatttgggcaagattcccgttcgtgctcgggtccaattctgagtattcccgttcatgattctgttgcatcgactttccctaataggtgttgggaagaggatgtcagcatccagtataaaagatgtagaatgcagcctcaatgaccatgagctatgatagcaaacattgaatcgcaataacacactcaaccgctcacctgcatt


In [33]:
import numpy as np

if "DNA Sequence not found." not in dna_sequence:
    table = np.array([[1, dna_sequence]])
    print(table)
else:
    print("Your data is not DNA only.")

[['1'
  'gcggggagtacaggctaagcgtaggcataagcatcagtgtgatggtcagggacccttgacaatcctcacaaagagcgagtaatctatatgctcccctaacacattgactcttcccggtgctcgaacaatatagttgggttgcccgccaaactcttttacaaaagtttgaactagagcgctccacctttgcctgtgatgaacaaatacagtgtcagccggttgattctgttatatctgcgtaacaacgtacctctcccgctgaagagcttcctctgtgttataaatctcattgagattgtgaaatgcaggaacaggtcgtagctcctccggacccattttgaacatgatatacctgcggaaaacaagaccgtcgattcggctttgtgggcgatataaatagtggtttcattgtctggactaagtgggctgtctttccgatggcgctcgcaacgtgtcaacgtatacttgcaaaagttgctatgtacaatatcgaaagacatttacaaatcggcatgaaggtttgtgtggttcgctgtgaattgtccctgaataatgcacgcggacactctccggacaaacgatccggacttgagccctgctggcaagaaatagaaattattcccatttgatgtcatcatcgatgcaacagaattcccgtttatcattttctcttctacacattcccgttcgcagaattcccgttcatcgcttcctctactacacattcccgttcgcagaattgcccatcctcgaaattcccgttcgaacaaagcattatttgggcaagattcccgttcgtgctcgggtccaattctgagtattcccgttcatgattctgttgcatcgactttccctaataggtgttgggaagaggatgtcagcatccagtataaaagatgtagaatgcagcctcaatgaccatgagctatgatagcaaacattgaatcgcaataacacactcaaccgctcacctgcatt']]


The following code is for reference only

In [None]:
import sklearn.cluster                            # for clustering task
import sklearn.model_selection                    # function like train_test_split
from sklearn import datasets                      # sklearn build in datasets
import sklearn.decomposition                      # PCA
import sklearn.feature_extraction                 # Define image and text datas
import sklearn.feature_selection                  # Find meaningful features


In [None]:
from sklearn.model_selection import train_test_split
X_iris, y_iris = iris.drop('species', axis=1), iris['species']
Xtrain, Xtest, ytrain, ytest = train_test_split(X_iris, y_iris,
                                                random_state=1)

In [None]:
from sklearn.naive_bayes import GaussianNB # 1. choose model class
model = GaussianNB()                       # 2. instantiate model
model.fit(Xtrain, ytrain)                  # 3. fit model to data
y_model = model.predict(Xtest)             # 4. predict on new data

In [None]:
doc = sbol.Document()
doc.read(response.content)

for component in doc.componentDefinitions:
    print(f'Display ID: {component.displayId}')
    print(f'DNA Sequence: {component.sequence.elements}')

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest, y_model)