# PUMA: Promoter Unraveling through Machine-learning Algorithms

### _AI model to identify promoter sequences in existing databases_

### Dependencies

Here, we check whether the dependencies are installed and, if any are missing, ask the user whether they want to install them.

In [None]:
import subprocess
import sys

# List of dependencies, given by their pip (distribution) names.
# "requests" is listed because the SynBioHub cells below import it.
dependencies = [
    "pandas",
    "scikit-learn",
    "seaborn",
    "plotnine",
    "sbol_utilities",
    "requests",
]

# Function to check if a module is installed
def is_module_installed(module_name):
    """Return True if the package ``module_name`` can be imported.

    ``module_name`` may be a pip distribution name. Some distributions are
    imported under a different name (``scikit-learn`` is imported as
    ``sklearn``), so the name is tried as given first and then under its
    import-name alias.

    Args:
        module_name: Import name or pip distribution name to look for.

    Returns:
        True if one of the candidate names imports successfully, else False.
    """
    # Distribution names whose import name cannot be derived mechanically.
    import_aliases = {"scikit-learn": "sklearn"}
    # Otherwise fall back to the usual hyphen -> underscore convention.
    alias = import_aliases.get(module_name, module_name.replace("-", "_"))

    candidates = [module_name]
    if alias != module_name:
        candidates.append(alias)

    for name in candidates:
        try:
            __import__(name)
        except ImportError:
            continue
        return True
    return False

# Function to install a module
def install_module(module_name):
    """Install ``module_name`` with pip, run through the current interpreter.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", module_name]
    subprocess.check_call(pip_command)

# Check each dependency and offer to install the ones that are missing
for module in dependencies:
    if is_module_installed(module):
        print(f"The module '{module}' is already installed.")
        continue

    print(f"The module '{module}' is not installed.")
    answer = input(f"Do you want to install '{module}'? (yes/no): ")
    # Tolerate surrounding whitespace and the short form "y" so that a
    # slightly sloppy confirmation is not silently treated as a refusal.
    if answer.strip().lower() in {"yes", "y"}:
        install_module(module)
        print(f"'{module}' has been installed.")
    else:
        print(f"'{module}' has not been installed.")

We import the packages we need

In [None]:
import seaborn as sns
import getpass
import requests
#import sbol

### Accessing SynBioHub to retrieve the information

Then, we log in to SynBioHub to retrieve the promoters' values. We ask the user for their credentials.

In [None]:
# Log in to SynBioHub with credentials typed by the user.
# The following cells send this response's body as their 'X-authorization'
# header, so `response` must keep pointing at the login response.
response = requests.post(
    'https://synbiohub.org/login',
    headers={
        'Accept': 'text/plain'
    },
    data={
        'email': input('SynBioHub email: '),
        'password' : getpass.getpass('Password: '),
        },
    timeout=30,  # fail instead of hanging forever if the server does not answer
)

print(response.status_code)
if response.ok:
    # Do not echo the body on success: it is used as the authorization token
    # and would otherwise be stored in the notebook's saved output.
    print('Login successful.')
else:
    # On failure the body is the server's error message, which is safe to show.
    print(response.content)

Here we search for all the collections in the database

In [None]:
# List the root collections.
# The result is kept in its own variable on purpose: `response` must keep
# holding the login response, because the next cell sends `response.content`
# as the 'X-authorization' token. Rebinding `response` here would make that
# cell send this listing instead of the token.
collections_response = requests.get(
    'https://synbiohub.org/rootCollections',
    headers={
        'Accept': 'text/plain',
        'X-authorization': response.content
        },
    timeout=30,  # fail instead of hanging forever if the server does not answer
)

print(collections_response.status_code)
print(collections_response.content)

Here we look for a specific collection

In [None]:
# Download the SBOL document of one specific collection.
# NOTE: `response.content` is sent as the authorization token, so `response`
# must still be the login response when this cell runs. After this cell,
# `response` holds the SBOL document that the parsing cell below reads, so
# re-running this cell without logging in again sends the wrong token.
response = requests.get(
    'https://synbiohub.org/user/carolusvitalis/iGEM_2019_Distribution_Kit_Promoters/iGEM_2019_Distribution_Kit_Promoters_collection/1/sbol',
    headers={
        'Accept': 'text/plain',
        'X-authorization': response.content
        },
    timeout=30,  # fail instead of hanging forever if the server does not answer
)

print(response.status_code)
print(response.content)

In [None]:
import re  # needed by re.findall below; not imported anywhere else in the notebook

import pandas as pd

# Search for DNA sequences using a regular expression.
# Each match pairs a displayId ending in "_sequence" with the next
# <sbol:elements> payload. The name is captured with `[^<]*?` rather than
# `.*?`: under re.DOTALL a `.` also matches tags and newlines, so the capture
# could start at an earlier, unrelated <sbol:displayId> and swallow markup.
dna_sequences = re.findall(
    r'<sbol:displayId>([^<]*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>',
    response.content.decode('utf-8'),
    re.DOTALL,
)

# Create a pandas dataframe with one row per (name, sequence) match
df = pd.DataFrame(dna_sequences, columns=['Name', 'DNA Sequence'])

# Print the dataframe
print(df)


Alternative version to run it using local SBOL files instead of SynBioHub

In [None]:
# Read the local SBOL export into a string. The `with` block closes the file
# handle as soon as the content is read, and the explicit encoding keeps the
# result independent of the platform's default locale.
with open('/Users/carolusvitalis/Downloads/iGEM_2019_Distribution_Kit_Promoters_collection.xml', encoding='utf-8') as sbol_file:
    collection = sbol_file.read()
print(collection)

In [None]:
# Imported here so this local-file path also works when the SynBioHub cells
# (the only other place pandas is imported) have been skipped.
import re

import pandas as pd

# Search for DNA sequences using a regular expression.
# Each match pairs a displayId ending in "_sequence" with the next
# <sbol:elements> payload. The name is captured with `[^<]*?` rather than
# `.*?`: under re.DOTALL a `.` also matches tags and newlines, so the capture
# could start at an earlier, unrelated <sbol:displayId> and swallow markup.
dna_sequences = re.findall(
    r'<sbol:displayId>([^<]*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>',
    collection,
    re.DOTALL,
)

# Create a pandas dataframe with one row per (name, sequence) match
df = pd.DataFrame(dna_sequences, columns=['Name', 'DNA Sequence'])

# Print the dataframe
print(df)


The following code is for reference only

In [None]:
import sklearn.cluster                            # for clustering task
import sklearn.model_selection                    # function like train_test_split
from sklearn import datasets                      # sklearn build in datasets
import sklearn.decomposition                      # PCA
import sklearn.feature_extraction                 # Define image and text datas
import sklearn.feature_selection                  # Find meaningful features


In [None]:
from sklearn.model_selection import train_test_split

# `iris` is not defined anywhere else in this notebook, so load it here.
# seaborn's example dataset has the 'species' column used below
# (load_dataset downloads the data on first use).
iris = sns.load_dataset('iris')

# Features are every column except the label; the label is 'species'
X_iris, y_iris = iris.drop('species', axis=1), iris['species']

# Fixed random_state makes the train/test split reproducible
Xtrain, Xtest, ytrain, ytest = train_test_split(X_iris, y_iris,
                                                random_state=1)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier with default settings
model = GaussianNB()

# fit() returns the fitted estimator, so training on the train split and
# predicting labels for the held-out split can be chained in one expression
y_model = model.fit(Xtrain, ytrain).predict(Xtest)

In [None]:
# Reference sketch: load the downloaded SBOL into a Document object and print
# every component definition's display ID and DNA sequence.
# NOTE(review): `sbol` is not imported in this notebook (the `import sbol`
# line in the imports cell is commented out), so this cell raises NameError
# as written.
doc = sbol.Document()
# NOTE(review): `response.content` is the raw body of the last HTTP response
# (bytes) — TODO confirm that Document.read accepts in-memory content rather
# than a file path.
doc.read(response.content)

for component in doc.componentDefinitions:
    print(f'Display ID: {component.displayId}')
    # presumably `sequence` resolves to an object with an `elements` string;
    # verify against the sbol library version actually installed
    print(f'DNA Sequence: {component.sequence.elements}')

In [None]:
from sklearn.metrics import accuracy_score

# Compare the held-out labels with the model's predictions; left as the
# cell's final expression so the notebook displays the score
accuracy_score(y_true=ytest, y_pred=y_model)