# PUMA: Promoter Unraveling through Machine-learning Algorithms
<img src="https://raw.githubusercontent.com/CarolusVitalis/PUMA/main/Images/PUMA_Logo.png" alt="PUMA Logo" width="200"/>


### _AI model to identify promoter sequences in existing databases_

### Dependencies

Here, we check whether the dependencies are installed and, if any are missing, ask the user whether to install them.

In [None]:
import subprocess
import sys

# Packages the dependency check below looks for, one entry per package
dependencies = [
    "pandas",
    "scikit-learn",
    "seaborn",
    "plotnine",
    "sbol_utilities",
]

# Function to check if a module is installed
def is_module_installed(module_name):
    """Return True if ``module_name`` is available in this environment.

    ``module_name`` may be an importable module name (e.g. ``"pandas"``) or
    the name of an installed distribution whose import name differs
    (e.g. ``"scikit-learn"``, which is imported as ``sklearn``).

    Args:
        module_name: Module or distribution name to look for.

    Returns:
        True if the name can be imported or matches an installed
        distribution, False otherwise.
    """
    try:
        __import__(module_name)
        return True
    except ImportError:
        # Not importable under this name; it may still be a distribution
        # name, so fall through to the metadata lookup.
        pass

    try:
        from importlib import metadata
    except ImportError:
        # importlib.metadata is unavailable on older interpreters; keep the
        # import-only answer there.
        return False

    try:
        metadata.distribution(module_name)
    except metadata.PackageNotFoundError:
        return False
    return True

# Function to install a module
def install_module(module_name):
    """Install ``module_name`` with pip, run through the current interpreter.

    Raises:
        subprocess.CalledProcessError: if the pip command exits with a
            non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", module_name]
    subprocess.check_call(pip_command)

# Check each dependency
for module in dependencies:
    if is_module_installed(module):
        print(f"The module '{module}' is already installed.")
        continue

    print(f"The module '{module}' is not installed.")
    answer = input(f"Do you want to install '{module}'? (yes/no): ")

    # Accept "y" as well as "yes", ignoring case and surrounding whitespace
    if answer.strip().lower() not in ("y", "yes"):
        print(f"'{module}' has not been installed.")
        continue

    try:
        install_module(module)
    except subprocess.CalledProcessError as error:
        # Report the failure and keep going so the remaining dependencies
        # are still checked.
        print(f"Installing '{module}' failed (pip exit code {error.returncode}).")
    else:
        print(f"'{module}' has been installed.")

We import the packages we need

In [None]:
import seaborn as sns
import getpass
import requests
import re
import sbol3
import pandas as pd

### Accessing SynBioHub to retrieve the information

Then, we log in to SynBioHub to retrieve the promoters' values. We ask the user for their credentials.

In [None]:
# Prompt for the email, then the password (not echoed), and post both to the
# SynBioHub login endpoint. The body of this response is what the following
# requests send in their 'X-authorization' header.
response = requests.post(
    'https://synbiohub.org/login',
    headers={'Accept': 'text/plain'},
    data={
        'email': input('SynBioHub email: '),
        'password': getpass.getpass('Password: '),
    },
)

# HTTP status on the first line, raw response body on the second
print(response.status_code, response.content, sep='\n')

Here we search for all the collections in the database

In [None]:
# Store the result under its own name instead of rebinding `response`.
# `response` still holds the login response, whose body the next request
# (for a specific collection) also sends as its 'X-authorization' header.
# Overwriting it here would make that request send the collection listing
# instead of the login body.
root_collections_response = requests.get(
    'https://synbiohub.org/rootCollections',
    headers={
        'Accept': 'text/plain',
        'X-authorization': response.content
        },
)

print(root_collections_response.status_code)
print(root_collections_response.content)

Here we look for a specific collection

In [None]:
# Request the SBOL document of one collection. The result replaces
# `response`; its body is parsed for sequences further down.
response = requests.get(
    'https://synbiohub.org/user/carolusvitalis/iGEM_2019_Distribution_Kit_Promoters/iGEM_2019_Distribution_Kit_Promoters_collection/1/sbol',
    headers={'Accept': 'text/plain', 'X-authorization': response.content},
)

# HTTP status on the first line, raw response body on the second
print(response.status_code, response.content, sep='\n')

In [None]:
import pandas as pd

# Search for DNA sequences using a regular expression.
# The name group is `[^<]*?`, not `.*?`: under re.DOTALL a `.*?` group can run
# across tags, so the first match would begin at the first <sbol:displayId>
# in the document (whatever object it belongs to) and its Name would hold
# that id plus all the XML up to the first "..._sequence" id.
dna_sequences = re.findall(
    r'<sbol:displayId>([^<]*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>',
    response.content.decode('utf-8'),
    re.DOTALL,
)

# Create a pandas dataframe: one row per (name, sequence) pair
df = pd.DataFrame(dna_sequences, columns=['Name', 'DNA Sequence'])

# Print the dataframe
print(df)


#### Alternative version to run it using local SBOL files instead of SynBioHub


## Load in training data

In [None]:
# Read the local SBOL (XML) files. `with` closes each file after reading, and
# the encoding is explicit (UTF-8, matching how the SynBioHub response is
# decoded) instead of depending on the platform default.
with open('./SBOL_Files/iGEM_2019_Promoters_collection.xml', encoding='utf-8') as sbol_file:
    promoters_2019 = sbol_file.read()

with open('./SBOL_Files/iGEM_2019_RBSs_collection.xml', encoding='utf-8') as sbol_file:
    rbs_2019 = sbol_file.read()

# Pattern for (name, DNA sequence) pairs.
# The name group is `[^<]*?`, not `.*?`: under re.DOTALL a `.*?` group can run
# across tags, so the first match used to begin at the first <sbol:displayId>
# in the file. That is why the first Name looked like the collection name.
sequence_pattern = re.compile(
    r'<sbol:displayId>([^<]*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>',
    re.DOTALL,
)

# Promoters: one dataframe row per extracted sequence
p19_dna_sequences = sequence_pattern.findall(promoters_2019)
p19_df = pd.DataFrame(p19_dna_sequences, columns=['Name', 'DNA Sequence'])
print(len(p19_df))

# RBSs: same extraction
rbs19_dna_sequences = sequence_pattern.findall(rbs_2019)
rbs19_df = pd.DataFrame(rbs19_dna_sequences, columns=['Name', 'DNA Sequence'])
print(len(rbs19_df))

# The first row is no longer dropped. With the tightened pattern its Name is
# a real sequence name, and its DNA sequence was always a real one, so
# dropping it would discard a training example.

# add in the class label for these seqs
p19_df['Element'] = "Promoter"
rbs19_df['Element'] = "RBS"


## Load in target dataset to apply model

In [None]:
# Read the SBOL (XML) file holding the sequences to classify. `with` closes
# the file after reading; the encoding is explicit rather than the platform
# default.
with open('./SBOL_Files/iGEM_Promoters_collection.xml', encoding='utf-8') as sbol_file:
    promoters_toValidate = sbol_file.read()

# The name group is `[^<]*?`, not `.*?`: under re.DOTALL a `.*?` group can run
# across tags, so the first match used to begin at the first <sbol:displayId>
# in the file and carry a polluted Name.
allPromoters_dna_sequences = re.findall(
    r'<sbol:displayId>([^<]*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>',
    promoters_toValidate,
    re.DOTALL,
)

# Create a pandas dataframe.
# The first row is kept: with the tightened pattern it is a regular sequence
# row, so dropping it would discard a real sequence.
allPromoters_df = pd.DataFrame(allPromoters_dna_sequences, columns=['Name', 'DNA Sequence'])
print(len(allPromoters_df))

## Concatenate promoter and RBS dataframes for training

In [None]:
# Stack the RBS rows first and the promoter rows after them; ignore_index
# renumbers the combined rows from 0.
promoters_rbs_df = pd.concat(
    objs=[rbs19_df, p19_df],
    ignore_index=True,
)
print(promoters_rbs_df)


## 1. Vectorizing sequences into k-mers

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Use CountVectorizer to split each sequence into k-mers (character n-grams)
# and count them.
# k=3 for the first pass; change kmer_size to try a different k-mer length.
kmer_size = 3
vectorizer = CountVectorizer(analyzer='char', ngram_range=(kmer_size, kmer_size))

# Fit the vectorizer on the concatenated dataframe and convert the counts to
# a dense array; the labels come from the 'Element' column.
X = vectorizer.fit_transform(promoters_rbs_df['DNA Sequence']).toarray()
y = promoters_rbs_df['Element']

## 2. Splitting training and test set

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable. Both values can be tweaked if needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## 3. Train the model

In [None]:
# Gaussian Naive Bayes classifier fitted on the training split
model = GaussianNB()
model.fit(X=X_train, y=y_train)

## 4. Validate

In [None]:
# Predict labels for the held-out rows and compare them with the true labels
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)


## 5. Apply to full dataset to classify promoters

In [None]:
# Encode the target sequences with the already-fitted vectorizer
X_new = vectorizer.transform(allPromoters_df['DNA Sequence']).toarray()

# Predict a sequence type for every target row
predictions_new = model.predict(X_new)

# Store the prediction next to each sequence
allPromoters_df['PredictedType'] = predictions_new

# Keep only the rows predicted to be promoters
valid_promoters_df = allPromoters_df.loc[allPromoters_df['PredictedType'] == 'Promoter']

# Total number of target rows, then how many were predicted as promoters
print(len(allPromoters_df), len(valid_promoters_df), sep='\n')


In [None]:
# Copy of the training dataframe without its 'Element' label column
training_nolabels = promoters_rbs_df.drop(columns='Element')

# Encode the training sequences with the already-fitted vectorizer
X_old = vectorizer.transform(training_nolabels['DNA Sequence']).toarray()

# Predict a sequence type for every training row
predictions_old = model.predict(X_old)

# Store the prediction next to each sequence
training_nolabels['PredictedType'] = predictions_old

# Keep only the rows predicted to be promoters
valid_promoters_old_df = training_nolabels.loc[training_nolabels['PredictedType'] == 'Promoter']

# Total number of training rows, then how many were predicted as promoters
print(len(promoters_rbs_df), len(valid_promoters_old_df), sep='\n')

In [None]:
# NOTE(review): this cell repeats the previous cell step for step; confirm
# whether both are needed.

# Training dataframe with the 'Element' label column removed
training_nolabels = promoters_rbs_df.drop(columns=['Element'])

# k-mer counts for the training sequences, from the fitted vectorizer
training_sequences = training_nolabels['DNA Sequence']
X_old = vectorizer.transform(training_sequences).toarray()

# Model predictions for those rows, stored in a new column
predictions_old = model.predict(X_old)
training_nolabels['PredictedType'] = predictions_old

# Rows predicted to be promoters
valid_promoters_old_df = training_nolabels[training_nolabels['PredictedType'] == 'Promoter']

# Number of training rows, then number predicted as promoters
print(len(promoters_rbs_df))
print(len(valid_promoters_old_df))