# PUMA: Promoter Unraveling through Machine-learning Algorithms
<img src="https://raw.githubusercontent.com/CarolusVitalis/PUMA/main/Images/PUMA_Logo.png" alt="PUMA Logo" width="200"/>


### _AI model to identify promoter sequences in existing databases_

### Dependencies

Here we check whether the dependencies are installed and, if they are not, ask the user whether to install them.

In [None]:
import subprocess
import sys

# List of dependencies (pip distribution names) checked by the loop below.
# "requests" is included because the SynBioHub cells import it.
dependencies = ["pandas", "scikit-learn", "seaborn", "plotnine", "requests"] #, "sbol_utilities"]

# Function to check if a module is installed
def is_module_installed(module_name):
    """Return True if ``module_name`` can be imported, False otherwise.

    ``module_name`` is a pip distribution name.  Some distributions are
    imported under a different name (``scikit-learn`` is imported as
    ``sklearn``), so the name is translated before the import is attempted;
    without this such packages would always be reported as missing.
    """
    # pip distribution name -> importable module name, for names that differ
    import_names = {"scikit-learn": "sklearn"}
    try:
        __import__(import_names.get(module_name, module_name))
    except ImportError:
        return False
    return True

# Function to install a module
def install_module(module_name):
    """Install ``module_name`` with pip, run through the current interpreter.

    Raises ``subprocess.CalledProcessError`` if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", module_name]
    subprocess.check_call(pip_command)

# Walk the dependency list; for anything missing, ask before installing it
for module in dependencies:
    if is_module_installed(module):
        print(f"The module '{module}' is already installed.")
        continue

    print(f"The module '{module}' is not installed.")
    answer = input(f"Do you want to install '{module}'? (yes/no): ")

    # Only the literal answer "yes" (any letter case) triggers an install
    if answer.lower() != "yes":
        print(f"'{module}' has not been installed.")
        continue

    install_module(module)
    print(f"'{module}' has been installed.")

We import the packages we need

In [None]:
import seaborn as sns
import getpass
import requests
import re
# import sbol3
import pandas as pd

### Accessing SynBioHub to retrieve the information
Here we log in to SynBioHub to retrieve the promoters' values. We ask the user for their credentials.

In [None]:
# Prompt for credentials (email first, then a hidden password prompt) and
# POST them to the SynBioHub login endpoint.
login_headers = {'Accept': 'text/plain'}
credentials = {
    'email': input('SynBioHub email: '),
    'password': getpass.getpass('Password: '),
}
response = requests.post(
    'https://synbiohub.org/login',
    headers=login_headers,
    data=credentials,
)

# The response body is later passed as the X-authorization header value
print(response.status_code)
print(response.content)

Here we search for all the collections in the database

In [None]:
# List the root collections, authenticating with the body of the login
# `response`.  The result is stored under its own name: rebinding `response`
# here would make the next request send this listing, rather than the login
# token, as its X-authorization header.
collections_response = requests.get(
    'https://synbiohub.org/rootCollections',
    headers={
        'Accept': 'text/plain',
        'X-authorization': response.content
        },
)

print(collections_response.status_code)
print(collections_response.content)

Here we look for a specific collection

In [None]:
# Download the SBOL document of one collection.  The X-authorization value is
# the body of whatever `response` currently holds; the headers are built
# before `response` is rebound to the new result.
collection_url = 'https://synbiohub.org/user/carolusvitalis/iGEM_2019_Distribution_Kit_Promoters/iGEM_2019_Distribution_Kit_Promoters_collection/1/sbol'
request_headers = {
    'Accept': 'text/plain',
    'X-authorization': response.content,
}
response = requests.get(collection_url, headers=request_headers)

print(response.status_code)
print(response.content)

In [None]:
import pandas as pd

# Search for DNA sequences using regular expression.
# Group 1 is the displayId without its "_sequence" suffix, group 2 the text of
# the following <sbol:elements>.  The name group is limited to [^<] so that it
# cannot run across tags: with DOTALL, a plain `.*?` would start at the first
# displayId of any object and swallow markup up to the next "*_sequence" id.
dna_sequences = re.findall(
    r'<sbol:displayId>([^<]*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>',
    response.content.decode('utf-8'),
    re.DOTALL,
)

# Create a pandas dataframe
df = pd.DataFrame(dna_sequences, columns=['Name', 'DNA Sequence'])

# Print the dataframe
print(df)


### SynBioHub (iGEM Collection)
This version uses local SBOL files from the 2019 iGEM Distribution

In [None]:
# Pattern for one SBOL Sequence object: group 1 is the displayId without its
# "_sequence" suffix, group 2 is the text of <sbol:elements>.
# The name group uses [^<] so it cannot span tags.  With a plain `.*?` under
# DOTALL the first match began at the file's first displayId (reported in this
# notebook as the collection name), and the first row, which still carried a
# real DNA sequence, was then dropped as a workaround.
_SBOL_SEQUENCE_RE = re.compile(
    r'<sbol:displayId>([^<]*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>',
    re.DOTALL,
)


def _read_sbol(path):
    """Return the full text of the file at ``path``, closing it afterwards."""
    with open(path) as handle:
        return handle.read()


promoters_2019 = _read_sbol('./SBOL_Files/iGEM_2019_Promoters_collection.xml')
rbs_2019 = _read_sbol('./SBOL_Files/iGEM_2019_RBSs_collection.xml')
cds_2019 = _read_sbol('./SBOL_Files/iGEM_2019_CDS_collection.xml')
t_2019 = _read_sbol('./SBOL_Files/iGEM_2019_Terminators_collection.xml')

# Promoters
p19_dna_sequences = _SBOL_SEQUENCE_RE.findall(promoters_2019)
p19_df = pd.DataFrame(p19_dna_sequences, columns=['Name', 'DNA Sequence'])
print(len(p19_df))

# RBS
rbs19_dna_sequences = _SBOL_SEQUENCE_RE.findall(rbs_2019)
rbs19_df = pd.DataFrame(rbs19_dna_sequences, columns=['Name', 'DNA Sequence'])
print(len(rbs19_df))

# CDS
cds19_dna_sequences = _SBOL_SEQUENCE_RE.findall(cds_2019)
cds19_df = pd.DataFrame(cds19_dna_sequences, columns=['Name', 'DNA Sequence'])
print(len(cds19_df))

# Terminators
t19_dna_sequences = _SBOL_SEQUENCE_RE.findall(t_2019)
t19_df = pd.DataFrame(t19_dna_sequences, columns=['Name', 'DNA Sequence'])
print(len(t19_df))

# Every row is kept: the tightened pattern only matches "*_sequence" ids, so
# there is no placeholder first row to remove.  Label each frame with its
# element type for training.
p19_df['Element'] = "Promoter"
rbs19_df['Element'] = "RBS"
cds19_df['Element'] = "CDS"
t19_df['Element'] = "Terminator"

Load in target dataset to apply model

In [None]:
# Read the full iGEM promoter collection; the context manager closes the file
with open('./SBOL_Files/iGEM_Promoters_collection.xml') as sbol_file:
    promoters_toValidate = sbol_file.read()

# Group 1: displayId without "_sequence"; group 2: <sbol:elements> text.
# [^<] keeps the name inside a single tag.  The earlier `.*?` let the first
# match start at the file's first displayId, so the first row had a bogus name
# and was dropped even though its DNA sequence was real.
allPromoters_dna_sequences = re.findall(
    r'<sbol:displayId>([^<]*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>',
    promoters_toValidate,
    re.DOTALL,
)

# Create a pandas dataframe (all rows kept; no placeholder row to remove)
allPromoters_df = pd.DataFrame(allPromoters_dna_sequences, columns=['Name', 'DNA Sequence'])
print(len(allPromoters_df))

Concatenate the promoter, RBS, CDS, and terminator dataframes for training

In [None]:
# Stack the four labelled frames into one training table with a fresh 0..n-1 index
training_frames = [p19_df, rbs19_df, cds19_df, t19_df]
pro_rbs_cds_ter_df = pd.concat(training_frames, ignore_index=True)
print(pro_rbs_cds_ter_df)


1. Vectorizing sequences into k-mers

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Character-level CountVectorizer restricted to n-grams of length 3, i.e.
# each sequence becomes a vector of 3-mer counts.
vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char')

# Learn the k-mer vocabulary on the combined frame and build a dense matrix
kmer_counts = vectorizer.fit_transform(pro_rbs_cds_ter_df['DNA Sequence'])
X = kmer_counts.toarray()

# Target labels: the element type of each sequence
y = pro_rbs_cds_ter_df['Element']

2. Splitting training and test set

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

3. Train the model

In [None]:
# Fit a Gaussian naive Bayes classifier on the training split
model = GaussianNB()
model.fit(X=X_train, y=y_train)

4. Validate

In [None]:
# Score the classifier on the held-out split
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)


5. Apply to full dataset to classify promoters

In [None]:
# Encode the target sequences with the vocabulary learned during training
target_counts = vectorizer.transform(allPromoters_df['DNA Sequence'])
X_new = target_counts.toarray()

# Predict an element type for every target sequence and store it on the frame
predictions_new = model.predict(X_new)
allPromoters_df['PredictedType'] = predictions_new

# Split the frame by whether the prediction is 'Promoter'
is_promoter = allPromoters_df['PredictedType'] == 'Promoter'
valid_promoters_df = allPromoters_df[is_promoter]
print(len(allPromoters_df))
print(len(valid_promoters_df))

non_promoters_df = allPromoters_df[~is_promoter]
print(len(non_promoters_df))
print(non_promoters_df)



In [None]:
# Re-run the model over the training table itself, with the true labels removed
training_nolabels = pro_rbs_cds_ter_df.drop(columns='Element')

training_counts = vectorizer.transform(training_nolabels['DNA Sequence'])
X_old = training_counts.toarray()

# Predict and attach the predicted element type
predictions_old = model.predict(X_old)
training_nolabels['PredictedType'] = predictions_old

# Rows predicted as promoters
predicted_promoter = training_nolabels['PredictedType'] == 'Promoter'
valid_promoters_old_df = training_nolabels[predicted_promoter]
print(len(pro_rbs_cds_ter_df))
print(len(valid_promoters_old_df))

# Rows predicted as anything else (rebinds non_promoters_df)
non_promoters_df = training_nolabels[~predicted_promoter]
print(len(non_promoters_df))
print(non_promoters_df)


### SynBioHub (iGEM Collection) + RegulonDB
This version uses both local SBOL files from the 2019 iGEM Distribution, and files from the RegulonDB.

In [None]:
import pandas as pd

# Load the RegulonDB promoter and terminator tables
pro_df = pd.read_csv('./RegulonDB_Files/promoters_Data.csv')
ter_df = pd.read_csv('./RegulonDB_Files/terminators_Data.csv')

print(pro_df)
print(ter_df)

# All rows are kept.  The first-row drop used for the SBOL frames is not
# applied here: read_csv has already consumed the header line, so row 0 is a
# data record.  This matches the later section of the notebook, which reads
# the same CSV files without dropping a row.
pro_df['Element'] = "Promoter"
ter_df['Element'] = "Terminator"

print(pro_df)
print(ter_df)

# Pattern for one SBOL Sequence object: group 1 is the displayId without its
# "_sequence" suffix, group 2 is the text of <sbol:elements>.
# [^<] stops the name group from spanning tags.  With `.*?` under DOTALL the
# first match started at the file's first displayId, giving the first row a
# bogus name; that row (with a real DNA sequence) was then dropped.
_SBOL_SEQUENCE_RE = re.compile(
    r'<sbol:displayId>([^<]*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>',
    re.DOTALL,
)


def _read_sbol(path):
    """Return the full text of the file at ``path``, closing it afterwards."""
    with open(path) as handle:
        return handle.read()


def _sequence_frame(sbol_text):
    """Return (matches, DataFrame) of the name/sequence pairs found in ``sbol_text``."""
    matches = _SBOL_SEQUENCE_RE.findall(sbol_text)
    return matches, pd.DataFrame(matches, columns=['Name', 'DNA Sequence'])


promoters_2019 = _read_sbol('./SBOL_Files/iGEM_2019_Promoters_collection.xml')
rbs_2019 = _read_sbol('./SBOL_Files/iGEM_2019_RBSs_collection.xml')
cds_2019 = _read_sbol('./SBOL_Files/iGEM_2019_CDS_collection.xml')
t_2019 = _read_sbol('./SBOL_Files/iGEM_2019_Terminators_collection.xml')

# Promoters
p19_dna_sequences, p19_df = _sequence_frame(promoters_2019)
print(len(p19_df))

# RBS
rbs19_dna_sequences, rbs19_df = _sequence_frame(rbs_2019)
print(len(rbs19_df))

# CDS
cds19_dna_sequences, cds19_df = _sequence_frame(cds_2019)
print(len(cds19_df))

# Terminators
t19_dna_sequences, t19_df = _sequence_frame(t_2019)
print(len(t19_df))

# No first-row drop is needed with the tightened pattern; label each frame
# with its element type.
p19_df['Element'] = "Promoter"
rbs19_df['Element'] = "RBS"
cds19_df['Element'] = "CDS"
t19_df['Element'] = "Terminator"

In [None]:
# Combine the RegulonDB and iGEM 2019 frames, then discard rows without a sequence
combined_frames = [pro_df, p19_df, rbs19_df, cds19_df, t19_df, ter_df]
pro_ter_df = pd.concat(combined_frames, ignore_index=True).dropna(subset=['DNA Sequence'])
print(pro_ter_df)

In [None]:
# Disabled export step: the code below is wrapped in a string literal, so none
# of it runs. Unquoted, it would write pro_ter_df to ./csv/pro_ter.csv.
'''
import os

output_dir = "./csv"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

file_path = os.path.join(output_dir, "pro_ter.csv")
pro_ter_df.to_csv(file_path, index=False)
print(f"Exported DataFrame as CSV: {file_path}")
'''

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# The k-mer length is chosen interactively; int() raises ValueError on
# non-numeric input.
k_mer_size = int(input("Enter the size of the k-mers: "))

# Same value for both bounds, so only k-mers of exactly that length are counted
kmer_range = (k_mer_size, k_mer_size)
vectorizer = CountVectorizer(ngram_range=kmer_range, analyzer='char')

# Learn the vocabulary on the combined frame and build dense features / labels
kmer_counts = vectorizer.fit_transform(pro_ter_df['DNA Sequence'])
X = kmer_counts.toarray()
y = pro_ter_df['Element']

In [None]:
# 75/25 train/test split with a fixed seed so reruns give the same partition
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Train a Gaussian naive Bayes classifier on the k-mer counts
model = GaussianNB()
model.fit(X=X_train, y=y_train)

In [None]:
# Accuracy on the held-out 25%
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)


In [None]:
# Encode the iGEM 2019 promoter sequences with the fitted vocabulary
p19_counts = vectorizer.transform(p19_df['DNA Sequence'])
X_new = p19_counts.toarray()

# Predict an element type per sequence and record it on p19_df
predictions_new = model.predict(X_new)
p19_df['PredictedType'] = predictions_new

# Partition by whether the prediction is 'Promoter'
is_promoter = p19_df['PredictedType'] == 'Promoter'
valid_promoters_df = p19_df[is_promoter]
non_promoters_df = p19_df[~is_promoter]

print(len(p19_df))
print(len(valid_promoters_df))
print("Valid Promoters")
print(valid_promoters_df)

print("Non-Promoters")
print(len(non_promoters_df))
print(non_promoters_df)




In [None]:
# Predict over the whole combined table with the true labels stripped
training_nolabels = pro_ter_df.drop(columns='Element')

training_counts = vectorizer.transform(training_nolabels['DNA Sequence'])
X_old = training_counts.toarray()

predictions_old = model.predict(X_old)
training_nolabels['PredictedType'] = predictions_old

# Count how many rows the model labels as promoters
predicted_promoter = training_nolabels['PredictedType'] == 'Promoter'
valid_promoters_old_df = training_nolabels[predicted_promoter]
print(len(pro_ter_df))
print(len(valid_promoters_old_df))

In [None]:
# RegulonDB promoters only: strip the label column and rows lacking a sequence
training_nolabels = pro_df.drop(columns='Element').dropna(subset=['DNA Sequence'])

regulon_counts = vectorizer.transform(training_nolabels['DNA Sequence'])
X_old = regulon_counts.toarray()

# Predict and attach the predicted element type
predictions_old = model.predict(X_old)
training_nolabels['PredictedType'] = predictions_old

# len(pro_df) is the row count before the dropna above
predicted_promoter = training_nolabels['PredictedType'] == 'Promoter'
valid_promoters_old_df = training_nolabels[predicted_promoter]
print(len(pro_df))
print(len(valid_promoters_old_df))

### iGEM Repository + RegulonDB
This version uses local csv files from the iGEM Repository and the RegulonDB.

In [None]:
import pandas as pd

# --- RegulonDB tables ---
pro_df = pd.read_csv('./RegulonDB_Files/promoters_Data.csv')
ter_df = pd.read_csv('./RegulonDB_Files/terminators_Data.csv')
gen_df = pd.read_csv('./RegulonDB_Files/genes_Data.csv')

for frame in (pro_df, ter_df, gen_df):
    print(frame)

# Label every row with its element type (no rows are dropped from these frames)
for frame, label in ((pro_df, "Promoter"), (ter_df, "Terminator"), (gen_df, "Gene")):
    frame['Element'] = label

for frame in (pro_df, ter_df, gen_df):
    print(frame)

# --- iGEM Repository tables ---
pro_2023 = pd.read_csv('./csv/Promoter_Part_Sequences.csv')
rbs_2023 = pd.read_csv('./csv/RBS_Part_Sequences.csv')
cds_2023 = pd.read_csv('./csv/CDS_Part_Sequences.csv')
ter_2023 = pd.read_csv('./csv/Terminator_Part_Sequences.csv')
bkb_2023 = pd.read_csv('./csv/Backbone_Part_Sequences.csv')

for frame in (pro_2023, rbs_2023, cds_2023, ter_2023, bkb_2023):
    print(frame)

# Label each iGEM frame in place; as above, all rows are kept
igem_labels = (
    (pro_2023, "Promoter"),
    (rbs_2023, "RBS"),
    (cds_2023, "CDS"),
    (ter_2023, "Terminator"),
    (bkb_2023, "Backbone"),
)
for frame, label in igem_labels:
    frame['Element'] = label

In [None]:
# Merge the RegulonDB and iGEM frames and drop rows that have no sequence.
# NOTE(review): gen_df is loaded and labelled above but is not part of this
# list - confirm whether that is intentional.
part_frames = [pro_df, ter_df, pro_2023, rbs_2023, cds_2023, ter_2023, bkb_2023]
all_parts_df = pd.concat(part_frames, ignore_index=True).dropna(subset=['DNA Sequence'])
print(all_parts_df)

In [None]:
# Disabled export step: the code below is wrapped in a string literal, so none
# of it runs. Unquoted, it would write all_parts_df to ./csv/all_parts.csv.
'''
import os

output_dir = "./csv"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

file_path = os.path.join(output_dir, "all_parts.csv")
all_parts_df.to_csv(file_path, index=False)
print(f"Exported DataFrame as CSV: {file_path}")
'''

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Read the k-mer length from the user; int() raises ValueError on
# non-numeric input.
k_mer_size = int(input("Enter the size of the k-mers: "))

# Count only character n-grams of exactly k_mer_size
kmer_range = (k_mer_size, k_mer_size)
vectorizer = CountVectorizer(ngram_range=kmer_range, analyzer='char')

# Fit on every part sequence; dense features in X, element labels in y
kmer_counts = vectorizer.fit_transform(all_parts_df['DNA Sequence'])
X = kmer_counts.toarray()
y = all_parts_df['Element']

In [None]:
# Reserve a quarter of the rows for evaluation, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Gaussian naive Bayes fitted on the training portion
model = GaussianNB()
model.fit(X=X_train, y=y_train)

In [None]:
# Evaluate on the rows withheld from training
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)


In [None]:
# Encode the iGEM Repository promoter sequences with the fitted vocabulary
pro_2023_counts = vectorizer.transform(pro_2023['DNA Sequence'])
X_new = pro_2023_counts.toarray()

# Predict an element type per row and store it on pro_2023
predictions_new = model.predict(X_new)
pro_2023['PredictedType'] = predictions_new

# Partition by whether the prediction is 'Promoter'
is_promoter = pro_2023['PredictedType'] == 'Promoter'
valid_promoters_df = pro_2023[is_promoter]
non_promoters_df = pro_2023[~is_promoter]

print(len(pro_2023))
print(len(valid_promoters_df))
print("Valid Promoters")
print(valid_promoters_df)

print("Non-Promoters")
print(len(non_promoters_df))
print(non_promoters_df)

In [None]:
# Predict over the full parts table with the true labels removed
training_nolabels = all_parts_df.drop(columns='Element')

parts_counts = vectorizer.transform(training_nolabels['DNA Sequence'])
X_old = parts_counts.toarray()

predictions_old = model.predict(X_old)
training_nolabels['PredictedType'] = predictions_old

# Partition by whether the prediction is 'Promoter'
predicted_promoter = training_nolabels['PredictedType'] == 'Promoter'
valid_promoters_old_df = training_nolabels[predicted_promoter]
non_promoters_old_df = training_nolabels[~predicted_promoter]

print(len(all_parts_df))
print(len(valid_promoters_old_df))
print("Valid Promoters")
print(valid_promoters_old_df)

print("Non-Promoters")
print(len(non_promoters_old_df))
print(non_promoters_old_df)

In [None]:
# Read the iGEM promoter collection; the context manager closes the file
with open('./SBOL_Files/iGEM_Promoters_collection.xml') as sbol_file:
    igem_pro_file = sbol_file.read()

# Search for DNA sequences using regular expression.
# Group 1: displayId without "_sequence"; group 2: <sbol:elements> text.
# [^<] keeps the name inside one tag.  The earlier `.*?` (with DOTALL) let the
# first match begin at the file's first displayId, so the first row carried a
# bogus name and was dropped even though its DNA sequence was real.
igem_pro_dna_sequences = re.findall(
    r'<sbol:displayId>([^<]*?)_sequence</sbol:displayId>.*?<sbol:elements>(.*?)</sbol:elements>',
    igem_pro_file,
    re.DOTALL,
)

# Create a pandas dataframe (all rows kept; no placeholder row to remove)
igem_pro_df = pd.DataFrame(igem_pro_dna_sequences, columns=['Name', 'DNA Sequence'])
print(len(igem_pro_df))

# add in promoter label for these seqs
igem_pro_df['Element'] = "Promoter"

In [None]:
# Work on a copy without the label column, skipping rows that lack a sequence
igem_pro_nolabels = igem_pro_df.drop(columns='Element').dropna(subset=['DNA Sequence'])

igem_counts = vectorizer.transform(igem_pro_nolabels['DNA Sequence'])
X_old = igem_counts.toarray()

# Predict an element type per sequence and attach it
predictions_old = model.predict(X_old)
igem_pro_nolabels['PredictedType'] = predictions_old

# Partition by whether the prediction is 'Promoter'
predicted_promoter = igem_pro_nolabels['PredictedType'] == 'Promoter'
valid_igem_promoters_df = igem_pro_nolabels[predicted_promoter]
non_promoters_igem_df = igem_pro_nolabels[~predicted_promoter]

print(len(igem_pro_df))
print(len(valid_igem_promoters_df))
print("Valid Promoters")
print(valid_igem_promoters_df)

print("Non-Promoters")
print(len(non_promoters_igem_df))
print(non_promoters_igem_df)

In [None]:
import os

# Write the sequences not predicted as promoters to ./csv/igem_non_promoters.csv
output_dir = "./csv"
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# avoiding the separate existence check before creation
os.makedirs(output_dir, exist_ok=True)

file_path = os.path.join(output_dir, "igem_non_promoters.csv")
non_promoters_igem_df.to_csv(file_path, index=False)
print(f"Exported DataFrame as CSV: {file_path}")