In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout, Flatten, Dense, Input, GaussianNoise
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam



In [2]:
# Load the five source tables from one data directory.
# The directory can be overridden with the PROMOTER_DATA_DIR environment
# variable so the notebook runs on another machine without editing every path;
# the default is the location the notebook was originally written against.
DATA_DIR = Path(os.environ.get(
    "PROMOTER_DATA_DIR",
    '/Users/bindusamba/Documents/GitHub/Master-s-project/csv',
))

gene_df = pd.read_csv(DATA_DIR / 'Gene.csv')
mainsheet_df = pd.read_csv(DATA_DIR / 'Mainset.csv')
product_df = pd.read_csv(DATA_DIR / 'Product.csv')
promoter_df = pd.read_csv(DATA_DIR / 'Promoter.csv')
species_df = pd.read_csv(DATA_DIR / 'Species.csv')

In [3]:
# Preview the first rows of every loaded table, one heading per table.
# Headings after the first start with a newline to separate the previews.
table_previews = (
    ("Gene Data:", gene_df),
    ("\nMainSheet Data:", mainsheet_df),
    ("\nProduct Data:", product_df),
    ("\nPromoter Data:", promoter_df),
    ("\nSpecies Data:", species_df),
)
for heading, table in table_previews:
    print(heading)
    print(table.head())

Gene Data:
   GeneID GeneName GeneSynonyms  GenePositionStart  GenePositionEnd  \
0       1     acrD  b2470, yffA            2585617          2588730   
1       2     actP  b4067, yjcG            4282925          4281276   
2       3     hisP        b2306            2422531          2421758   
3       4    hisM         b2307            2423255          2422539   
4       5    hisQ         b2308            2423938          2423252   

  GeneStrand  GeneNCBI  ProductID  
0          +  945464.0          1  
1          -  948575.0          2  
2          +  946789.0          3  
3          -  946790.0          4  
4          -  947235.0          5  

MainSheet Data:
   MainsetID  PromoterID  GeneID  SpeciesID
0          1           1       1          1
1          2           2       2          1
2          3           3       3          1
3          4           3       4          1
4          5           3       5          1

Product Data:
   ProductID ProductType                          

In [4]:
# Drop rows without a promoter sequence and make sure every sequence is a string
promoter_df = promoter_df.dropna(subset=["PromoterSequence"])
promoter_df["PromoterSequence"] = promoter_df["PromoterSequence"].astype(str)

# Extract promoter sequences
promoter_sequences = promoter_df["PromoterSequence"].values

# Check sequence lengths
sequence_lengths = [len(seq) for seq in promoter_sequences]

# Print up to the first 5 sequences and their lengths.
# Slicing (instead of indexing over a fixed range(5)) avoids an IndexError
# when fewer than 5 rows are left after dropping missing sequences.
for i, (seq, seq_len) in enumerate(zip(promoter_sequences[:5], sequence_lengths[:5])):
    print(f"Sequence {i+1}: {seq} (Length: {seq_len})")

Sequence 1: atttacattaactcctttttttctccacgattggctcgtaccttgccgctacagtgaagcAagtcaagcctacaacgatac (Length: 81)
Sequence 2: gatctcctttgttctcaccggtatctacatctggcgggcgaacggcgaattcgaccgtctTaataatgaagtcctgcatga (Length: 81)
Sequence 3: tatttaacgttgaatgttactgttgtcgtcaagatggcataagacctgcatgaaagagccTgcaaacacacaacacaatac (Length: 81)
Sequence 4: aaaaagcagtatttcggcgagtagcgcagcttggtagcgcaactggtttgggaccagtggGtcggaggttcgaatcctctc (Length: 81)
Sequence 5: tagcctccgccgtttatgcacttttatcactggctggcacgaaccctgcaatctacatttAcagcgcaaacattacttatt (Length: 81)


In [5]:
# Step 1: Re-read the promoter table, keep only rows that have a sequence,
# and normalise every remaining sequence to an upper-case string
promoter_df = (
    pd.read_csv('/Users/bindusamba/Documents/GitHub/Master-s-project/csv/Promoter.csv')
    .dropna(subset=["PromoterSequence"])
    .assign(PromoterSequence=lambda frame: frame["PromoterSequence"].astype(str).str.upper())
)

# Force every sequence to a fixed width: cut off the excess, fill any gap with 'N'
def fix_sequence(seq, length=81):
    """Return ``seq`` truncated or right-padded with 'N' to ``length`` characters.

    Args:
        seq: Sequence string to normalise.
        length: Target width (81 by default).

    Returns:
        The first ``length`` characters of ``seq`` when it is long enough,
        otherwise ``seq`` followed by as many 'N' characters as are missing.
    """
    missing = length - len(seq)
    if missing > 0:
        # Too short: append one 'N' per missing position
        return seq + 'N' * missing
    # Long enough (or exact): keep only the leading part
    return seq[:length]

# Normalise every promoter sequence to the fixed width, element by element
promoter_df["PromoterSequence"] = promoter_df["PromoterSequence"].map(fix_sequence)

# Step 2: One-Hot Encoding Function
def one_hot_encode(seq):
    """One-hot encode a nucleotide string as a ``(len(seq), 4)`` integer array.

    Columns are ordered A, C, G, T. The input is upper-cased first; 'N' and any
    other character outside A/C/G/T become an all-zero row.

    Args:
        seq: Nucleotide sequence as a string.

    Returns:
        numpy.ndarray of shape ``(len(seq), 4)`` with integer dtype. An empty
        string yields an empty ``(0, 4)`` array.
    """
    mapping = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1], 'N': [0, 0, 0, 0]}
    unknown = [0, 0, 0, 0]
    rows = [mapping.get(nuc, unknown) for nuc in seq.upper()]
    # Explicit dtype + reshape keep the result 2-D and integer even when `rows`
    # is empty (a bare np.array([]) would be a 1-D float array).
    return np.array(rows, dtype=int).reshape(-1, 4)

# Step 3: Encode every sequence and keep the flattened one-hot vector as a plain list
promoter_df["OneHotEncoded"] = promoter_df["PromoterSequence"].map(
    lambda sequence: one_hot_encode(sequence).ravel().tolist()
)

# Step 4: Write the table, including the new column, to 'promoter.csv' in the working directory
promoter_df.to_csv("promoter.csv", index=False)

# Confirm the write and show a short preview of the result
print("✅ One-hot encoding added as a new column 'OneHotEncoded' in 'promoter.csv'")
print("First 5 rows of the updated DataFrame:")
print(promoter_df.head())


✅ One-hot encoding added as a new column 'OneHotEncoded' in 'promoter.csv'
First 5 rows of the updated DataFrame:
   PromoterID PromoterName  PromoterPosition PromoterStrand  \
0           1       acrDp2           2585572              +   
1           2        actPp           4282958              -   
2           3        argTp           2425871              +   
3           4         aslB           3980804              +   
4           5       astCp2           1830068              +   

                                    PromoterSequence PromoterMotifa  \
0  ATTTACATTAACTCCTTTTTTTCTCCACGATTGGCTCGTACCTTGC...         tggctc   
1  GATCTCCTTTGTTCTCACCGGTATCTACATCTGGCGGGCGAACGGC...         tggcgg   
2  TATTTAACGTTGAATGTTACTGTTGTCGTCAAGATGGCATAAGACC...         tggcat   
3  AAAAAGCAGTATTTCGGCGAGTAGCGCAGCTTGGTAGCGCAACTGG...         tggtag   
4  TAGCCTCCGCCGTTTATGCACTTTTATCACTGGCTGGCACGAACCC...         tggcac   

  PromoterMotifb                                   BriefDescription  \
0        

In [6]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout, Flatten, Dense, BatchNormalization, GlobalMaxPooling1D, GaussianNoise
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from xgboost import XGBClassifier


In [7]:


# === Load the encoded table and derive the target ===
df = pd.read_csv("promoter.csv")
# The CSV holds each one-hot vector as the text of a Python list; parse it back
df["OneHotEncoded"] = df["OneHotEncoded"].map(ast.literal_eval)
# Binary target: 1 when PromoterStrand is "+", 0 otherwise
df["Label"] = df["PromoterStrand"].eq("+").astype(int)

# === Features: one row per promoter, reshaped to (samples, 81 positions, 4 channels) ===
X_flat = np.asarray(df["OneHotEncoded"].tolist())
X_seq = X_flat.reshape(X_flat.shape[0], 81, 4)
y = df["Label"].to_numpy()

# === Stratified 80/20 train/test split ===
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# === Balanced class weights, keyed by position in the sorted unique labels ===
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {index: weight for index, weight in enumerate(class_weights)}

# === Improved CNN model ===
def build_cnn(input_shape=(81, 4), learning_rate=0.0005):
    """Build and compile the 1-D convolutional binary classifier.

    The network applies Gaussian input noise, three Conv1D/BatchNormalization
    stages (128, 256 and 512 filters) with max pooling and dropout between
    them, global max pooling, a 128-unit dense layer and a single sigmoid
    output. It is compiled with Adam and binary cross-entropy.

    Args:
        input_shape: Shape of one sample, (sequence_length, channels).
            Defaults to (81, 4), the one-hot promoter encoding used here.
        learning_rate: Adam learning rate. Defaults to 0.0005.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential([
        # Declare the input with an explicit Input layer; passing input_shape=
        # to the first layer makes Keras emit a warning at construction time.
        Input(shape=input_shape),
        GaussianNoise(0.1),
        Conv1D(128, 7, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(2),
        Dropout(0.3),

        Conv1D(256, 5, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(2),
        Dropout(0.3),

        Conv1D(512, 3, activation='relu', padding='same'),
        BatchNormalization(),
        GlobalMaxPooling1D(),

        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# === Callbacks for training optimization ===
# Halve the learning rate after 3 epochs without val_loss improvement, and stop
# after 6 such epochs, restoring the weights from the best epoch.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)
early_stop = EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True)

# === Build & Train ===
cnn = build_cnn()
history = cnn.fit(
    X_train, y_train,
    epochs=40,
    batch_size=16,
    validation_split=0.2,
    class_weight=class_weights_dict,
    callbacks=[reduce_lr, early_stop],
    verbose=1
)

# === Evaluate ===
# predict() returns an (n, 1) array of probabilities; threshold at 0.5 and
# flatten so the predictions are 1-D integer labels like y_test.
y_pred = (cnn.predict(X_test) > 0.5).astype(int).ravel()
print("\n📊 CNN Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting an undefined-metric warning for each affected metric.
print(classification_report(y_test, y_pred, zero_division=0))



cnn.save("cnn_model.h5")


Epoch 1/40


  super().__init__(**kwargs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.4470 - loss: 1.4230 - val_accuracy: 0.5000 - val_loss: 0.6933 - learning_rate: 5.0000e-04
Epoch 2/40
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.4567 - loss: 1.4819 - val_accuracy: 0.5588 - val_loss: 0.6885 - learning_rate: 5.0000e-04
Epoch 3/40
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5701 - loss: 0.9566 - val_accuracy: 0.6765 - val_loss: 0.6871 - learning_rate: 5.0000e-04
Epoch 4/40
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.4970 - loss: 0.8981 - val_accuracy: 0.6471 - val_loss: 0.6906 - learning_rate: 5.0000e-04
Epoch 5/40
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5447 - loss: 0.9137 - val_accuracy: 0.6765 - val_loss: 0.6891 - learning_rate: 5.0000e-04
Epoch 6/40
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



📊 CNN Results
Accuracy: 0.5952380952380952
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        17
           1       0.60      1.00      0.75        25

    accuracy                           0.60        42
   macro avg       0.30      0.50      0.37        42
weighted avg       0.35      0.60      0.44        42



In [8]:


# Collapse each (81, 4) one-hot matrix into one flat feature vector per sample
X_flat = X_seq.reshape(X_seq.shape[0], -1)

# Same stratified 80/20 split settings as the CNN experiment
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(
    X_flat, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Candidate classifiers, keyed by the name shown in the results table
models = {
    "MLP": MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Result column -> key inside the per-class entry of the classification report
positive_metrics = (
    ("Precision_Pos", "precision"),
    ("Recall_Pos", "recall"),
    ("F1_Pos", "f1-score"),
)

results = {}

# Fit each model, then record accuracy (in percent) and the metrics of class "1"
for name, model in models.items():
    model.fit(X_train_ml, y_train_ml)
    y_pred = model.predict(X_test_ml)
    acc = accuracy_score(y_test_ml, y_pred)
    report = classification_report(y_test_ml, y_pred, output_dict=True)
    positive = report["1"]
    results[name] = {"Accuracy": round(acc * 100, 2)}
    for column, metric in positive_metrics:
        results[name][column] = round(positive[metric], 2)

# One row per model, one column per metric
import pandas as pd
results_df = pd.DataFrame(results).transpose()
print(results_df)




               Accuracy  Precision_Pos  Recall_Pos  F1_Pos
MLP               50.00           0.57        0.68    0.62
Random Forest     52.38           0.57        0.84    0.68
AdaBoost          54.76           0.61        0.68    0.64
SVM               57.14           0.59        0.96    0.73
XGBoost           59.52           0.62        0.80    0.70


In [9]:


# Load Promoter.csv and keep only rows that have both a sequence and a motif
df = pd.read_csv(
    '/Users/bindusamba/Documents/GitHub/Master-s-project/csv/Promoter.csv'
).dropna(subset=["PromoterSequence", "PromoterMotifa"])

# How many distinct motifs are there, and which are the most frequent?
label_counts = df["PromoterMotifa"].value_counts()
print("Unique Labels:", len(label_counts))
print(label_counts.head())

# Restrict the task to the 5 most frequent motifs
top_labels = list(label_counts.index[:5])
df = df.loc[df["PromoterMotifa"].isin(top_labels)]

# Upper-cased sequences are the inputs, motif strings are the targets
sequences = [str(seq).upper() for seq in df["PromoterSequence"]]
labels = [str(motif) for motif in df["PromoterMotifa"]]

# Map each motif string to an integer class id
le = LabelEncoder()
y = le.fit_transform(labels)

# Character-level tokenisation, then pad/truncate at the end to 100 tokens
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(sequences)
X = pad_sequences(tokenizer.texts_to_sequences(sequences), maxlen=100, padding='post')

# 80/20 train/test split (not stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Embedding -> Conv1D -> max pooling -> dense classifier with one softmax unit per class
vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0
n_classes = len(le.classes_)
model = Sequential([
    Input(shape=(100,)),
    Embedding(input_dim=vocab_size, output_dim=32),
    Conv1D(64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(n_classes, activation='softmax')
])

# Integer labels, so use the sparse categorical loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(X_train, y_train, epochs=15, batch_size=16, validation_split=0.1)

# Report accuracy on the held-out test split
loss, acc = model.evaluate(X_test, y_test)
print("✅ Fixed Accuracy:", round(acc, 4))


Unique Labels: 57
PromoterMotifa
tggcac    74
tggcat    36
tggcaa     9
tggcgt     8
tggcgc     6
Name: count, dtype: int64
Epoch 1/15
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.4247 - loss: 1.5028 - val_accuracy: 0.8182 - val_loss: 0.8270
Epoch 2/15
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5037 - loss: 1.3043 - val_accuracy: 0.8182 - val_loss: 0.6380
Epoch 3/15
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4287 - loss: 1.3059 - val_accuracy: 0.8182 - val_loss: 0.8350
Epoch 4/15
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5512 - loss: 1.1623 - val_accuracy: 0.8182 - val_loss: 0.7983
Epoch 5/15
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5617 - loss: 1.1395 - val_accuracy: 0.8182 - val_loss: 0.7049
Epoch 6/15
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - a

In [10]:


# Load the promoter table, keeping only rows with both a sequence and a motif
df = pd.read_csv(
    '/Users/bindusamba/Documents/GitHub/Master-s-project/csv/Promoter.csv'
).dropna(subset=["PromoterSequence", "PromoterMotifa"])

# Frequency of each motif as a two-column table: Motif, Count
motif_counts = (
    df["PromoterMotifa"]
    .value_counts()
    .rename_axis("Motif")
    .reset_index(name="Count")
)

# Write the summary to the working directory
motif_counts.to_csv("motif_counts.csv", index=False)

print("✅ Motif summary saved as 'motif_counts.csv'")


✅ Motif summary saved as 'motif_counts.csv'


In [11]:


# Use every sequence/motif pair in df (no frequency filter)
sequences = [str(seq).upper() for seq in df["PromoterSequence"]]
labels = [str(motif) for motif in df["PromoterMotifa"]]

# Map motif strings to integer class ids (0 to N-1)
le = LabelEncoder()
y = le.fit_transform(labels)

# Sanity check: dataset size and number of distinct motif classes
print("Total sequences:", len(sequences))
print("Total motif classes:", len(le.classes_))


Total sequences: 209
Total motif classes: 57


In [12]:
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights to balance rare vs frequent motifs.
# compute_class_weight returns one weight per entry of `classes`, in that order,
# so the weights are keyed by the actual label values rather than by position:
# enumerate() would shift weights onto the wrong labels whenever a label is
# absent from y_train.
present_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=present_classes, y=y_train)
# Start every output class at a neutral weight of 1.0 so the keys stay
# contiguous (0..N-1), then overwrite the classes that occur in y_train.
class_weights_dict = {label: 1.0 for label in range(model.output_shape[-1])}
class_weights_dict.update(
    (int(label), float(weight)) for label, weight in zip(present_classes, class_weights)
)

# NOTE(review): `model`, X_train and y_train are the objects left behind by the
# earlier top-5 motif cell; nothing in between rebuilds the model, so this fit()
# continues training from the already-trained weights. The accuracy printed
# below is therefore not a like-for-like comparison with the unweighted run.
model.fit(
    X_train, y_train,
    epochs=40,
    batch_size=16,
    validation_split=0.1,
    class_weight=class_weights_dict,  # 🔥 this is the key line
    verbose=2
)

# Re-evaluate
loss, acc = model.evaluate(X_test, y_test)
print("✅ CNN Accuracy with Class Weights:", round(acc, 4))

Epoch 1/40
6/6 - 0s - 40ms/step - accuracy: 0.9263 - loss: 0.5434 - val_accuracy: 0.7273 - val_loss: 0.7400
Epoch 2/40
6/6 - 0s - 8ms/step - accuracy: 0.9789 - loss: 0.3090 - val_accuracy: 0.7273 - val_loss: 0.8102
Epoch 3/40
6/6 - 0s - 8ms/step - accuracy: 0.9895 - loss: 0.1931 - val_accuracy: 0.7273 - val_loss: 0.8299
Epoch 4/40
6/6 - 0s - 7ms/step - accuracy: 0.9789 - loss: 0.2499 - val_accuracy: 0.7273 - val_loss: 0.9411
Epoch 5/40
6/6 - 0s - 7ms/step - accuracy: 1.0000 - loss: 0.1164 - val_accuracy: 0.7273 - val_loss: 0.9341
Epoch 6/40
6/6 - 0s - 7ms/step - accuracy: 1.0000 - loss: 0.1004 - val_accuracy: 0.7273 - val_loss: 0.9383
Epoch 7/40
6/6 - 0s - 7ms/step - accuracy: 0.9684 - loss: 0.1608 - val_accuracy: 0.7273 - val_loss: 1.0482
Epoch 8/40
6/6 - 0s - 7ms/step - accuracy: 0.9895 - loss: 0.1542 - val_accuracy: 0.7273 - val_loss: 1.0005
Epoch 9/40
6/6 - 0s - 7ms/step - accuracy: 0.9895 - loss: 0.0313 - val_accuracy: 0.7273 - val_loss: 0.9114
Epoch 10/40
6/6 - 0s - 8ms/step - ac

In [13]:
# Train the motif CNN on every sequence and every motif class (no frequency filter)


#  Load promoter dataset
df = pd.read_csv('/Users/bindusamba/Documents/GitHub/Master-s-project/csv/Promoter.csv')
df = df.dropna(subset=["PromoterSequence", "PromoterMotifa"])


#  Extract sequences and labels
sequences = df["PromoterSequence"].astype(str).str.upper().tolist()
labels = df["PromoterMotifa"].astype(str).tolist()

# Encode motifs as integer class ids 0..num_classes-1
le = LabelEncoder()
y = le.fit_transform(labels)
num_classes = len(np.unique(y))

#  Tokenize sequences at character level and pad/truncate at the end to 100 tokens
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(sequences)
X = tokenizer.texts_to_sequences(sequences)
X = pad_sequences(X, maxlen=100, padding='post')

#  Stratified split – this may break if any motif only appears once!
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
except ValueError as e:
    # Deliberate best-effort fallback, but report why stratification failed
    print("⚠️ Stratified split failed. Falling back to regular split.")
    print(f"   Reason: {e}")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

#  Compute class weights.
# After a non-stratified split some motif classes are missing from y_train, so
# np.unique(y_train) is not 0..num_classes-1. compute_class_weight returns one
# weight per entry of `classes`, in that order; keying them with enumerate()
# would attach the weights to the wrong labels. Key them by the real label.
present_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=present_classes, y=y_train)
# Every class starts at a neutral 1.0 so the dict covers 0..num_classes-1
class_weights_dict = {label: 1.0 for label in range(num_classes)}
class_weights_dict.update(
    (int(label), float(weight)) for label, weight in zip(present_classes, class_weights)
)

#  Build CNN: embedding -> Conv1D -> max pooling -> dense softmax over all motif classes
model = Sequential([
    Input(shape=(100,)),
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=32),
    Conv1D(64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

#  Train with class weights
model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=16,
    validation_split=0.1,
    class_weight=class_weights_dict,
    verbose=2
)

#  Evaluate; the class count is taken from the data instead of being hard-coded
loss, acc = model.evaluate(X_test, y_test)
print(f"✅ CNN Accuracy (All {num_classes} Motifs — No Filtering):", round(acc, 4))



⚠️ Stratified split failed. Falling back to regular split.
Epoch 1/30
10/10 - 0s - 49ms/step - accuracy: 0.2467 - loss: 10.1453 - val_accuracy: 0.4706 - val_loss: 3.1915
Epoch 2/30
10/10 - 0s - 6ms/step - accuracy: 0.3533 - loss: 8.1956 - val_accuracy: 0.4706 - val_loss: 2.9502
Epoch 3/30
10/10 - 0s - 6ms/step - accuracy: 0.3600 - loss: 7.4936 - val_accuracy: 0.4706 - val_loss: 2.9846
Epoch 4/30
10/10 - 0s - 6ms/step - accuracy: 0.2800 - loss: 7.2476 - val_accuracy: 0.4706 - val_loss: 3.0081
Epoch 5/30
10/10 - 0s - 5ms/step - accuracy: 0.3067 - loss: 7.0225 - val_accuracy: 0.4706 - val_loss: 2.9158
Epoch 6/30
10/10 - 0s - 6ms/step - accuracy: 0.3467 - loss: 6.7651 - val_accuracy: 0.4706 - val_loss: 2.9780
Epoch 7/30
10/10 - 0s - 6ms/step - accuracy: 0.3667 - loss: 6.6439 - val_accuracy: 0.4706 - val_loss: 2.9936
Epoch 8/30
10/10 - 0s - 6ms/step - accuracy: 0.3800 - loss: 6.3404 - val_accuracy: 0.4706 - val_loss: 2.9969
Epoch 9/30
10/10 - 0s - 6ms/step - accuracy: 0.4133 - loss: 5.7784 

In [14]:

from xgboost import XGBClassifier
#  Load and clean promoter data
df = pd.read_csv('/Users/bindusamba/Documents/GitHub/Master-s-project/csv/Promoter.csv')
df = df.dropna(subset=["PromoterSequence", "PromoterMotifa"])

#  Filter motifs that appear at least twice (a stratified split needs >= 2 members per class)
motif_counts = df["PromoterMotifa"].value_counts()
valid_motifs = motif_counts[motif_counts > 1].index
df = df[df["PromoterMotifa"].isin(valid_motifs)]

#  Prepare sequences and labels
sequences = df["PromoterSequence"].astype(str).str.upper().tolist()
labels = df["PromoterMotifa"].astype(str).tolist()

#  Encode motif labels as integer class ids
le = LabelEncoder()
y = le.fit_transform(labels)

#  Extract 2-mer features: counts of every character bigram in each sequence
vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 2), max_features=500)
X = vectorizer.fit_transform(sequences).toarray()

#  Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

#  Define models. Every estimator gets the same fixed seed, matching the other
#  model-comparison cell, so no model in the table is left unseeded.
models = {
    "SVM": SVC(kernel='linear', probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "MLP": MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42)
}

#  Train and evaluate all models, collecting (name, accuracy) pairs
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results.append((name, round(acc, 4)))
    print(f"✅ {name} Accuracy: {round(acc, 4)}")

#  Save accuracy results
results_df = pd.DataFrame(results, columns=["Model", "Accuracy"])
results_df.to_csv("motif_model_comparison.csv", index=False)
print("📁 Model results saved to: motif_model_comparison.csv")


✅ SVM Accuracy: 0.3143
✅ Random Forest Accuracy: 0.5143




✅ MLP Accuracy: 0.4




✅ XGBoost Accuracy: 0.4286
✅ AdaBoost Accuracy: 0.4571
📁 Model results saved to: motif_model_comparison.csv
