In [35]:
# #installing required libraries

# !pip install biopython
# !pip install scikit-learn
# !pip install xgboost
!pip install tensorflow
#!pip install imbalanced-learn

ERROR: Could not find a version that satisfies the requirement tensorflow (from versions: none)

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for tensorflow


# **Preprocessing**

## **Data Cleaning**

In [36]:
from Bio import SeqIO

#function extracting amino acid sequences
def extract_sequences(f):
    seq = []
    for record in SeqIO.parse(f, "fasta"):
        seq.append(str(record.seq))
    return seq

#Extracting sequences
allergen_sequences = extract_sequences("allergen.fasta")
non_allergen_sequences = extract_sequences("not_allergen.fasta")

print(f"Extracted allergen sequences: {len(allergen_sequences)}")
print(f"Extracted non allergens sequences: {len(non_allergen_sequences)}")

Extracted allergen sequences: 1295
Extracted non allergens sequences: 20390


## **Removing Duplicates**

In [37]:
#removing duplicate sequences
allergen_sequences = list(set(allergen_sequences))
non_allergen_sequences = list(set(non_allergen_sequences))

print(f"Unique allergen sequences: {len(allergen_sequences)}")
print(f"Unique non allergen sequences: {len(non_allergen_sequences)}")

Unique allergen sequences: 1239
Unique non allergen sequences: 20313


## **Filtering Sequences**

In [38]:
#filtering out short sequences from allergen amino acid sequences
allergen_seq = []
for seq in allergen_sequences:
    #if length of sequence is less than 30, filter it out
    if len(seq) >= 30:
        allergen_seq.append(seq)

#filtering out short sequences from non allergen amino acid sequences
non_allergen_seq = []
for seq in non_allergen_sequences:
    #if length of sequence is less than 30, filter it out
    if len(seq) >= 30:
        non_allergen_seq.append(seq)

print(f"Filtered allergen sequences: {len(allergen_seq)}")
print(f"Filtered out non allergen sequences: {len(non_allergen_seq)}")

Filtered allergen sequences: 1183
Filtered out non allergen sequences: 20264


## **Converting Protein Sequences to Features**

Amino Acid Composition (AAC):

In [39]:
import numpy as np

AMINO_ACIDS = "ACDEFGHIJKLMNPQRSTVWY"

def calculate_aac(seq):
    seq_list = []
    for aa in AMINO_ACIDS:
        a = seq.count(aa) / len(seq)
        seq_list.append(a)
    return seq_list

#Convert all features to AAC features vectors
allergen_features = np.array([calculate_aac(seq) for seq in allergen_seq])
non_allergen_features = np.array([calculate_aac(seq) for seq in non_allergen_seq])

print(f"Feature shape (AAC) of allergen features: {allergen_features.shape}")
print(f"Feature shape (AAC) of non allergen sequences: {non_allergen_features.shape}")

Feature shape (AAC) of allergen features: (1183, 21)
Feature shape (AAC) of non allergen sequences: (20264, 21)


In [40]:
# print(f"Number of allergen samples: {len(allergen_features)}")
# print(f"Number of non-allergen samples: {len(non_allergen_features)}")

# # Combine features
# X = np.concatenate((allergen_features, non_allergen_features), axis=0)

# # Create labels: 1 for allergen, 0 for non-allergen
# y = np.concatenate((np.ones(len(allergen_features)), np.zeros(len(non_allergen_features))))

# # Apply SMOTE to balance the classes
# from imblearn.over_sampling import SMOTE
# smote = SMOTE(random_state=42)
# X_bal, y_bal = smote.fit_resample(X, y)

# print("After SMOTE:")
# print(f"Number of samples in class 1 (allergen): {sum(y_bal==1)}")
# print(f"Number of samples in class 0 (non-allergen): {sum(y_bal==0)}")

# # Split balanced data into training and testing sets
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.2, random_state=42)

Dipeptide Composition (DPC)

In [41]:
from itertools import product

DIPEPTIDES = [''.join(p) for p in product(AMINO_ACIDS, repeat=2)]

def calculate_dpc(seq):
    seq_list = []
    for dp in DIPEPTIDES:
        d = seq.count(dp) / len(seq)
        seq_list.append(d)
    return seq_list

#converting all sequences to DPC feature vectors
allergen_dpc = np.array([calculate_dpc(seq) for seq in allergen_seq])
non_allergen_dpc = np.array([calculate_dpc(seq) for seq in non_allergen_seq])

print(f"Feature shape (DPC) of allergen sequences: {allergen_dpc.shape}")
print(f"Feature shape (DPC) of non allergen sequences: {non_allergen_dpc.shape}")

Feature shape (DPC) of allergen sequences: (1183, 441)
Feature shape (DPC) of non allergen sequences: (20264, 441)


Physiochemical Properties: (charge, polarity, hydrophobicity)

In [42]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

def calculate_physiochemical_features(seq):
    seq = seq.replace("X", "")
    seq = seq.replace("Z", "")
    seq = seq.replace("U", "")
    analysis = ProteinAnalysis(seq)
    return [
        analysis.isoelectric_point(),
        analysis.molecular_weight(),
        analysis.aromaticity(),
        analysis.instability_index(),
        analysis.gravy()
    ]

#convert all sequences to physiochemical features
allergen_physico_features = np.array([calculate_physiochemical_features(seq) for seq in allergen_seq])
non_allergen_physico_features = np.array([calculate_physiochemical_features(seq) for seq in non_allergen_seq])

print(f"Feature shape (Physicochemical) of allergen sequences: {allergen_physico_features.shape}")
print(f"Feature shape (Physicochemical) of non allergen sequences: {non_allergen_physico_features.shape}")

Feature shape (Physicochemical) of allergen sequences: (1183, 5)
Feature shape (Physicochemical) of non allergen sequences: (20264, 5)


## **Adding labels**

In [43]:
from sklearn.utils import shuffle

all_features = np.hstack((allergen_features, allergen_dpc, allergen_physico_features))
non_all_features = np.hstack((non_allergen_features, non_allergen_dpc, non_allergen_physico_features))

x = np.vstack((all_features, non_all_features))
y = np.array([1] * len(all_features) + [0] * len(non_all_features))

x, y = shuffle(x, y, random_state=42)

from imblearn.over_sampling import SMOTE

# Apply SMOTE to the final combined feature matrix and label array
smote = SMOTE(random_state=42)
x_bal, y_bal = smote.fit_resample(x, y)

print("After SMOTE balancing:")
print(f"Samples in class 0 (non-allergen): {sum(y_bal == 0)}")
print(f"Samples in class 1 (allergen): {sum(y_bal == 1)}")


print(f"Shape of x: {x.shape}")
print(f"Shape of y: {y.shape}")

After SMOTE balancing:
Samples in class 0 (non-allergen): 20264
Samples in class 1 (allergen): 20264
Shape of x: (21447, 467)
Shape of y: (21447,)


# **Model Training**

## **Splitting Dataset**

In [44]:
from sklearn.model_selection import train_test_split

#Splitting dataset
x_train, x_test, y_train, y_test = train_test_split(x_bal, y_bal, test_size=0.2, random_state=42,stratify=y_bal)

## **Random Forest**

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

#initializing classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

#training the model
rf_model.fit(x_train, y_train)

# Predictions on training data
y_train_pred = rf_model.predict(x_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.4f}")

#making predictions
y_pred = rf_model.predict(x_test)

#evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Model Test Accuracy: {accuracy:.4f}")

#Classification report
print(classification_report(y_test, y_pred))

Training Accuracy: 1.0000
Random Forest Model Test Accuracy: 0.9951
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4053
           1       1.00      0.99      1.00      4053

    accuracy                           1.00      8106
   macro avg       1.00      1.00      1.00      8106
weighted avg       1.00      1.00      1.00      8106



## **Logistic Regression**

In [46]:
from sklearn.linear_model import LogisticRegression

#initializing classifier
lr_model = LogisticRegression(max_iter=1000, random_state=42)

#training the model
lr_model.fit(x_train, y_train)

# Training accuracy
y_train_pred = lr_model.predict(x_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.4f}")

#making predictions
y_pred = lr_model.predict(x_test)

#evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Model Accuracy: {accuracy}")

#classification report
print(classification_report(y_test, y_pred))

Training Accuracy: 0.8591
Logistic Regression Model Accuracy: 0.8615840118430792
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      4053
           1       0.86      0.86      0.86      4053

    accuracy                           0.86      8106
   macro avg       0.86      0.86      0.86      8106
weighted avg       0.86      0.86      0.86      8106



## **Support Vector Model (SVM)**

In [47]:
# from sklearn.svm import SVC

# #initializing classifier
# svm_model = SVC(kernel='linear', probability=True, random_state=42)

# #training the model
# svm_model.fit(x_train, y_train)

# #making predictions
# y_pred = svm_model.predict(x_test)

# #evaluating the model
# accuracy = accuracy_score(y_test, y_pred)
# print(f"SVM Model Accuracy: {accuracy}")

# #classification report
# print(classification_report(y_test, y_pred))

## **XGBoost (Boosting Model)**

In [48]:
import xgboost as xgb

#initializing classifier
xgb_model = xgb.XGBClassifier(eval_metric='logloss')

#training the model
xgb_model.fit(x_train, y_train)

#making predictions
y_pred = xgb_model.predict(x_test)

# Training accuracy
y_train_pred = xgb_model.predict(x_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.4f}")

#evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"XGB Model Accuracy: {accuracy}")

#classification report
print(classification_report(y_test, y_pred))

Training Accuracy: 1.0000
XGB Model Accuracy: 0.99506538366642
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4053
           1       1.00      0.99      1.00      4053

    accuracy                           1.00      8106
   macro avg       1.00      1.00      1.00      8106
weighted avg       1.00      1.00      1.00      8106



## **Artificial Neural Network (Deep Learning)**

In [49]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# create model
ann_model = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    max_iter=20,
    batch_size=32,
    random_state=42
)

# train model
ann_model.fit(x_train, y_train)

# Training accuracy
y_train_pred = ann_model.predict(x_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.4f}")

# predict
y_pred = ann_model.predict(x_test)

# evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"ANN Model Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))




Training Accuracy: 0.7432
ANN Model Accuracy: 0.7479644707623982
              precision    recall  f1-score   support

           0       0.79      0.68      0.73      4053
           1       0.72      0.82      0.76      4053

    accuracy                           0.75      8106
   macro avg       0.75      0.75      0.75      8106
weighted avg       0.75      0.75      0.75      8106



# **Saving in .pkl file**

In [51]:
import joblib

joblib.dump(rf_model, "allergen_detection.pkl")

['allergen_detection.pkl']