In [1]:
import requests
# BepiPred-3.0 IEDB FASTA file hosted on the DTU Health Tech services site.
url = 'https://services.healthtech.dtu.dk/services/BepiPred-3.0/Data/IEDB/IEDBSeqsNotSharedAt20ID.fasta'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on HTTP errors so an error page is never written to disk as if
# it were FASTA data.
response.raise_for_status()
with open('../data/epipred_data.fasta', 'wb') as file:
    file.write(response.content)
print('Download complete!')

Download complete!


In [2]:
from Bio import SeqIO
# Read every FASTA record and keep only its sequence, as a plain string.
fasta_file = '../data/epipred_data.fasta'
sequences = [str(record.seq) for record in SeqIO.parse(fasta_file, 'fasta')]

# Quick look at the first five parsed sequences.
print(sequences[:5])


['mgkkqhQKDKLYLTTTEWKETYGGHKDDTGRrmqralfkrlpITHCSLSLLPFEDPVcsqdgiifdltqiipYLKKYGVNPVTGKKMAAKELIHLKFDKDSDGTDYLLSIGnfrcpvtfrvftptshivticQTGNVYSLEAIEELNLKPGHLKDlLTDEPFQRKDIIVLQDPNHLEKFNIEQFHHikldlktkseieaekkamKDPKFYIRRMNNETKEILKKLEKEYIPTKIEQIEEETVDELNAAHYSQGRvaagLTSTTMEPITHQKAAILDADTVKYARVNKNgyvriltnygainlelfckdapraCENFIKHCKNGYYNktkfhriirnFILQGGDPTGTGKGGDSIWGKPFKDEIISSLSHDQRgilsmaNQGTDTNKSQFfitfrscsyldgkHTIFGRVVGGTETLNAIEKIETDESSRPIASFQfpidviflnaeifvdPFEEAEIAVEKERENIRLAKTNQESETVVSApavitQIPKPKKYGSGVGKYINLPEVTAATKRTANDIAEFGTVKKTVQSNRIFGDFSSW', 'msvtLQTTLGDIKIELYCDLCPKTCENFLALcasgyydncIFHRNIKDFMVQTGDPTGTGKGGDSIWGGPIEDELNAALKHDARGVVSMAGNGPNtsrsqffityakhptldLKYTVFGRVIDGFDVLDELEKvkvDAKYRPVVEQKIQNviihanpiadmvtss', 'mswlwdwvsgmlnylgltkkngklvflgldnagkttllhmlkDDRMAQHVPTLHPTSEELSLGgirFTTFDLGGHEQARrvwkdyfpavdAIVFLVDCADVERIAESRHEleslLGDEQVASCPLLILGNKIDKPNALGEDQLKWHlgvtnlttgkGQISRMDISSRPMEVFMCSVLrrqgygegfrwlsqyld', 'mveqpqvrrfsqmfmerlRVFEDHPTVIHGEAQksviqsitpsyksiaekqvqkldfgsdmnlpeavgdykvketqdhfklkiidpk

In [3]:
# Partition the parsed sequences by letter case: a sequence for which
# str.isupper() is true goes to the epitope list, every other sequence goes to
# the non-epitope list.
# NOTE(review): the printed records mix upper- and lowercase inside a single
# sequence, so case may be a per-residue annotation -- confirm that labelling
# whole sequences this way is intended.
epitope_sequences = [seq for seq in sequences if seq.isupper()]
non_epitope_sequences = [seq for seq in sequences if not seq.isupper()]

# Assemble the labelled dataset: epitopes first (label 1), then
# non-epitopes (label 0).
data = epitope_sequences + non_epitope_sequences
labels = [1] * len(epitope_sequences) + [0] * len(non_epitope_sequences)




In [4]:
# Preview the first five entries of each group, one list per line.
for preview in (epitope_sequences[:5], non_epitope_sequences[:5]):
    print(preview)

['MDVFMKGLSKAKEGVVAAAEKTKQGVAEAAGKTKEGVLYVGSKTKEGVVHGVATVAEKTKEQVTNVGGAVVTGVTAVAQKTVEGAGSIAAATGFVKKDQLGKNEEGAPQEGILEDMPVDPDNEAYEMPSEEGYQDYEPEA', 'MDVFMKGLSMAKEGVVAAAEKTKQGVTEAAEKTKEGVLYVGSKTSGVVQGVASVAEKTKEQASHLGGAVFSGAGNIAAATGLVKKEEFPTDLKPEEVAQEAAEEPLIEPLMEPEGESYEDSPQEEYQEYEPEA', 'MAEDEDNQQGQGEGLKYLGFVQDAATYAVTTFSNVYLFAKDKSGPLQPGVDIIEGPVKNVAVPLYNRFSYIPNGALKFVDSTVVASVTIIDRSLPPIVKDASIQVVSAIRAAPEAARSLASSLPGQTKILAKVFYGEN', 'MQSEFFICVTFFFVLLHYISCNKPTRNISVKSNKDKDELNNIKEKLDLINNSIKDKVIENFKEDIELLKKKVDDLEKRKSDNTLGKRQKKEDDDDEEETDEDDDEDSDEDDEEQEELNVEPKEREDEQEETDDEQKETEDEQKETEDEQKETEDEQKETEDEQKETEDEQKESDDEQKETEDEQKETEDEASEEYSDNEEDDEEDEEDDEEENEENNDNETNEENEDNDENEDNDENEEEIEVTDVEFVGQSTNKNVRNNMIRNSNKDIKSSSQNSSIKAQNSSTKIGNTPTKLSTQNTKSNSTSNQLITQLQSEKSSSKVDNNKNNTNEIKYMDKLCDDVLTELKEKDNVDNNMNHSKYNNLKKEFSTFTMNQNECDLIKKLIITFSQENVEMKRDSIKEIFLKALDDKKYREVFKNFMYGVYSYAKRHNYLDIEKMEKNERAYKKLFENTLNLLDTI', 'MKFLVFLGIITTVAAFHQECSLQSCTQHQPYVVDDPCPIHFYSKWYIRVGARKSAPLIELCVDEAGSKSPIQYIDIGNYTVSCLPFTINCQEPKLGSLVVRCSFYEDFLEYH

In [5]:
import numpy as np

# The 20 amino-acid letters; a letter's position in this string is its
# one-hot column index.
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
amino_acid_dict = {aa: idx for idx, aa in enumerate(amino_acids)}

# Function to one-hot encode a sequence
def one_hot_encode_sequence(sequence):
    """One-hot encode a protein sequence, ignoring letter case.

    Args:
        sequence: String of one-letter residue codes. Upper- and lowercase
            letters are treated as the same residue.

    Returns:
        A float array of shape ``(len(sequence), len(amino_acids))``. Row
        ``i`` has a single 1 in the column of residue ``i``; characters that
        are not in ``amino_acids`` (after uppercasing) yield an all-zero row.
    """
    one_hot_matrix = np.zeros((len(sequence), len(amino_acids)))

    for idx, aa in enumerate(sequence):
        # The parsed records contain lowercase residues; without the
        # uppercase normalisation they would silently become all-zero rows.
        column = amino_acid_dict.get(aa.upper())
        if column is not None:
            one_hot_matrix[idx, column] = 1

    return one_hot_matrix

# Demo: encode a single all-uppercase sequence and display the resulting matrix.
example_sequence = 'MDVFMKGLSKAKEGVVAAAEKTKQGVAEAAGKTKEGVLYVGSKTKEGVVHGVATVAEKTKEQVTNVGGAVVTGVTAVAQKTVEGAGSIAAATGFVKKDQLGKNEEGAPQEGILEDMPVDPDNEAYEMPSEEGYQDYEPEA'
one_hot_encoded = one_hot_encode_sequence(sequence=example_sequence)

print("One-hot encoded sequence:\n", one_hot_encoded)


One-hot encoded sequence:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [6]:
# Collapse a 2-D one-hot matrix into a single 1-D feature vector
def flatten_sequence(one_hot_matrix):
    """Return a flattened 1-D copy of ``one_hot_matrix`` in row-major order."""
    flat_vector = one_hot_matrix.flatten(order='C')
    return flat_vector

# Demo: flatten the example matrix, then report the vector and its shape.
flattened_sequence = flatten_sequence(one_hot_encoded)

for caption, value in (
    ("Flattened one-hot encoded sequence:\n", flattened_sequence),
    ("Shape of flattened sequence:", flattened_sequence.shape),
):
    print(caption, value)


Flattened one-hot encoded sequence:
 [0. 0. 0. ... 0. 0. 0.]
Shape of flattened sequence: (2800,)


In [9]:
import numpy as np

# Function to one-hot encode a sequence
def one_hot_encode_sequence(sequence):
    """One-hot encode a protein sequence, ignoring letter case.

    Args:
        sequence: String of one-letter residue codes. Upper- and lowercase
            letters are treated as the same residue.

    Returns:
        A float array of shape ``(len(sequence), 20)``. Row ``i`` has a
        single 1 in the column of residue ``i``; characters outside the
        20-letter alphabet (after uppercasing) yield an all-zero row.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    amino_acid_dict = {aa: idx for idx, aa in enumerate(amino_acids)}
    encoded = np.zeros((len(sequence), len(amino_acids)))
    for i, aa in enumerate(sequence):
        # Sequences that fail str.isupper() contain lowercase residues; a
        # case-sensitive lookup would silently turn them into all-zero rows.
        column = amino_acid_dict.get(aa.upper())
        if column is not None:
            encoded[i, column] = 1
    return encoded

# Function to pad sequences to a fixed length
def pad_sequence(sequence, max_length):
    """Zero-pad a 2-D encoded sequence at the bottom up to ``max_length`` rows.

    Args:
        sequence: 2-D array with one row per residue.
        max_length: Number of rows the result must have.

    Returns:
        An array of shape ``(max_length, sequence.shape[1])`` whose leading
        rows are ``sequence`` and whose remaining rows are zeros.

    Raises:
        ValueError: If ``sequence`` has more than ``max_length`` rows.
    """
    n_rows, n_cols = sequence.shape[0], sequence.shape[1]
    if n_rows > max_length:
        # Without this check np.zeros fails with an opaque
        # "negative dimensions" error.
        raise ValueError(
            f"sequence has {n_rows} rows, which exceeds max_length={max_length}"
        )
    padding = np.zeros((max_length - n_rows, n_cols))  # rows of zeros to append
    return np.vstack([sequence, padding])  # padding goes below the real rows

# Collapse an encoded (and padded) sequence matrix into one 1-D vector
def flatten_sequence(encoded_sequence):
    """Return a flattened 1-D copy of ``encoded_sequence`` in row-major order."""
    flat_vector = encoded_sequence.flatten(order='C')
    return flat_vector

# Every sequence is padded to the length of the longest one across both groups.
max_seq_length = max(map(len, epitope_sequences + non_epitope_sequences))

# Build feature rows and labels group by group: epitopes (label 1) first, then
# non-epitopes (label 0). Each sequence is one-hot encoded, zero-padded to
# max_seq_length rows, and flattened into a single feature vector.
X = []
y = []
for label, group in ((1, epitope_sequences), (0, non_epitope_sequences)):
    for seq in group:
        feature_vector = flatten_sequence(
            pad_sequence(one_hot_encode_sequence(seq), max_seq_length)
        )
        X.append(feature_vector)
        y.append(label)

# Turn the accumulated lists into numpy arrays for machine learning.
X = np.array(X)
y = np.array(y)

print("Feature matrix (X):\n", X)
print("Labels (y):\n", y)
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Feature matrix (X):
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Labels (y):
 [1 1 0 0]
Shape of X: (4, 10800)
Shape of y: (4,)


In [11]:
# Persist the feature matrix and the label vector as .npy files
for target_path, array in (
    ('../data/X_featurized.npy', X),
    ('../data/y_labels.npy', y),
):
    np.save(target_path, array)


In [12]:
# Reload the persisted arrays; this replaces the in-memory X and y.
X, y = (
    np.load(source_path)
    for source_path in ('../data/X_featurized.npy', '../data/y_labels.npy')
)

for caption, value in (
    ("Feature matrix (X):\n", X),
    ("Labels (y):\n", y),
    ("Shape of X:", X.shape),
    ("Shape of y:", y.shape),
):
    print(caption, value)


Feature matrix (X):
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Labels (y):
 [1 1 0 0]
Shape of X: (4, 10800)
Shape of y: (4,)


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fit a 100-tree random forest (fixed seed) on the training split, then score
# its predictions on the held-out split.
# NOTE(review): the recorded run shows only 4 samples in X, so the held-out
# split is presumably a single sample and this accuracy is not meaningful --
# confirm the dataset size before reading anything into the number.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", test_accuracy)


Accuracy: 0.0
