In [1]:
import pandas as pd

# Load datasets
X_train = pd.read_csv("X_train.csv")  # Peptide sequences
train_labels = pd.read_csv("label_train.csv")  # Corresponding labels

# Combine features and labels
train_data = pd.concat([X_train, train_labels], axis=1)
train_data.columns = ['peptide_sequence', 'label']

# Load test data
X_test = pd.read_csv("X_test.csv")
test_labels = pd.read_csv("label_test.csv")
test_data = pd.concat([X_test, test_labels], axis=1)
test_data.columns = ['peptide_sequence', 'label']

In [2]:
train_data.head()

Unnamed: 0,peptide_sequence,label
0,DDRHKIVNVDQRQYG,1
1,EGNRPTNSIVFTKLT,1
2,TRQGGYSNDNTVIFR,1
3,LHGETFPYTAFDNNC,1
4,VMALEPVVGAAIAAP,1


In [3]:
test_data.head()

Unnamed: 0,peptide_sequence,label
0,TPETLFEIGSVSKTFTAT,1
1,HPGNTILHVDTIYNRPSNTT,1
2,YWAGIEFDVTHKGMALLHRL,1
3,EQGLLYMPQELAVSD,1
4,GARGFFQARHLEMDA,1


In [4]:
len(train_data)

2872

In [5]:
len(test_data)

342

# TF-IDF Vectorization

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
vectorizer = TfidfVectorizer(ngram_range = (1,2))

In [19]:
X_train_tfidf = vectorizer.fit_transform(train_data['peptide_sequence'])
X_test_tfidf = vectorizer.fit_transform(test_data['peptide_sequence'])

In [20]:
print(f"Train TF-IDF feature matrix shape: {X_train_tfidf.shape}")

TF-IDF feature matrix shape: (2872, 2872)


In [22]:
print(f"Test TF-IDF feature matrix shape: {X_test_tfidf.shape}")

Test TF-IDF feature matrix shape: (342, 342)


In [23]:
import joblib

In [24]:
joblib.dump(X_train_tfidf, "X_train_tfidf.pkl")

['X_train_tfidf.pkl']

In [25]:
joblib.dump(X_test_tfidf, "X_test_tfidf.pkl")

['X_test_tfidf.pkl']