In [5]:
# Step 1: Install required libraries
!pip install biopython scikit-learn

# Step 2: Import required libraries
from google.colab import drive
from Bio import SeqIO
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import hamming_loss, accuracy_score

# Step 3: Mount Google Drive to access files
drive.mount('/content/drive')

# Step 4: Define file paths
project_folder = '/content/drive/My Drive/BioinformaticsProject'
fasta_file = os.path.join(project_folder, 'YEAST.fasta')
go_terms_file = os.path.join(project_folder, 'AllProteinswithFunctions-Bakers Yeast.txt')

# Step 5: Check if files exist
if not os.path.exists(fasta_file):
    raise FileNotFoundError(f"YEAST.fasta not found at {fasta_file}")
if not os.path.exists(go_terms_file):
    raise FileNotFoundError(f"AllProteinswithFunctions-Bakers Yeast.txt not found at {go_terms_file}")

# Step 6: Parse the YEAST.fasta file to extract sequence identifiers and sequences
sequences = []
for record in SeqIO.parse(fasta_file, "fasta"):
    sequence_id = record.id  # Extract sequence identifier
    sequence = str(record.seq)  # Extract sequence data
    sequences.append({'sequence_id': sequence_id, 'sequence': sequence})

# Convert to DataFrame for easier processing
fasta_df = pd.DataFrame(sequences)
print("\nParsed YEAST.fasta file:")
print(fasta_df.head())

# Extract protein_id from sequence_id (assume it's the second segment of the FASTA header)
fasta_df['protein_id'] = fasta_df['sequence_id'].str.split('|').str[1]

print("\nUpdated FASTA data with extracted protein_id:")
print(fasta_df.head())

# Step 7: Parse the GO terms file to extract protein IDs and GO terms
go_data = pd.read_csv(go_terms_file, delimiter=";", header=None, dtype=str, on_bad_lines='skip')
go_data.columns = ['protein_id'] + [f"go_term_{i}" for i in range(1, go_data.shape[1])]
go_data['go_terms'] = go_data.iloc[:, 1:].apply(lambda row: ';'.join(row.dropna()), axis=1)
go_data = go_data[['protein_id', 'go_terms']]

print("\nProcessed GO terms data:")
print(go_data.head())

# Step 8: Align sequences with GO terms
aligned_data = pd.merge(fasta_df, go_data, on='protein_id', how='left')
aligned_data['go_terms'] = aligned_data['go_terms'].fillna("No GO terms")

print("\nAligned data (sequences with GO terms):")
print(aligned_data.head())

# Step 9: Extract k-mers from sequences
def generate_kmers(sequence, k=3):
    return ' '.join([sequence[i:i+k] for i in range(len(sequence) - k + 1)])

aligned_data['k_mers'] = aligned_data['sequence'].apply(lambda seq: generate_kmers(seq, k=3))

print("\nExtracted k-mers:")
print(aligned_data[['sequence', 'k_mers']].head())

# Step 10: Convert k-mers into a bag-of-words representation
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(aligned_data['k_mers'])

print(f"\nFeature matrix shape: {X.shape}")
print(f"Number of unique k-mers: {len(vectorizer.vocabulary_)}")

# Step 11: Encode GO terms as binary labels
aligned_data['go_terms_list'] = aligned_data['go_terms'].apply(lambda x: x.split(';') if x != "No GO terms" else [])
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(aligned_data['go_terms_list'])

print(f"\nLabel matrix shape: {y.shape}")
print(f"Number of unique GO terms: {len(mlb.classes_)}")

# Step 12: Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

# Step 13: Train a Naive Bayes model
model = OneVsRestClassifier(MultinomialNB())
model.fit(X_train, y_train)
print("\nModel training complete!")

# Step 14: Evaluate the model
y_pred = model.predict(X_test)
hamming = hamming_loss(y_test, y_pred)
subset_acc = accuracy_score(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Hamming Loss: {hamming}")
print(f"Subset Accuracy: {subset_acc}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Parsed YEAST.fasta file:
             sequence_id                                           sequence
0  sp|A2P2R3|YM084_YEAST  MCGIFGYCNFLIEKTRGEIIDTLIEGLQALEYKEYDSSGISIQGDE...
1  sp|A5Z2X5|YP010_YEAST  MRPAQLLLNTAKKTSGGYKIPVELTPLFLAVGVALCSGTYFTYKKL...
2   sp|D6VPM8|YAJ3_YEAST  MINFLLFVLTILATLTNIWFSGVLSPAMVIRICLGGSMVVLQIWSF...
3   sp|D6VTK4|STE2_YEAST  MSDAAPSLSNLFYDPTYNPGQSTINYTSIYGNGSTITFDELQGLVN...
4   sp|D6W196|CMC1_YEAST  MLLKNCETDKQRDIRYACLFKELDVKGNGQVTLDNLISAFEKNDHP...

Updated FASTA data with extracted protein_id:
             sequence_id                                           sequence  \
0  sp|A2P2R3|YM084_YEAST  MCGIFGYCNFLIEKTRGEIIDTLIEGLQALEYKEYDSSGISIQGDE...   
1  sp|A5Z2X5|YP010_YEAST  MRPAQLLLNTAKKTSGGYKIPVELTPLFLAVGVALCSGTYFTYKKL...   
2   sp|D6VPM8|YAJ3_YEAST  MINFLLFVLTILATLTNIWFSGVLSPAMVIRICLGGSMVVLQIWSF...   
3   sp|D6VTK4|STE2_YEAST  




Model training complete!

Model Evaluation:
Hamming Loss: 0.014022271588061061
Subset Accuracy: 0.7022556390977444
