In [None]:
# ==========================================
# Cell 1: Setup and Installation
# ==========================================

!pip install transformers pandas datasets scikit-learn accelerate
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [None]:
# ==========================================
# Cell 2: Mount Google Drive and Setup Paths
# ==========================================

# Colab-only import; the mount exposes Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Project layout: one root folder on Drive with four fixed sub-folders.
PROJECT_ROOT = '/content/drive/MyDrive/protein_classification'
DATA_DIR, MODELS_DIR, RESULTS_DIR, OUTPUTS_DIR = (
    f'{PROJECT_ROOT}/{sub}' for sub in ('data', 'models', 'results', 'outputs')
)

# exist_ok=True makes this cell safe to re-run: existing folders are left
# untouched, although the message below is printed either way.
for dir_path in (PROJECT_ROOT, DATA_DIR, MODELS_DIR, RESULTS_DIR, OUTPUTS_DIR):
    os.makedirs(dir_path, exist_ok=True)
    print(f"Created: {dir_path}")

# Later relative paths resolve against the project root.
os.chdir(PROJECT_ROOT)
print(f"Working directory: {os.getcwd()}")

Mounted at /content/drive
Created: /content/drive/MyDrive/protein_classification
Created: /content/drive/MyDrive/protein_classification/data
Created: /content/drive/MyDrive/protein_classification/models
Created: /content/drive/MyDrive/protein_classification/results
Created: /content/drive/MyDrive/protein_classification/outputs
Working directory: /content/drive/MyDrive/protein_classification


In [None]:
# ==========================================
# Cell 3: Data Reading Functions
# ==========================================

def read_data(filename, target, max_length=1800):
    """Load one gzipped TSV of proteins and return a cleaned, labelled frame.

    The file is read from ``{DATA_DIR}/{filename}.tsv.gz`` (tab separated) and
    must contain the columns ``Entry``, ``Gene Ontology (GO)`` and
    ``Sequence``.

    Cleaning steps, in order:
      1. drop rows whose GO annotation or sequence is missing,
      2. keep only the first occurrence of each distinct sequence,
      3. drop sequences longer than ``max_length`` characters.

    Args:
        filename: Base name of the file, without the ``.tsv.gz`` suffix.
        target: Class label stored in the ``target`` column of every row.
        max_length: Longest sequence (in characters) to keep. Defaults to
            1800, the cut-off that used to be hard-coded here.

    Returns:
        A new DataFrame without the ``Entry`` and ``Gene Ontology (GO)``
        columns, with an added ``target`` column and a fresh 0..n-1 index,
        or ``None`` when the file does not exist (a hint is printed).

    Raises:
        KeyError: If one of the required columns is absent from the file.
    """
    file_path = f'{DATA_DIR}/{filename}.tsv.gz'

    if not os.path.exists(file_path):
        print(f"Error: File {file_path} not found!")
        print(f"Please upload {filename}.tsv.gz to {DATA_DIR}/")
        return None

    print(f"Reading {filename}...")
    raw = pd.read_csv(file_path, compression='gzip', sep='\t')
    original_size = len(raw)

    # Data cleaning: every step returns a new frame, so the frame that was
    # read from disk is never mutated.
    cleaned = (
        raw
        .dropna(subset=['Gene Ontology (GO)', 'Sequence'])
        .drop_duplicates(subset='Sequence')  # keeps the first occurrence
    )
    cleaned = cleaned[cleaned['Sequence'].str.len() <= max_length]

    # Prepare for classification: drop the identifier/annotation columns and
    # attach the class label.
    labelled = (
        cleaned
        .drop(columns=['Entry', 'Gene Ontology (GO)'])
        .assign(target=target)
        .reset_index(drop=True)
    )

    print(f"Processed {len(labelled):,} samples (from {original_size:,})")
    return labelled

def _sample_class(class_df, n, label, random_state):
    """Down-sample one class to ``n`` rows, or keep it whole if it has fewer."""
    if len(class_df) >= n:
        return class_df.sample(n=n, random_state=random_state)
    print(f"   Note: Only {len(class_df):,} {label} samples available")
    return class_df


def read_and_prepare_data(samples_per_class=40000, random_state=42):
    """Load both classes, down-sample each one, and return a shuffled frame.

    Positive rows come from ``Extracellular_matrix_organization`` (label 1)
    and negative rows from ``Not_extracellular_matrix_organization``
    (label 0), both loaded through ``read_data``.

    Args:
        samples_per_class: Number of rows to draw from each class. A class
            with fewer rows is kept whole (a note is printed), so the result
            is only balanced when both classes have at least this many rows.
        random_state: Seed used for the per-class sampling and for the final
            shuffle, so repeated calls return the same frame.

    Returns:
        The combined, shuffled DataFrame with a fresh 0..n-1 index, or
        ``None`` if either input file could not be loaded.
    """
    print("=" * 60)
    print("PROTEIN DATA PREPARATION")
    print("=" * 60)

    # Read positive and negative samples
    dev_df_positive = read_data('Extracellular_matrix_organization', 1)
    dev_df_negative = read_data('Not_extracellular_matrix_organization', 0)

    if dev_df_positive is None or dev_df_negative is None:
        return None

    # Random sampling for a (nominally) balanced dataset
    print(f"\n Sampling {samples_per_class:,} samples from each class...")
    dev_df_positive = _sample_class(
        dev_df_positive, samples_per_class, 'positive', random_state
    )
    dev_df_negative = _sample_class(
        dev_df_negative, samples_per_class, 'negative', random_state
    )

    # Combine and shuffle; frac=1 is a full permutation of the rows.
    dev_df = pd.concat([dev_df_positive, dev_df_negative])
    dev_df = dev_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Labels are 0/1, so the column sum is the number of positive rows.
    n_total = len(dev_df)
    n_positive = int(dev_df['target'].sum())

    print(f"\n Dataset Summary:")
    print(f"   Total samples: {n_total:,}")
    print(f"   Positive (ECM): {n_positive:,}")
    print(f"   Negative (Non-ECM): {n_total - n_positive:,}")

    return dev_df

In [None]:
# ==========================================
# Cell 4: Data Splitting
# ==========================================

def split_data(df, test_size=0.2, val_size=0.2, random_state=42):
    """Split data into stratified train/validation/test sets.

    Args:
        df: DataFrame with a ``Sequence`` column (inputs) and a ``target``
            column (labels used for stratification).
        test_size: Fraction of all rows placed in the test set.
        val_size: Fraction of all rows placed in the validation set.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        A dict of plain lists with the keys ``train_seq``, ``train_labels``,
        ``val_seq``, ``val_labels``, ``test_seq`` and ``test_labels``.

    Raises:
        ValueError: If ``test_size`` or ``val_size`` is not strictly between
            0 and 1, or if together they leave no rows for training.
    """
    # Fail early with a message about the caller's arguments; otherwise the
    # error surfaces inside scikit-learn for the derived second-stage fraction.
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must each lie strictly between 0 and 1 "
            f"and sum to less than 1 (got test_size={test_size}, val_size={val_size})"
        )

    print("\n Splitting data...")
    sequences = df["Sequence"].tolist()
    labels = df["target"].tolist()

    # First split: train+val vs test
    train_val_seq, test_seq, train_val_labels, test_labels = train_test_split(
        sequences, labels,
        test_size=test_size, random_state=random_state, stratify=labels
    )

    # Second split: train vs val. val_size is a fraction of the full dataset,
    # so it is rescaled to a fraction of what remains after the test split.
    val_fraction = val_size / (1 - test_size)
    train_seq, val_seq, train_labels, val_labels = train_test_split(
        train_val_seq, train_val_labels,
        test_size=val_fraction, random_state=random_state, stratify=train_val_labels
    )

    print(f"Train samples: {len(train_seq):,}")
    print(f"Validation samples: {len(val_seq):,}")
    print(f"Test samples: {len(test_seq):,}")

    return {
        'train_seq': train_seq, 'train_labels': train_labels,
        'val_seq': val_seq, 'val_labels': val_labels,
        'test_seq': test_seq, 'test_labels': test_labels
    }

In [None]:
# ==========================================
# Cell 5: Execute Data Preparation
# ==========================================

# Build the sampled, shuffled dataset; None means an input file was missing.
df = read_and_prepare_data()

if df is None:
    print(" Data preparation failed. Please check your data files.")

else:
    # Stratified train/validation/test split
    data_splits = split_data(df)

    # Persist the splits for later use
    splits_path = f'{RESULTS_DIR}/data_splits.pkl'
    with open(splits_path, 'wb') as f:
        pickle.dump(data_splits, f)

    print(f"\n Data splits saved to: {splits_path}")

    # Preview the first rows
    print("\n Sample Data:")
    print(df.head())

    # Sequence length distribution (lengths are in characters)
    seq_lengths = df['Sequence'].str.len()
    print("\n Sequence Length Statistics:")
    print("\n".join([
        f"   Mean: {seq_lengths.mean():.1f}",
        f"   Median: {seq_lengths.median():.1f}",
        f"   Min: {seq_lengths.min()}",
        f"   Max: {seq_lengths.max()}",
    ]))

    print("\n Data preparation completed successfully!")
    print(f" All files saved in: {PROJECT_ROOT}")
    print("\n  Next: Run 02_esm2_training.ipynb")


PROTEIN DATA PREPARATION
Reading Extracellular_matrix_organization...
Processed 40,155 samples (from 44,639)
Reading Not_extracellular_matrix_organization...
Processed 46,427 samples (from 51,705)

 Sampling 40,000 samples from each class...

 Dataset Summary:
   Total samples: 80,000
   Positive (ECM): 40,000
   Negative (Non-ECM): 40,000

 Splitting data...
Train samples: 48,000
Validation samples: 16,000
Test samples: 16,000

 Data splits saved to: /content/drive/MyDrive/protein_classification/results/data_splits.pkl

 Sample Data:
                                            Sequence  target
0  MGFLSPMHPCRPPTQRRMAAGNHSTVTEFILKGLTKRADLQLPLFL...       0
1  MKPSIYSLTRDELIAWAVERGQKQFRATQIWDWLYKKRVQSFEEMT...       0
2  MLRRKPSNASEKEPTQKKKLSLQRSSSFKDFAKSKPSSPVVSEKEF...       0
3  MSPWIKHICLVLVAAFMLVKTTESKKDEALYCSACMAIADEINYSI...       0
4  MIPRVLILLTLVALFCACSTLAAVAHIEVDCIPPFTVYLLYGFVTL...       0

 Sequence Length Statistics:
   Mean: 654.5
   Median: 539.0
   Min: 2
   Max: 1800

 Data p