In [3]:
import pandas as pd
from pathlib import Path
from sklearn.cluster import KMeans
import numpy as np

# --- 1. Load the Processed Data ---
print("Loading processed data...")
data_path = Path('../data/processed/protein_structures.parquet')

if not data_path.exists():
    print(f"❌ ERROR: File not found at {data_path}")
    print("Please run 'python run.py' in your terminal first.")
else:
    df = pd.read_parquet(data_path)
    print(f"✅ Full dataset with {len(df)} atoms loaded.")

    # --- 2. Isolate the Protein Backbone ('CA' atoms) ---
    backbone_df = df[df['atom_name'] == 'CA'].copy()
    print(f"✅ Backbone DataFrame created with {len(backbone_df)} atoms.")

    # --- 3. Create Location Tokens using K-Means ---
    # The number of clusters must be less than the number of samples (atoms)
    if len(backbone_df) > 0:
        num_location_tokens = 512 # You can adjust this value
        if len(backbone_df) < num_location_tokens:
            num_location_tokens = len(backbone_df) # Prevent error if samples < clusters

        print(f"Creating {num_location_tokens} location tokens...")
        
        coords = backbone_df[['x_coord', 'y_coord', 'z_coord']].values
        kmeans = KMeans(n_clusters=num_location_tokens, random_state=42, n_init='auto')
        kmeans.fit(coords)
        
        # Add the tokens to our DataFrame
        backbone_df['location_token'] = kmeans.predict(coords)
        
        print("✅ Tokenization complete.")
        print("\n--- Final Prepared Data ---")
        display(backbone_df.head())
    else:
        print("❌ Backbone DataFrame is empty. Cannot proceed with tokenization.")

Loading processed data...
✅ Full dataset with 11552 atoms loaded.
✅ Backbone DataFrame created with 760 atoms.
Creating 512 location tokens...
✅ Tokenization complete.

--- Final Prepared Data ---


Unnamed: 0,pdb_id,chain_id,residue_name,residue_seq_id,atom_name,x_coord,y_coord,z_coord,bfactor,location_token
1,1l2y,A,ASN,1,CA,-8.608,3.135,-1.618,0.0,161
17,1l2y,A,LEU,2,CA,-4.923,4.002,-2.452,0.0,329
36,1l2y,A,TYR,3,CA,-3.69,2.738,0.981,0.0,266
57,1l2y,A,ILE,4,CA,-5.857,-0.449,0.613,0.0,227
76,1l2y,A,GLN,5,CA,-4.122,-1.167,-2.743,0.0,345


In [4]:
import numpy as np
SEQUENCE_LENGTH = 128 
sequences = backbone_df.groupby('pdb_id')['location_token'].apply(list).tolist()

print(f"Created {len(sequences)} individual protein token sequences.")
X = [] 
y = [] 

for seq in sequences:
    
    if len(seq) > SEQUENCE_LENGTH:
        
        for i in range(len(seq) - SEQUENCE_LENGTH):
            X.append(seq[i:i + SEQUENCE_LENGTH])
            y.append(seq[i + 1:i + SEQUENCE_LENGTH + 1])


X = np.array(X)
y = np.array(y)

print("\n✅ Training data created successfully.")
print(f"Shape of our input data (X): {X.shape}")
print(f"Shape of our target data (y): {y.shape}")

Created 1 individual protein token sequences.

✅ Training data created successfully.
Shape of our input data (X): (632, 128)
Shape of our target data (y): (632, 128)


In [None]:
%pip install torch torchvision torchaudio
import torch
import torch.nn as nn
import math


class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)


class ProteusTransformer(nn.Module):
    def __init__(self, vocab_size: int, d_model: int, nhead: int, d_hid: int, nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, src: torch.Tensor) -> torch.Tensor:
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.linear(output)
        return output


VOCAB_SIZE = 512    
D_MODEL = 256       
N_HEAD = 8          
D_HID = 512         
N_LAYERS = 4        
DROPOUT = 0.1       


model = ProteusTransformer(VOCAB_SIZE, D_MODEL, N_HEAD, D_HID, N_LAYERS, DROPOUT)

print("✅ ProteusTransformer model created successfully!")
print("\n--- Model Architecture ---")
print(model)

INFO: pip is looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.
Collecting torchaudio
  Downloading torchaudio-2.7.0-cp312-cp312-win_amd64.whl.metadata (6.7 kB)
  Downloading torchaudio-2.6.0-cp312-cp312-win_amd64.whl.metadata (6.7 kB)
  Downloading torchaudio-2.5.1-cp312-cp312-win_amd64.whl.metadata (6.5 kB)
  Downloading torchaudio-2.5.0-cp312-cp312-win_amd64.whl.metadata (6.5 kB)
  Downloading torchaudio-2.4.1-cp312-cp312-win_amd64.whl.metadata (6.5 kB)
  Downloading torchaudio-2.4.0-cp312-cp312-win_amd64.whl.metadata (6.4 kB)
  Downloading torchaudio-2.3.1-cp312-cp312-win_amd64.whl.metadata (6.4 kB)
INFO: pip is still looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.
  Downloading torchaudio-2.3.0-cp312-cp312-win_amd64.whl.metadata (6.4 kB)
  Downloading torchaudio-2.2.2-cp312-cp312-win_amd64.whl.metadata (6.4 kB)
Dow

In [6]:
from torch.utils.data import TensorDataset, DataLoader


BATCH_SIZE = 32


X_tensor = torch.from_numpy(X).long()
y_tensor = torch.from_numpy(y).long()


train_data = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)


criterion = nn.CrossEntropyLoss()


optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


EPOCHS = 10 #

print("--- Starting Model Training ---")

model.train() 

for epoch in range(EPOCHS):
    epoch_loss = 0
    for i, (data, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        
        output = model(data)
        

        loss = criterion(output.view(-1, VOCAB_SIZE), targets.view(-1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS} | Average Loss: {avg_loss:.4f}")

print("\n✅ Training complete!")

--- Starting Model Training ---
Epoch 1/10 | Average Loss: 3.5308
Epoch 2/10 | Average Loss: 0.6340
Epoch 3/10 | Average Loss: 0.2273
Epoch 4/10 | Average Loss: 0.1661
Epoch 5/10 | Average Loss: 0.1475
Epoch 6/10 | Average Loss: 0.1383
Epoch 7/10 | Average Loss: 0.1321
Epoch 8/10 | Average Loss: 0.1294
Epoch 9/10 | Average Loss: 0.1252
Epoch 10/10 | Average Loss: 0.1242

✅ Training complete!


In [7]:
import torch
import numpy as np


model.eval() 

input_sequence = torch.from_numpy(X[0, :1]).long().unsqueeze(0)


generated_tokens = input_sequence[0].tolist()


GENERATION_LENGTH = 200
TEMPERATURE = 0.8 

print("🧬 Generating new protein sequence...")

with torch.no_grad(): 
    for _ in range(GENERATION_LENGTH):
        
        output = model(input_sequence)
        
        
        last_token_logits = output[0, -1, :]
        
       
        scaled_logits = last_token_logits / TEMPERATURE
        probabilities = torch.nn.functional.softmax(scaled_logits, dim=0)
        
        
        next_token = torch.multinomial(probabilities, 1)
        
        
        generated_tokens.append(next_token.item())
        
        
        input_sequence = torch.tensor([generated_tokens]).long()

print("✅ New protein token sequence generated successfully!")
print(f"\n--- Generated Token Sequence (first 15 tokens) ---\n{generated_tokens[:15]}")


generated_coords = kmeans.cluster_centers_[generated_tokens]


output_protein_path = "../results/generated_proteins/novel_protein_1.xyz"

with open(output_protein_path, "w") as f:
    f.write(f"{len(generated_coords)}\n")
    f.write("Generated by Proteus AI\n") 
    for coord in generated_coords:
       
        f.write(f"C {coord[0]:.3f} {coord[1]:.3f} {coord[2]:.3f}\n")

print(f"\n⭐️ Your new protein has been saved to: {output_protein_path}")

🧬 Generating new protein sequence...
✅ New protein token sequence generated successfully!

--- Generated Token Sequence (first 15 tokens) ---
[161, 186, 88, 295, 397, 71, 134, 342, 106, 139, 382, 99, 109, 428, 479]

⭐️ Your new protein has been saved to: ../results/generated_proteins/novel_protein_1.xyz
