In [None]:
!pip install wandb -q

In [1]:
import wandb
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mankitt6174[0m ([33mankit_6174[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [21]:
# Define your hyperparameters
hyperparameters = {
    "learning_rate": 0.0001,
    "batch_size": 128,
    "embed_dim": 128,
    "num_heads": 4,
    "num_layers": 2,
    "dropout": 0.3,
    "ff_dim": 1024,
    "dataset_size": "50K",
    "epochs": 50,
    "name": "Third_Run",
    "Precision": "FP16"
}

wandb.init(
    project="dna-mutation-predictor-50K",
    config=hyperparameters,
    name=hyperparameters['name']
)

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train_acc,▁▁▁▂▂▂▂▃▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▆▇▇▇▇██
train_loss,██▇▇▇▇▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁
val_acc,▃▃▄▅▅▄▆▆▇▆▆▆▇▆▇▇▆▄▇▁▇▅▇▆▇▅▆██▅▇▇▆▆██▇▆
val_loss,█▇▇▆▆█▄▄▃▄▃▃▃▃▂▃▂▃▂▆▃▂▃▄▁▂▁▁▁▂▃▅▃▂▂▂▃▅

0,1
epoch,37.0
train_acc,0.637
train_loss,0.90182
val_acc,0.5865
val_loss,1.0059


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
import pandas as pd

data_path = f"/content/drive/MyDrive/dataset/{hyperparameters['dataset_size']}.csv"

data = pd.read_csv(data_path)
data.shape

(50000, 10)

In [23]:
x = data['sequence']
y = data['label']

In [24]:
def get_codon(seq, k=3):
    return [seq[i:i+k] for i in range(len(seq) - k + 1)]

vocab = {'<PAD>': 0, '<UNK>': 1}

for seq in data['sequence']:
    for codons in get_codon(seq.lower()):
        if codons not in vocab:
            vocab[codons] = len(vocab)
        else:
            continue

def get_tensor(text):
    return [vocab.get(codons.lower(), vocab['<UNK>']) for codons in get_codon(text)]

In [25]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split

class CustomDataset(Dataset):
  def __init__(self, x, y):
    self.x_frame = x
    self.y_frame = y

  def __len__(self):
    return len(self.x_frame)

  def __getitem__(self, index):
    x = torch.tensor(get_tensor(self.x_frame[index]), dtype=torch.long)
    y = torch.tensor(self.y_frame[index], dtype=torch.float32)
    return x, y

In [26]:
dataset = CustomDataset(x, y)

In [27]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

In [28]:
train_loader = DataLoader(train_dataset, batch_size=hyperparameters['batch_size'], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=hyperparameters['batch_size'])

In [29]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=5000):
        super().__init__()

        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len).unsqueeze(1)

        div_term = torch.exp((torch.arange(0, embed_dim, 2)) * (-math.log(10000.0) / embed_dim))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :].to(x.device)
        return x

class Transformer(nn.Module):
    def __init__(self, embed_dim=512, num_heads=8, num_layers=6, ff_dim=2048, dropout=0.1, vocab_size=10000, max_len=5000):
        super(Transformer, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.position_encoding = PositionalEncoding(embed_dim=embed_dim, max_len=max_len)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=num_layers
        )

        self.y_labels_out = nn.Linear(embed_dim, 5)

    def forward(self, x):
        x = self.embeddings(x)
        x = self.position_encoding(x)

        x = self.encoder(x)
        x = x.mean(dim=1)

        y_label_out = self.y_labels_out(x)
        return y_label_out

In [30]:
model = Transformer(
    embed_dim=hyperparameters['embed_dim'],
    num_heads=hyperparameters['num_heads'],
    num_layers=hyperparameters['num_layers'],
    ff_dim=hyperparameters['ff_dim'],
    dropout=hyperparameters['dropout'],
    vocab_size=len(vocab),
    max_len=200
)

In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Transformer(
  (embeddings): Embedding(66, 128)
  (position_encoding): PositionalEncoding()
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=1024, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=1024, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
    )
  )
  (y_labels_out): Linear(in_features=128, out_features=5, bias=True)
)

In [32]:
device

device(type='cuda')

In [33]:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {num_params}")

Total trainable parameters: 668805


In [40]:
ce = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=hyperparameters['learning_rate'], weight_decay=1e-4)

scaler = torch.cuda.amp.GradScaler()

  scaler = torch.cuda.amp.GradScaler()


In [45]:
def train32(model, loader, ce, optimizer):
    model.train()

    running_loss, correct, total = 0.0, 0, 0

    for x, y in loader:
        optimizer.zero_grad()

        x = x.to(device)
        y = y.to(device).long()

        output = model(x)
        loss = ce(output, y)

        prediction = torch.argmax(output, dim=1)
        correct += (prediction == y).sum().item()
        total += len(x)

        loss.backward()

        optimizer.step()

        running_loss += loss.item() * len(x)

    accuracy = correct / total

    return (
        running_loss / len(loader.dataset),
        accuracy
    )

def train16(model, loader, ce, optimizer, scaler):
    model.train()
    running_loss, correct, total = 0.0, 0, 0

    for x, y in loader:
        optimizer.zero_grad()
        x = x.to(device)
        y = y.to(device).long()

        with torch.cuda.amp.autocast(dtype=torch.float16):
            output = model(x)
            loss = ce(output, y)

        prediction = torch.argmax(output, dim=1)
        correct += (prediction == y).sum().item()
        total += len(x)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item() * len(x)

    accuracy = correct / total
    return (
        running_loss / len(loader.dataset),
        accuracy
    )

In [46]:
def validation(model, loader, ce):
    model.eval()

    running_loss, correct, total = 0.0, 0, 0

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device).long()

            with torch.cuda.amp.autocast(dtype=torch.float16):
                output = model(x)
                loss = ce(output, y)

            running_loss += loss.item() * len(x)

            prediction = torch.argmax(output, dim=1)
            correct += (prediction == y).sum().item()

            total += len(x)

    accuracy = correct / total

    return (
        running_loss / len(loader.dataset),
        accuracy
    )

In [47]:
import os

patience = 10
best_val_loss = float('inf')
counter = 0
early_stop = False

training_loss_arr = []
validation_loss_arr = []
accuracy_arr = []

save_dir = f"/content/drive/MyDrive/dna-mulation-{hyperparameters['dataset_size']}"
os.makedirs(save_dir, exist_ok=True)

for epoch in range(hyperparameters['epochs']):
    if hyperparameters['Precision'] == 'FP32':
        train_loss, train_acc = train32(
            model,
            train_loader,
            ce,
            optimizer
        )
    else:
        train_loss, train_acc = train16(
            model,
            train_loader,
            ce,
            optimizer,
            scaler
        )

    val_loss, val_acc = validation(
        model,
        test_loader,
        ce
    )

    print(f"Epoch ({epoch+1}/{50}): Train Loss = {train_loss:.4f}, Valitation Loss = {val_loss:.4f}, Train_acc = {train_acc:.4f}, Val_acc = {val_acc:.4f}")

    wandb.log({
        "epoch": epoch,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "train_acc": train_acc,
        "val_acc": val_acc
    })

    if (epoch+1) % 10 == 0:
      checkpoint_path = f"{save_dir}/model_{hyperparameters['name']}_epoch_{epoch+1}.pth"
      torch.save({
          'epoch': epoch+1,
          'model_state_dict': model.state_dict(),
          'optimizer_state_dict': optimizer.state_dict(),
          'train_losses': train_loss,
          'val_losses': val_loss
      }, checkpoint_path)
      print(f"Model saved at {checkpoint_path}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        continue
    else:
        counter += 1
        print(f"No improvement in val loss. Counter = {counter}/{patience}")
        if counter >= patience:
            print("Early stopping triggered!")
            early_stop = True
            break

  with torch.cuda.amp.autocast(dtype=torch.float16):
  with torch.cuda.amp.autocast(dtype=torch.float16):


Epoch (1/50): Train Loss = 1.0305, Valitation Loss = 1.0313, Train_acc = 0.5740, Val_acc = 0.5812
Epoch (2/50): Train Loss = 1.0208, Valitation Loss = 1.0325, Train_acc = 0.5770, Val_acc = 0.5812
No improvement in val loss. Counter = 1/10
Epoch (3/50): Train Loss = 1.0170, Valitation Loss = 1.0325, Train_acc = 0.5768, Val_acc = 0.5812
No improvement in val loss. Counter = 2/10
Epoch (4/50): Train Loss = 1.0145, Valitation Loss = 1.0242, Train_acc = 0.5773, Val_acc = 0.5823
Epoch (5/50): Train Loss = 1.0107, Valitation Loss = 1.0297, Train_acc = 0.5791, Val_acc = 0.5847
No improvement in val loss. Counter = 1/10
Epoch (6/50): Train Loss = 1.0092, Valitation Loss = 1.0236, Train_acc = 0.5800, Val_acc = 0.5845
Epoch (7/50): Train Loss = 1.0061, Valitation Loss = 1.0192, Train_acc = 0.5817, Val_acc = 0.5853
Epoch (8/50): Train Loss = 1.0042, Valitation Loss = 1.0201, Train_acc = 0.5844, Val_acc = 0.5844
No improvement in val loss. Counter = 1/10
Epoch (9/50): Train Loss = 1.0027, Valitatio

In [48]:
from sklearn.metrics import classification_report

def get_predictions_and_labels(model, loader):
    model.eval()
    all_y_true = []

    all_y_pred = []

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)

            output = model(x)

            _, prediction = torch.max(output, 1)

            all_y_true.extend(y.cpu().numpy())

            all_y_pred.extend(prediction.cpu().numpy())

    return (all_y_true, all_y_pred)

y_true, y_pred = get_predictions_and_labels(model, test_loader)

print("Classification Report for Label:")
print(classification_report(y_true, y_pred))
print("-"*20)

Classification Report for Label:
              precision    recall  f1-score   support

         0.0       0.57      0.01      0.03       616
         1.0       0.54      0.11      0.18      3081
         2.0       0.60      0.97      0.74      5812
         3.0       0.53      0.04      0.08       235
         4.0       0.00      0.00      0.00       256

    accuracy                           0.60     10000
   macro avg       0.45      0.23      0.21     10000
weighted avg       0.57      0.60      0.49     10000

--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
