<a href="https://colab.research.google.com/github/Ankur-Samanta/Multimodal_Transformer_CardiacAbnormailityECGClassification/blob/main/MVMTnet_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

MVMTnet: Multi-variate Multi-modal Transformer Network for Multi-class Classification of Cardiac Irregularities Using ECG Waveforms and Clinical Notes

Authors: Ankur Samanta, Meghna Ravikumar, Mark Karlov, Christian Clarke

# Notebook Initialization

In [None]:
# Connect to Google Drive so the archives and CSV files under /content/gdrive
# that later cells read are reachable.
# force_remount=True mounts again even if the mount point is already in use.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Required Dependencies and Imports

In [None]:
# ECG Dependencies
!pip install ecg_plot
!pip install pc
!pip install wfdb

# Bert Dependencies
!pip install transformers

In [None]:
# Required Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv

import math
import random
from typing import Tuple
import copy
import ast

import pc
import os

import ecg_plot
import wfdb
from wfdb.io.record import Record, rdrecord
from wfdb.plot.plot import plot_wfdb

from socket import socket

import torch
from torch import nn, Tensor
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data.sampler import SubsetRandomSampler
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import datasets,transforms

from transformers import BertTokenizer, BertModel

# Get the Data

In [None]:
# Unzip the PTB-XL dataset

# Ankur's Path:
!unzip "/content/gdrive/MyDrive/Colab Notebooks/CardiacAbnormalityTransformerProject/physionet.org/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.2.zip"
# Meghna's Path:
!unzip "/content/gdrive/MyDrive/Fall 2022/APS360/CardiacAbnormalityTransformerProject/physionet.org/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.2.zip"
# Mark's path
!unzip "/content/gdrive/MyDrive/CardiacAbnormalityTransformerProject/physionet.org/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.2.zip"

In [None]:
# Unzip the clinical embeddings

# Chris
!unzip "/content/gdrive/MyDrive/CardiacAbnormalityTransformerProject/clinicalEmbeddings.zip"

# Ankur
!unzip "/content/gdrive/MyDrive/Colab Notebooks/CardiacAbnormalityTransformerProject/clinicalEmbeddings.zip"

#Meghna
!unzip "/content/gdrive/MyDrive/Fall 2022/APS360/CardiacAbnormalityTransformerProject/clinicalEmbeddings.zip"

# Data Preprocessing

In [None]:
def load_raw_data(df, sampling_rate, path):
    """
    Load the raw ECG signal of every record listed in ``df``.

    Args:
        df (pandas.DataFrame): Metadata table with ``filename_lr`` and
            ``filename_hr`` columns holding record paths relative to ``path``.
        sampling_rate (int): ``100`` selects the ``filename_lr`` records; any
            other value selects the ``filename_hr`` records.
        path (str): Prefix concatenated directly with each file name, so it
            must already end with a path separator.

    Returns:
        numpy.ndarray: The signal arrays stacked along a new first axis, one
        entry per row of ``df``.

    Notes:
        ``wfdb.rdsamp`` returns a ``(signals, fields)`` pair; only the signals
        are kept, for both sampling rates.
        Credits: https://physionet.org/content/ptb-xl/1.0.1/example_physionet.py
    """
    filenames = df.filename_lr if sampling_rate == 100 else df.filename_hr
    # [0] drops the metadata dict so np.array receives plain signal arrays.
    signals = [wfdb.rdsamp(path + f)[0] for f in filenames]
    return np.array(signals)

In [None]:
# Load the label/metadata table and the raw signal data.
# The CSV sits at a different Drive location per collaborator, so only paths
# that exist are considered. If several exist, the last listed one is used.
metadata_csv_candidates = [
    # Ankur/Mark/Chris's path
    '/content/gdrive/MyDrive/Colab Notebooks/CardiacAbnormalityTransformerProject/undersampledY',
    # Meghna's path
    '/content/gdrive/MyDrive/Fall 2022/APS360/CardiacAbnormalityTransformerProject/undersampledY',
]
existing_metadata_csvs = [p for p in metadata_csv_candidates if os.path.exists(p)]
if not existing_metadata_csvs:
    raise FileNotFoundError(
        "undersampledY not found; checked: {}".format(metadata_csv_candidates)
    )
df = pd.read_csv(existing_metadata_csvs[-1])

# 100 selects the filename_lr records inside load_raw_data.
X = load_raw_data(df, 100, '/content/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.2/')

(11271, 1000, 12)

In [None]:
# Sanity check: see how many ecg segments have been loaded
# (the first axis of X has one entry per row of df).
X.shape

In [None]:
# note: rename this class to something better 
class M(Dataset):
    """Dataset yielding (signal window, label index, tensor loaded from disk).

    Besides its constructor arguments, the constructor also reads the
    notebook-level ``df`` for the record ids and report texts.
    """

    def __init__(self, diagnostics_superclass, X):
        """
        Store labels and signals and build the label-string lookup table.

        Args:
            diagnostics_superclass: Indexable of label strings such as
                "['NORM']"; every entry must be a key of ``self.map``.
            X: Indexable of signal arrays aligned with the labels.
        """
        self.labels = diagnostics_superclass
        self.data = X
        self.id = df.index.to_list()
        self.notes = df.report.to_list()
        # Label string (as stored in the labels array) -> integer class index.
        self.map = {
            "['NORM']": 0,
            "['MI']": 1,
            "['STTC']": 2,
            "['CD']": 3,
            "['HYP']": 4,
            "['HYP', 'MI']": 5,
            "['HYP', 'CD']": 6,
            "['HYP', 'STTC']": 7,
            "['MI', 'CD']": 8,
            "['MI', 'STTC']": 9,
            "['CD', 'STTC']": 10,
            "['HYP', 'MI', 'STTC']": 11,
            "['HYP', 'MI', 'CD']": 12,
            "['MI', 'CD', 'STTC']": 13,
            "['HYP', 'CD', 'STTC']": 14,
            "['HYP', 'MI', 'CD', 'STTC']": 15,
            "['CD', 'NORM']": 16,
            "['CD', 'NORM', 'STTC']": 17,
            "['NORM', 'STTC']": 18,
            "['HYP', 'NORM']": 19,
            "['MI', 'NORM']": 20,
            "['HYP', 'MI', 'NORM']": 21,
            "['HYP', 'CD', 'NORM']": 22,
            "['HYP', 'MI', 'CD', 'NORM']": 23,
            "['HYP', 'MI', 'CD', 'NORM', 'STTC']": 24,
            "['HYP', 'NORM', 'STTC']": 25,
            "['MI', 'NORM', 'STTC']": 26,
            "['HYP', 'MI', 'NORM', 'STTC']": 27,
            "['HYP', 'CD', 'NORM', 'STTC']": 28,
            "['MI', 'CD', 'NORM', 'STTC']": 29,
        }

    def __getitem__(self, i):
        """
        Return the i-th sample.

        Returns:
            tuple: the first 250 entries of ``self.data[i]``, the integer class
            index of its label, and the object stored in
            ``/content/<id>.tensor`` (presumably the clinical-note embedding
            for this record; confirm against the embeddings archive).
        """
        signal_window = self.data[i][:250]
        label_index = self.map["".join(self.labels[i])]
        stored_tensor = torch.load("/content/" + str(self.id[i]) + ".tensor")
        return signal_window, label_index, stored_tensor

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.labels)

In [None]:
def get_data_loader(train_data, val_data, batch_size):
  """
  Wrap the training and validation datasets in shuffling DataLoaders.

  Args:
      train_data: Dataset used for training.
      val_data: Dataset used for validation.
      batch_size (int): Number of samples per batch for both loaders.

  Returns:
      tuple: ``(train_loader, val_loader)``, both created with ``shuffle=True``.
  """
  loader_options = {"batch_size": batch_size, "shuffle": True}
  train_loader = torch.utils.data.DataLoader(train_data, **loader_options)
  val_loader = torch.utils.data.DataLoader(val_data, **loader_options)
  return train_loader, val_loader

In [None]:
def get_split_data(dataset, train_frac=0.1, val_frac=0.1, seed=None):
  """
  Randomly split a dataset into train, validation and test subsets.

  Args:
      dataset: Any sized dataset accepted by ``torch.utils.data.random_split``.
      train_frac (float): Fraction of samples for the training subset.
      val_frac (float): Fraction of samples for the validation subset.
      seed (int, optional): When given, the shuffle uses a dedicated generator
          seeded with this value so the split is reproducible. When ``None``
          the global torch RNG is used.

  Returns:
      tuple: ``(train_subset, val_subset, test_subset)``. The test subset
      receives every sample not assigned to the other two.

  Raises:
      ValueError: If a fraction is negative or the two fractions together
          exceed the dataset size.

  Notes:
      NOTE(review): with the defaults only 10% of the data is used for training
      and 80% lands in the test subset; confirm this is intended.
  """
  if train_frac < 0 or val_frac < 0:
    raise ValueError("train_frac and val_frac must be non-negative")

  total = len(dataset)
  train_size = int(train_frac * total)
  valid_size = int(val_frac * total)
  test_size = total - train_size - valid_size
  if test_size < 0:
    raise ValueError("train_frac + val_frac must not exceed 1")

  split_options = {}
  if seed is not None:
    split_options["generator"] = torch.Generator().manual_seed(seed)

  train_subset, val_subset, test_subset = torch.utils.data.random_split(
      dataset, [train_size, valid_size, test_size], **split_options)
  return train_subset, val_subset, test_subset

In [None]:
# Build the dataset: take the label strings from df['diagnostic_superclass'],
# convert them to a NumPy array, and pair them with the raw signals in X.
diagnostics_superclass = df['diagnostic_superclass'].to_list()
diagnostics_superclass = np.array(diagnostics_superclass)
ds = M(diagnostics_superclass, X)

In [None]:
# Split the dataset into train/validation/test subsets and build shuffling
# DataLoaders with batch size 1 for the first two.
# Note: train_net builds its own loaders from train_data/val_data.
train_data, val_data, test_data = get_split_data(ds)
train_loader, val_loader = get_data_loader(train_data, val_data, 1)

In [None]:
# Sanity check: number of batches in the training loader. This equals the
# number of training samples only because the loader's batch size is 1.
print(len(train_loader))

# Model Utility Functions

In [None]:
def get_clones(module, N):
    """
    Build N independent deep copies of a module.

    Args:
        module (nn.Module): Template layer to replicate.
        N (int): Number of copies to create.

    Returns:
        nn.ModuleList: N deep copies of ``module``. Each copy has its own
        parameters, initialised to the template's current values.
    """
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

In [None]:
def scaled_dot_product_attention(query, key, value, mask=None):
    """
    Compute softmax(Q K^T / sqrt(d_h)) V; used by the multi-head attention class.

    Args:
        query (Tensor): batch_size * heads * max_len * d_h
        key (Tensor): batch_size * heads * max_len * d_h
        value (Tensor): batch_size * heads * max_len * d_h
        mask (Tensor, optional): Added to the logits after being multiplied by
            -1e9, so non-zero entries suppress the corresponding positions.
            Must broadcast to the logits' shape.

    Returns:
        Tensor: batch_size * heads * max_len * d_h
    """
    head_dim = torch.tensor(query.shape[-1], dtype=float)
    scores = torch.matmul(query, key.transpose(-2, -1)) / torch.sqrt(head_dim)
    if mask is not None:
        # Large negative logits become ~0 weight after the softmax.
        scores += mask.float() * -1e9

    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, value)

# Transformer Model

This model has eight subparts:
1. Embedder class
2. Positional encoding class
3. Multihead attention class
4. Feed forward neural network 
5. Encoder classes (encoder layer + encoder + encoder block)
6. Decoder classes (decoder layer + decoder)
7. Convolutional neural network
8. Transformer class

Note: Certain core transformer segments are inspired by existing implementations of the "Attention Is All You Need" paper. (TODO: the source reference is missing here.)

### Embedder

In [None]:
class Embedder(nn.Module):
    """Input embedding layer: a learnable table with ``vocab_size`` rows of
    width ``d_model``."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding rows selected by the integer tensor ``x``."""
        embedded = self.embed(x)
        return embedded

### Positional Encoding

In [None]:
class PositionalEncoding(nn.Module):
    '''Adds fixed sinusoidal position information to a sequence of embeddings.

    Attention itself is order-agnostic, so each position gets a distinct
    sin/cos pattern added to its (scaled) embedding.

    Args:
        d_model: Width of the embeddings.
        dropout: Dropout probability applied to the encoded output.
        max_len: Longest sequence length the precomputed table supports.
    '''

    def __init__(self, d_model, dropout=0.1, max_len=250):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() \
                            * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Non-persistent buffer: follows the module across .to()/.cuda() but is
        # not written to the state_dict, so the checkpoint keys are unaffected.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        '''Return dropout(x * sqrt(d_model) + pe[:seq_len]) without modifying x.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).

        Raises:
            ValueError: If seq_len exceeds the max_len given at construction.
        '''
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                "sequence length {} exceeds max_len {}".format(seq_len, self.pe.size(1))
            )
        # .to() is a no-op when the buffer already matches x's device and dtype.
        pe = self.pe[:, :seq_len].to(device=x.device, dtype=x.dtype)
        return self.dropout(x * math.sqrt(self.d_model) + pe)

### Multihead Attention

In [None]:
class MultiHeadAttention(nn.Module):
    '''Multi-head attention.

    Projects queries, keys and values with separate linear layers, splits
    d_model into ``heads`` slices of width d_h, applies
    scaled_dot_product_attention to every head, then concatenates the heads and
    applies a final linear projection.

    Args:
        heads: Number of attention heads; must divide d_model.
        d_model: Width of the input and output features.
    '''

    def __init__(self, heads, d_model):
        super().__init__()
        self.heads = heads
        self.d_model = d_model
        assert d_model % self.heads == 0, (
            "d_model ({}) must be divisible by heads ({})".format(d_model, heads)
        )

        self.d_h = self.d_model // self.heads

        self.q_dense = nn.Linear(d_model,d_model)
        self.k_dense = nn.Linear(d_model,d_model)
        self.v_dense = nn.Linear(d_model,d_model)

        self.out = nn.Linear(d_model,d_model)

    def forward(self, q, k, v, mask = None):
        '''Attend from q over k/v.

        Args:
            q, k, v: Tensors of shape (batch, seq_len, d_model). k and v may
                have a different seq_len than q.
            mask: Optional mask forwarded to scaled_dot_product_attention.

        Returns:
            Tensor of shape (batch, q_seq_len, d_model).
        '''
        # batch_size
        bs = q.size(0)

        # Project, then split the feature axis: (bs, len, heads, d_h).
        k = self.k_dense(k).view(bs, -1, self.heads, self.d_h)
        q = self.q_dense(q).view(bs, -1, self.heads, self.d_h)
        v = self.v_dense(v).view(bs, -1, self.heads, self.d_h)

        # Move heads ahead of the sequence axis: (bs, heads, len, d_h).
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        attended = scaled_dot_product_attention(q,k,v,mask)

        # concat each heads
        concat = attended.transpose(1,2).contiguous()\
            .view(bs,-1,self.d_model)

        return self.out(concat)

### Feed Forward Neural Network

In [None]:
class FeedForward(nn.Module):
    '''Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Width of the input and output features.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied after the ReLU.
    '''
    def __init__(self, d_model, d_ff=1000, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        '''Apply the block along the last axis of ``x``.'''
        hidden = self.dropout(F.relu(self.linear_1(x)))
        return self.linear2(hidden)

### Encoder

In [None]:
class EncoderLayer(nn.Module):
    '''Single encoder layer with pre-normalisation residual blocks:
    x + dropout(self-attention(norm(x))), followed by
    h + dropout(feed-forward(norm(h))).'''
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        self.attn = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x):
        '''Return the layer output; same shape as ``x``.'''
        # Self-attention sub-block with skip connection.
        normed = self.norm_1(x)
        attended = self.attn(normed, normed, normed)
        after_attention = x + self.dropout_1(attended)

        # Feed-forward sub-block with skip connection.
        transformed = self.ff(self.norm_2(after_attention))
        return after_attention + self.dropout_2(transformed)

class Encoder(nn.Module):
    '''Stack of N encoder layers over embedded, position-encoded token ids.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding width.
        N: Number of EncoderLayer clones.
        heads: Attention heads per layer.
    '''
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.vocab_size = vocab_size
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads), N)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, src):
        '''Encode a tensor of integer token ids.

        Ids outside [0, vocab_size - 1] are clamped into that range so the
        embedding lookup cannot go out of bounds. The clamp produces a new
        tensor; ``src`` itself is left untouched.

        Returns:
            Layer-normalised output of the last encoder layer, with one
            d_model-wide vector per input id.
        '''
        token_ids = src.clamp(0, self.vocab_size - 1)

        x = self.pe(self.embed(token_ids))
        for layer in self.layers:
            x = layer(x)
        return self.norm(x)


class EncoderBlock(nn.Module):
      """Collapses a 12-channel input into one token sequence and encodes it.

      The input is clamped to [0, 249], reshaped to (1, 250, 12), reduced to a
      single channel by a 1x1 convolution, cast to integer ids and passed to
      the Encoder. Only a single sample per call is supported because of the
      fixed reshape.
      """
      def __init__(self, vocab_size, d_model, num_layers, heads):
        super().__init__()
        self.encoder = Encoder(vocab_size,d_model,num_layers,heads)
        # 12 input channels -> 1 output channel, kernel size 1.
        self.conven = nn.Conv2d(12,1,1)

      def forward(self, src):
          # In-place clamp: this modifies the caller's tensor, so a caller that
          # reuses the same tensor afterwards sees the clamped values.
          src[src < 0] = 0
          src[src > 249] = 249

          # (1, 250, 12) -> (12, 250, 1), so the 12 entries of the last axis
          # become the leading (channel) axis expected by the convolution.
          src = torch.reshape(src,(1,250,12))
          src = torch.transpose(src,0,2)

          src = self.conven(src.float())
          src = torch.squeeze(src,2)
          
          # NOTE(review): .long() truncates the convolution output to integer
          # ids and is not differentiable, so no gradient reaches self.conven
          # through this path. Confirm this is intended.
          e_outputs = self.encoder(src.long())
          return e_outputs # 1x250x120

### Decoder

In [None]:
class DecoderLayer(nn.Module):
    '''Single decoder layer made of three pre-normalised residual sub-blocks:
    self-attention over the decoder input, attention from the decoder input
    over the encoder output, and a feed-forward network.'''
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        self.norm_3 = nn.LayerNorm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model)
        self.attn_2 = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)

    def forward(self, x, encoder_out):
        '''Return the layer output; same shape as ``x``.

        Args:
            x: Decoder-side input sequence.
            encoder_out: Encoder output used as keys and values in the second
                attention sub-block.
        '''
        # 1) self-attention
        queries = self.norm_1(x)
        self_attended = self.attn_1(queries, queries, queries)
        x = x + self.dropout_1(self_attended)

        # 2) attention over the encoder output
        queries = self.norm_2(x)
        cross_attended = self.attn_2(queries, encoder_out, encoder_out)
        x = x + self.dropout_2(cross_attended)

        # 3) feed-forward
        transformed = self.ff(self.norm_3(x))
        return x + self.dropout_3(transformed)

class Decoder(nn.Module):
    '''Decoder module of the transformer: runs N decoder layers in sequence.

    Each layer receives the running decoder state together with the encoder
    output, and the result is layer-normalised. ``self.embed`` and ``self.pe``
    are created but not used by ``forward``; the input is expected to be
    embedded already.

    Note the argument order ``(d_model, vocab_size, N, heads)``.
    '''
    def __init__(self, d_model, vocab_size, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model)
        self.layers = get_clones(DecoderLayer(d_model, heads), N)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, encoder_out):
        '''Apply every decoder layer to ``x`` and normalise the result.'''
        state = x
        for layer in self.layers:
            state = layer(state, encoder_out)
        return self.norm(state)

### Convolutional Neural Network

In [None]:
class CNN(nn.Module):
    '''Convolutional output network for the classification task.

    Three conv -> ReLU -> max-pool stages are followed by three fully
    connected layers that end in 5 outputs. ``fc1`` expects 377 flattened
    features per sample.
    '''
    def __init__(self):
        super(CNN, self).__init__()
        self.name = "cnn"
        self.conv1 = nn.Conv2d(1, 5, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(5, 1, 3)
        self.conv3 = nn.Conv2d(1, 1, 3)
        self.fc1 = nn.Linear(377, 128)
        self.fc2 = nn.Linear(128, 16)
        self.fc3 = nn.Linear(16,5)

    def forward(self, x):
        '''Return one row of 5 raw scores per leading entry of ``x``.'''
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))

        # Keep the leading axis and flatten everything else for the dense layers.
        features = x.view(x.size(0), -1)
        features = F.relu(self.fc1(features))
        features = F.relu(self.fc2(features))
        scores = self.fc3(features)
        # With 5 outputs per row this squeeze leaves the shape unchanged.
        return scores.squeeze(1)

### Transformer Model

In [None]:
class Transformer(nn.Module):
    '''Overall architecture: ECG encoder, text-conditioned decoder, CNN head.

    Args:
        vocab_size: Number of token ids for the encoder/decoder embeddings.
        d_model: Embedding width.
        num_layers: Number of layers in both the encoder and the decoder.
        heads: Attention heads per layer.

    ``self.fc`` and ``self.softmax`` are not used by ``forward``. They are kept
    so the module's parameters and state_dict keys stay the same.
    '''
    def __init__(self, vocab_size, d_model, num_layers, heads):
        super().__init__()
        self.name = "Transformer"
        self.encoderBlock = EncoderBlock(vocab_size,d_model,num_layers,heads)
        self.transform = torchvision.transforms.Resize((250,120))
        self.decoder = Decoder(d_model,vocab_size,num_layers,heads)
        self.conv_out = CNN()
        self.fc = nn.Linear(30000, 5)
        self.softmax = nn.Softmax(1)
        self.transform_cnn = torchvision.transforms.Resize((250,120))
        self.norm = nn.LayerNorm((1,250,120))

    def forward(self, src, txt=None):
        '''Classify one ECG segment together with its text embedding.

        Args:
            src: Signal tensor for a single sample (reshaped to (1, 250, 12)
                inside the encoder block).
            txt: Text-embedding tensor. Required despite the ``None`` default.

        Returns:
            Raw (pre-sigmoid) scores produced by the CNN head.

        Raises:
            TypeError: If ``txt`` is not provided.
        '''
        if txt is None:
            raise TypeError("Transformer.forward requires the text embedding `txt`")

        # Run on whatever device the model's parameters live on (CPU or GPU).
        device = next(self.parameters()).device

        # Scale the signal to integer token ids and clamp them to the id range
        # used by the encoder block. Clamping here keeps the residual input
        # below identical to what the encoder consumed.
        src = ((src + 1.68)*100).long().clamp(0, 249).to(device)

        e_outputs = self.encoderBlock(src) #encoder

        # text embeddings, resized to the (250, 120) grid of the encoder output
        txt = torch.unsqueeze(txt.to(device), 0)
        txt = self.transform(txt)

        d_output = self.decoder(txt, e_outputs) #decoder
        d_output = torch.squeeze(d_output,0)

        # Residual: resized token ids plus decoder output, normalised, then the CNN head.
        src = self.transform_cnn(src.float())
        return self.conv_out(self.norm(src + d_output)) #output

# Training

### Labels

In [None]:
# Multi-hot target rows, one per integer class index (0..29) produced by M.map.
# Columns are [NORM, MI, STTC, CD, HYP]; e.g. index 5 (['HYP', 'MI']) is
# [0,1,0,0,1] and index 13 (['MI', 'CD', 'STTC']) is [0,1,1,1,0].
W = np.array([[1,0,0,0,0],[0,1,0,0,0],[0,0,1,0,0],[0,0,0,1,0],[0,0,0,0,1],[0,1,0,0,1],[0,0,0,1,1],[0,0,1,0,1],[0,1,0,1,0],
              [0,1,1,0,0],[0,0,1,1,0],[0,1,1,0,1],[0,1,0,1,1],[0,1,1,1,0],[0,0,1,1,1],[0,1,1,1,1],[1,0,0,1,0],[1,0,1,1,0],[1,0,1,0,0],
              [1,0,0,0,1],[1,1,0,0,0],[1,1,0,0,1],[1,0,0,1,1],[1,1,0,1,1],[1,1,1,1,1],[1,0,1,0,1],[1,1,1,0,0],[1,1,1,0,1],[1,0,1,1,1],[1,1,1,1,0]])

### Evaluation

In [None]:
def evaluate(net, loader, criterion, batch_size):
  """
  Compute the error rate and mean loss of ``net`` over ``loader``.

  A prediction counts as an error when the class with the highest score is not
  one of the classes set in the sample's multi-hot target row.

  The network is switched to eval mode (dropout off) and gradients are disabled
  for the duration of the call. The previous train/eval mode is restored
  afterwards.

  Args:
      net (nn.Module): Model called as ``net(images, note)``.
      loader: Iterable of ``(images, labels, note)`` batches, where ``labels``
          holds integer class indices into the global ``W``.
      criterion: Loss taking ``(output, multi_hot_targets)``.
      batch_size: Unused; kept for interface compatibility.

  Returns:
      tuple: ``(error_rate, mean_loss)``. The error rate is averaged over
      samples and the loss over batches.

  Raises:
      ValueError: If ``loader`` yields no batches.

  Notes:
      Reads the notebook-level globals ``W`` and ``use_cuda``.
  """
  on_gpu = use_cuda and torch.cuda.is_available()

  wrong = 0
  loss_sum = 0.0
  num_samples = 0
  num_batches = 0

  was_training = net.training
  net.eval()
  try:
    with torch.no_grad():
      for images, labels, note in loader:
        #############################################
        #To Enable GPU Usage
        if on_gpu:
          images = images.float().cuda()
          note = note.cuda()
        #############################################
        output = net(images, note)

        # Integer labels -> multi-hot rows of W, one row per sample.
        label_idx = np.atleast_1d(labels.detach().cpu().numpy())
        targets_np = W[label_idx]
        targets = torch.from_numpy(targets_np).float().to(output.device)
        loss = criterion(output, targets)

        predicted = output.argmax(dim=1).cpu().numpy()
        wrong += int((targets_np[np.arange(len(predicted)), predicted] == 0).sum())
        loss_sum += loss.item()
        num_samples += len(label_idx)
        num_batches += 1
  finally:
    net.train(was_training)

  if num_batches == 0:
    raise ValueError("evaluate() received an empty loader")

  err = float(wrong) / num_samples
  loss = float(loss_sum) / num_batches
  return err, loss

### Training

In [None]:
def train_net(net,train_data,val_data,batch_size=1,learning_rate=5.0,num_epochs=3):
  """
  Train ``net`` with Adam and BCE-with-logits, then plot loss and error curves.

  After every epoch the model is evaluated on the validation data, a summary
  line is printed, the learning rate is decayed by 0.95, and the weights are
  saved to ``model_<net.name>_lr<learning_rate>_epoch<epoch>`` in the working
  directory.

  Args:
      net (nn.Module): Model called as ``net(images, note)``; must expose a
          ``name`` attribute (used in the checkpoint file name).
      train_data: Dataset yielding ``(images, label_index, note)``.
      val_data: Dataset of the same form, used for validation.
      batch_size (int): Batch size for both loaders.
      learning_rate (float): Initial Adam learning rate.
      num_epochs (int): Number of passes over the training data.

  Returns:
      None

  Notes:
      Reads the notebook-level globals ``W`` and ``use_cuda``.
      NOTE(review): the loop itself handles any batch size, but the model
      defined in this notebook reshapes its input to a single sample, so it
      presumably only works with batch_size=1.
  """
  train_loader, val_loader = get_data_loader(train_data,val_data,batch_size)
  on_gpu = use_cuda and torch.cuda.is_available()

  criterion = nn.BCEWithLogitsLoss()
  optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95)

  train_err = np.zeros(num_epochs)
  train_loss = np.zeros(num_epochs)
  val_err = np.zeros(num_epochs)
  val_loss = np.zeros(num_epochs)

  # training
  for epoch in range(0, num_epochs):
    net.train()  # evaluation at the end of an epoch may leave eval mode on
    # Snapshot one parameter so a silent "nothing is learning" case is reported.
    first_param_before = next(net.parameters()).detach().clone()

    total_train_loss = 0.0
    total_train_err = 0
    num_samples = 0
    num_batches = 0
    for images, labels, note in train_loader:
      #############################################
      #To Enable GPU Usage
      if on_gpu:
        images = images.float().cuda()
        note = note.cuda()
      #############################################

      optimizer.zero_grad()
      output = net(images,note)

      # Integer labels -> multi-hot rows of W, one row per sample.
      label_idx = np.atleast_1d(labels.detach().cpu().numpy())
      targets_np = W[label_idx]
      targets = torch.from_numpy(targets_np).float().to(output.device)
      loss = criterion(output, targets)

      loss.backward()
      optimizer.step()

      # A prediction is wrong when the top-scoring class is not set in the target row.
      predicted = output.detach().argmax(dim=1).cpu().numpy()
      total_train_err += int((targets_np[np.arange(len(predicted)), predicted] == 0).sum())
      total_train_loss += loss.item()
      num_samples += len(label_idx)
      num_batches += 1

    train_err[epoch] = float(total_train_err) / max(num_samples, 1)
    train_loss[epoch] = float(total_train_loss) / max(num_batches, 1)
    val_err[epoch], val_loss[epoch] = evaluate(net, val_loader, criterion, batch_size)
    print(("Epoch {}: Train Error: {}, Train loss: {} |"+
               "Validation Error: {}, Validation loss: {}").format(
                   epoch + 1,
                   train_err[epoch],
                   train_loss[epoch],
                   val_err[epoch],
                   val_loss[epoch]))
    if torch.equal(first_param_before, next(net.parameters()).detach()):
        print("weights not updating")

    scheduler.step()  # decay the learning rate once per epoch

    model_path = "model_{0}_lr{1}_epoch{2}".format(net.name, learning_rate, epoch)
    torch.save(net.state_dict(), model_path)

  # plotting
  epochs = np.arange(1,num_epochs+1,1)
  curves = (
      ("Training Curve", "Loss", train_loss, val_loss),
      ("Error Curve", "Error", train_err, val_err),
  )
  for title, ylabel, train_curve, val_curve in curves:
    plt.title(title)
    plt.plot(epochs, train_curve, label="Train")
    plt.plot(epochs, val_curve, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.legend(loc='best')
    plt.show()

# Execution

### Instantiate the Model

In [None]:
ntokens = 250  # size of vocabulary (number of rows in the embedding tables)
emsize = 120  # embedding dimension (d_model)
d_hid = 120  # NOTE(review): not passed to Transformer below, so it has no effect on this model
nlayers = 6  # number of layers in both the encoder and the decoder
nhead = 12  # number of attention heads (must divide emsize)
dropout = 0.2  # NOTE(review): not passed to Transformer below, so the layers keep their own dropout defaults
model = Transformer(ntokens, emsize, nlayers, nhead)

### Run the Model

In [None]:
# use_cuda is a notebook-level flag that train_net and evaluate also read as a
# global; the GPU is only used when CUDA is actually available.
use_cuda = True
if use_cuda and torch.cuda.is_available():
  model.cuda()
  print('CUDA is available!  Training on GPU ...')
else:
  print('CUDA is not available.  Training on CPU ...')

# Train for 20 epochs with batch size 1 and a learning rate of 1e-4.
train_net(model,train_data,val_data,batch_size=1,learning_rate=0.0001,num_epochs=20)