In [95]:
import wget, os, gzip, pickle, random, re, sys

IMDB_URL = 'http://dlvu.github.io/data/imdb.{}.pkl.gz'
IMDB_FILE = 'imdb.{}.pkl.gz'

PAD, START, END, UNK = '.pad', '.start', '.end', '.unk'

def load_imdb(final=False, val=5000, seed=0, voc=None, char=False):

    cst = 'char' if char else 'word'

    imdb_url = IMDB_URL.format(cst)
    imdb_file = IMDB_FILE.format(cst)

    if not os.path.exists(imdb_file):
        wget.download(imdb_url)

    with gzip.open(imdb_file) as file:
        sequences, labels, i2w, w2i = pickle.load(file)

    if voc is not None and voc < len(i2w):
        nw_sequences = {}

        i2w = i2w[:voc]
        w2i = {w: i for i, w in enumerate(i2w)}

        mx, unk = voc, w2i['.unk']
        for key, seqs in sequences.items():
            nw_sequences[key] = []
            for seq in seqs:
                seq = [s if s < mx else unk for s in seq]
                nw_sequences[key].append(seq)

        sequences = nw_sequences

    if final:
        return (sequences['train'], labels['train']), (sequences['test'], labels['test']), (i2w, w2i), 2

    # Make a validation split
    random.seed(seed)

    x_train, y_train = [], []
    x_val, y_val = [], []

    val_ind = set( random.sample(range(len(sequences['train'])), k=val) )
    for i, (s, l) in enumerate(zip(sequences['train'], labels['train'])):
        if i in val_ind:
            x_val.append(s)
            y_val.append(l)
        else:
            x_train.append(s)
            y_train.append(l)

    return (x_train, y_train), \
           (x_val, y_val), \
           (i2w, w2i), 2


def gen_sentence(sent, g):

    symb = '_[a-z]*'

    while True:

        match = re.search(symb, sent)
        if match is None:
            return sent

        s = match.span()
        sent = sent[:s[0]] + random.choice(g[sent[s[0]:s[1]]]) + sent[s[1]:]

def gen_dyck(p):
    open = 1
    sent = '('
    while open > 0:
        if random.random() < p:
            sent += '('
            open += 1
        else:
            sent += ')'
            open -= 1

    return sent

def gen_ndfa(p):

    word = random.choice(['abc!', 'uvw!', 'klm!'])

    s = ''
    while True:
        if random.random() < p:
            return 's' + s + 's'
        else:
            s+= word

def load_brackets(n=50_000, seed=0):
    return load_toy(n, char=True, seed=seed, name='dyck')

def load_ndfa(n=50_000, seed=0):
    return load_toy(n, char=True, seed=seed, name='ndfa')

def load_toy(n=50_000, char=True, seed=0, name='lang'):

    random.seed(0)

    if name == 'lang':
        sent = '_s'

        toy = {
            '_s': ['_s _adv', '_np _vp', '_np _vp _prep _np', '_np _vp ( _prep _np )', '_np _vp _con _s' , '_np _vp ( _con _s )'],
            '_adv': ['briefly', 'quickly', 'impatiently'],
            '_np': ['a _noun', 'the _noun', 'a _adj _noun', 'the _adj _noun'],
            '_prep': ['on', 'with', 'to'],
            '_con' : ['while', 'but'],
            '_noun': ['mouse', 'bunny', 'cat', 'dog', 'man', 'woman', 'person'],
            '_vp': ['walked', 'walks', 'ran', 'runs', 'goes', 'went'],
            '_adj': ['short', 'quick', 'busy', 'nice', 'gorgeous']
        }

        sentences = [ gen_sentence(sent, toy) for _ in range(n)]
        sentences.sort(key=lambda s : len(s))

    elif name == 'dyck':

        sentences = [gen_dyck(7./16.) for _ in range(n)]
        sentences.sort(key=lambda s: len(s))

    elif name == 'ndfa':

        sentences = [gen_ndfa(1./4.) for _ in range(n)]
        sentences.sort(key=lambda s: len(s))

    else:
        raise Exception(name)

    tokens = set()
    for s in sentences:

        if char:
            for c in s:
                tokens.add(c)
        else:
            for w in s.split():
                tokens.add(w)

    i2t = [PAD, START, END, UNK] + list(tokens)
    t2i = {t:i for i, t in enumerate(i2t)}

    sequences = []
    for s in sentences:
        if char:
            tok = list(s)
        else:
            tok = s.split()
        sequences.append([t2i[t] for t in tok])

    return sequences, (i2t, t2i)
(x_train, y_train), (x_val, y_val), (i2w, w2i), numcls = load_imdb(final=False)
# x_train A python list of lists of integers. Each integer represents a word. Sorted from short to long.
# i2w A list of strings mapping the integers in the sequences to their original words.
# w2i A dictionary mapping the words to their indices. w2i['film'] returns the index for the word "film".
# 0 -> positive, 1 -> negative
# print(x_train[141][:5])
# print([i2w[w] for w in x_train[141]])
# print([w2i[i] for i in [i2w[w] for w in x_train[141]]][:5])
# print(y_train[141])
# print(numcls)


### Q1

In [96]:
import torch

def batch_padding(batch):
    padded_batch = []; max_len = max(len(x) for x in batch)+2
    for seq in batch:
        seq = [w2i['.start']] + seq + [w2i['.end']]
        if len(seq) < max_len:
            seq += [w2i['.pad']] * (max_len - len(seq))
        padded_batch.append(seq)
    return torch.tensor(padded_batch, dtype=torch.long)

### Q2

In [97]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SimpleRNN, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Linear layer with ReLU activation
        self.linear_relu = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()

        # Linear layer for binary classification
        self.linear_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_text):
        # Embedding layer
        embedded = self.embedding(input_text)

        # Linear layer with ReLU activation
        linear_output = self.relu(self.linear_relu(embedded))

        # Global max pooling layer
        pooled_output, _ = torch.max(linear_output, dim=1)

        # Linear layer for binary classification
        output = self.linear_out(pooled_output)

        return output

### Q3

In [88]:
from sklearn.model_selection import ParameterGrid

# Apply GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

torch.manual_seed(0)
vocab_size = len(i2w)
embedding_dim = 300
hidden_size = 300
output_dim = 2
param_grid = {
    'batch_size': [50, 100, 200],
    'epochs': [3, 5, 7],
    'learning_rate': [0.001, 0.005, 0.01]
}

# Create parameter grid
grid = ParameterGrid(param_grid)
results = []

for params in grid:
    batch_size = params['batch_size']
    epochs = params['epochs']
    learning_rate = params['learning_rate']
# Instantiate the model and move it to the device
    model = SimpleRNN(vocab_size, embedding_dim, hidden_size, output_dim).to(device)

    criterion = nn.CrossEntropyLoss() # combines nn.LogSoftmax() and nn.NLLLoss() in one.
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        
        # create batch generators in steps of batch_size-1
        x_train_gen = (x_train[i-batch_size:i] for i in range(batch_size, len(x_train), batch_size-1))
        y_train_gen = (y_train[i-batch_size:i] for i in range(batch_size, len(y_train), batch_size-1))
        
        x_val_gen = (x_val[i-batch_size:i] for i in range(batch_size, len(x_val), batch_size-1))
        y_val_gen = (y_val[i-batch_size:i] for i in range(batch_size, len(y_val), batch_size-1))
        
        # Loop over batches
        for step in range(int(len(x_train)/batch_size)):
            # Extract batch and perform padding
            x_train_batch, y_train_batch = (batch_padding(next(x_train_gen)),
                                            torch.tensor(next(y_train_gen), dtype=torch.long))
            
            # Move the data to the device
            x_train_batch = x_train_batch.to(device)
            y_train_batch = y_train_batch.to(device)
            
            # Zero the gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(x_train_batch)

            # Calculate the loss
            loss = criterion(outputs.squeeze(), y_train_batch) # squeeze() removes the extra dimension

            # Backward pass
            loss.backward()

            # Update weights
            optimizer.step()

            total_loss += loss.item()

        average_loss = total_loss / (len(x_train) / batch_size)
        #print(f"Epoch {epoch + 1}/{epochs}, Loss: {average_loss:.4f}")
        
        # Model Validation
        model.eval()
        val_loss = 0.0
        correct_predictions = 0
        total_samples = 0

        # Turn of gradient for faster computation
        with torch.no_grad():
            for val_step in range(int(len(x_val)/batch_size)):
                # Extract validation batches
                x_val_batch, y_val_batch = (batch_padding(next(x_val_gen)),
                                            torch.tensor(next(y_val_gen), dtype=torch.long))

                # Move the data to the device
                x_val_batch = x_val_batch.to(device)
                y_val_batch = y_val_batch.to(device)

                # Forward pass and compute loss
                outputs = model(x_val_batch)
                loss = criterion(outputs.squeeze(), y_val_batch)
                val_loss += loss.item()

                # Compare predictions with ground-truth
                predictions = torch.argmax(torch.softmax(outputs,1),dim=1)
                correct_predictions += (predictions == y_val_batch).sum().item()
                total_samples += y_val_batch.size(0)

        average_val_loss = val_loss / (len(x_val) / batch_size)
        val_accuracy = correct_predictions / total_samples
        #print(f"Validation Loss: {average_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    results.append({
        'batch_size': batch_size,
        'epochs': epochs,
        'learning_rate': learning_rate,
        'average_val_loss': average_val_loss,
        'val_accuracy': val_accuracy
    })
    #print(f"Batch size: {batch_size}, Epochs: {epochs}, Learning rate: {learning_rate}, Average_val_loss: {average_val_loss}, Validation Accuracy: {val_accuracy:.4f}")
best_params = max(results, key=lambda x: x['val_accuracy'])
print(f"Best parameters: {best_params}")

Batch size: 50, Epochs: 3, Learning rate: 0.001, Average_val_loss: 0.6319688534736634, Validation Accuracy: 0.6866
Batch size: 50, Epochs: 3, Learning rate: 0.005, Average_val_loss: 0.465954367518425, Validation Accuracy: 0.7746
Batch size: 50, Epochs: 3, Learning rate: 0.01, Average_val_loss: 0.736927616596222, Validation Accuracy: 0.6784
Batch size: 50, Epochs: 5, Learning rate: 0.001, Average_val_loss: 0.6231666350364685, Validation Accuracy: 0.6940
Batch size: 50, Epochs: 5, Learning rate: 0.005, Average_val_loss: 0.39970849856734275, Validation Accuracy: 0.8136
Batch size: 50, Epochs: 5, Learning rate: 0.01, Average_val_loss: 0.493350368142128, Validation Accuracy: 0.7824
Batch size: 50, Epochs: 7, Learning rate: 0.001, Average_val_loss: 0.5726725146174431, Validation Accuracy: 0.7190
Batch size: 50, Epochs: 7, Learning rate: 0.005, Average_val_loss: 0.41256222277879717, Validation Accuracy: 0.8058
Batch size: 50, Epochs: 7, Learning rate: 0.01, Average_val_loss: 0.346230031996965

### Q4

In [210]:
import torch
import torch.nn as nn
import torch.optim as optim

class Elman(nn.Module):
    def __init__(self, insize=300, outsize=300, hsize=300):
        super().__init__()
        self.lin1 = nn.Linear(insize * 2, hsize)
        self.lin2 = nn.Linear(hsize, outsize)
        self.relu = nn.ReLU()

    def forward(self, x, hidden=None):
        b, t, e = x.size()

        if hidden is None:
            hidden = torch.zeros(b, e, dtype=torch.float)
        outs = []
        
        for i in range(t):
            inp = torch.cat([x[:, i, :], hidden], dim=1)
            hidden = self.relu(self.lin1(inp))
            out = self.lin2(hidden)
            outs.append(out[:, None, :])
        return torch.cat(outs, dim=1), hidden

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.elman = Elman(embedding_dim, hidden_dim)
        self.linear_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_text):
        embedded = self.embedding(input_text)
        elman_out, _ = self.elman(embedded)
        pooled_output, _ = torch.max(elman_out, dim=1)
        output = self.linear_out(pooled_output)
        return output


In [None]:


torch.manual_seed(0)
vocab_size = len(i2w)
embedding_dim = 300
hidden_size = 300
output_dim = 2
param_grid = {
    'batch_size': [100, 300, 500],
    'epochs': [3, 5, 7],
    'learning_rate': [0.001, 0.005, 0.01]
}

# Create parameter grid
grid = ParameterGrid(param_grid)
results = []

for params in grid:
    batch_size = params['batch_size']
    epochs = params['epochs']
    learning_rate = params['learning_rate']
    model = RNN(vocab_size, embedding_dim, hidden_size, output_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    print(f"Init Batch size: {batch_size}, Epochs: {epochs}, Learning rate: {learning_rate}")
    # Assuming x_train, y_train, x_val, y_val are already on the same device (GPU)
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        print(f"Epoch {epoch + 1}/{epochs}")
        x_train_gen = (x_train[i - batch_size:i] for i in range(batch_size, len(x_train), batch_size - 1))
        y_train_gen = (y_train[i - batch_size:i] for i in range(batch_size, len(y_train), batch_size - 1))

        for step in range(int(len(x_train) / batch_size)):
            x_train_batch, y_train_batch = (
                batch_padding(next(x_train_gen)),
                torch.tensor(next(y_train_gen), dtype=torch.long),
            )

            optimizer.zero_grad()

            outputs = model(x_train_batch)

            loss = criterion(outputs.squeeze(), y_train_batch)

            loss.backward()

            optimizer.step()

            total_loss += loss.item()
            print(f"Step {step + 1}/{int(len(x_train) / batch_size)}, Loss: {loss.item():.4f}")
        average_loss = total_loss / (len(x_train) / batch_size)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {average_loss:.4f}")

        # Validation Loop
        model.eval()
        val_loss = 0.0
        correct_predictions = 0
        total_samples = 0

        # Reset the generators
        x_val_gen = (x_val[i - batch_size:i] for i in range(batch_size, len(x_val), batch_size - 1))
        y_val_gen = (y_val[i - batch_size:i] for i in range(batch_size, len(y_val), batch_size - 1))

        with torch.no_grad():
            for val_step in range(int(len(x_val) / batch_size)):
                x_val_batch, y_val_batch = (
                    batch_padding(next(x_val_gen)),
                    torch.tensor(next(y_val_gen), dtype=torch.long),
                )

                outputs = model(x_val_batch)
                loss = criterion(outputs.squeeze(), y_val_batch)
                val_loss += loss.item()

                predictions = torch.argmax(torch.softmax(outputs, 1), dim=1)
                correct_predictions += (predictions == y_val_batch).sum().item()
                total_samples += y_val_batch.size(0)

        average_val_loss = val_loss / (len(x_val) / batch_size)
        val_accuracy = correct_predictions / total_samples
        #print(f"Validation Loss: {average_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
        results.append({
        'batch_size': batch_size,
        'epochs': epochs,
        'learning_rate': learning_rate,
        'average_val_loss': average_val_loss,
        'val_accuracy': val_accuracy
    })
    print(f"Batch size: {batch_size}, Epochs: {epochs}, Learning rate: {learning_rate}, Average_val_loss: {average_val_loss}, Validation Accuracy: {val_accuracy:.4f}")
best_params = max(results, key=lambda x: x['val_accuracy'])
print(f"Best parameters: {best_params}")

In [217]:
torch.manual_seed(0)
vocab_size = len(i2w)
embedding_dim = 300
hidden_size = 300
output_dim = 2
batch_size = 100
epochs = 1
learning_rate = 0.005

# Check if GPU is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Instantiate the model and move it to the device
model = RNN(vocab_size, embedding_dim, hidden_size, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    
    # create batch generators in steps of batch_size-1
    x_train_gen = (x_train[i-batch_size:i] for i in range(batch_size, len(x_train), batch_size-1))
    y_train_gen = (y_train[i-batch_size:i] for i in range(batch_size, len(y_train), batch_size-1))
    
    x_val_gen = (x_val[i-batch_size:i] for i in range(batch_size, len(x_val), batch_size-1))
    y_val_gen = (y_val[i-batch_size:i] for i in range(batch_size, len(y_val), batch_size-1))
    
    # Loop over batches
    for step in range(int(len(x_train)/batch_size)):
        print(f"Batch step: {step+1}")
        # Extract batch and perform padding
        x_train_batch, y_train_batch = (batch_padding(next(x_train_gen)).to(device),
                                        torch.tensor(next(y_train_gen), dtype=torch.long).to(device))
        
        # Zero the gradients
        optimizer.zero_grad()
        
         # Forward pass
        outputs = model(x_train_batch)

        # Calculate the loss
        loss = criterion(outputs.squeeze(), y_train_batch)

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / (len(x_train) / batch_size)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {average_loss:.4f}")
    
    # Model Validation
    model.eval()
    val_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    # Turn of gradient for faster computation
    with torch.no_grad():
        for val_step in range(int(len(x_val)/batch_size)):
            # print(f"Val batch step: {val_step}")
            # Extract validation batches
            x_val_batch, y_val_batch = (batch_padding(next(x_val_gen)).to(device),
                                        torch.tensor(next(y_val_gen), dtype=torch.long).to(device))

            # Forward pass and compute loss
            outputs = model(x_val_batch)
            loss = criterion(outputs.squeeze(), y_val_batch)
            val_loss += loss.item()

            # Compare predictions with ground-truth
            predictions = torch.argmax(torch.softmax(outputs,1),dim=1)
            correct_predictions += (predictions == y_val_batch).sum().item()
            total_samples += y_val_batch.size(0)

    average_val_loss = val_loss / (len(x_val) / batch_size)
    val_accuracy = correct_predictions / total_samples

    print(f"Validation Loss: {average_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

Batch step: 1


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument tensors in method wrapper_CUDA_cat)