# What are autoregressive models?

### Do you remember the autoencoder?

The autoencoding approach has been very successful for images, signals, and even fully connected
models with tabular data. But what if our data is a sequence? Especially when the data is
language represented by discrete tokens, it is hard to add meaningful noise to something like a letter or
a word. Instead, we can use an autoregressive model, an approach designed specifically for
sequence (time-series) problems: the model learns to predict the next item from the items that came before it.

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from utils import View
from utils import train_network
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision.datasets import MNIST
from torchvision import transforms
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

  from tqdm.autonotebook import tqdm


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision 
from torchvision import transforms

from torch.utils.data import Dataset, DataLoader

from tqdm.autonotebook import tqdm

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

import pandas as pd

from sklearn.metrics import accuracy_score

import time

# from idlmam import train_network, Flatten, View, weight_reset,  set_seed

In [3]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import re
# Not used by any cell visible here; kept in case a later cell relies on it.
all_data = []
# Download the Shakespeare sample text. The context manager closes the HTTP
# response (and its socket) as soon as the body has been read, instead of
# leaving the connection open for the lifetime of the kernel.
with urlopen("https://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt") as resp:
    shakespear_100k = resp.read()
# Decode the raw bytes and lower-case the text so upper/lower-case letters
# share a single vocabulary entry.
shakespear_100k = shakespear_100k.decode('utf-8').lower()

In [4]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [5]:
# Character-level vocabulary: every distinct character receives the next free
# integer id, in order of first appearance in the corpus.
vocab2indx = {}
for ch in shakespear_100k:
    # setdefault only inserts when `ch` is new; len() is evaluated before the insert
    vocab2indx.setdefault(ch, len(vocab2indx))
# Inverse mapping (id -> character), used to turn generated ids back into text.
indx2vocab = {index: ch for ch, index in vocab2indx.items()}
print("Vocab Size: ", len(vocab2indx))
print("Total Characters:", len(shakespear_100k))

Vocab Size:  36
Total Characters: 99993


In [6]:
class AutoRegressiveDataset(Dataset):
    """
    Creates an autoregressive dataset from one single, long, source sequence by breaking it up into "chunks". 

    Item ``idx`` is a pair ``(x, y)`` of int64 tensors, each of length MAX_CHUNK: ``x`` holds the
    token ids of the idx'th chunk and ``y`` holds the same chunk shifted one position to the right,
    so ``y[t]`` is the token that follows ``x[t]`` in the source.
    """

    def __init__(self, large_string, MAX_CHUNK=500, vocab=None):
        """
        large_string: the original long source sequence that chunks will be extracted from
        MAX_CHUNK: the maximum allowed size of any chunk. 
        vocab: optional mapping from each item of large_string to its integer id. When None,
            the notebook-level ``vocab2indx`` is looked up at access time.
        """
        self.doc = large_string
        self.MAX_CHUNK = MAX_CHUNK
        self.vocab = vocab

    def __len__(self):
        #The number of items is the number of characters divided by chunk size. One character
        #is held back so the last chunk still has a full set of shifted labels. The max()
        #keeps the length from going negative when the document is empty.
        return max(0, (len(self.doc)-1) // self.MAX_CHUNK)

    def __getitem__(self, idx):
        n_chunks = len(self)
        #Support negative indices the same way Python sequences do
        if idx < 0:
            idx += n_chunks
        #Raising IndexError past the end is what lets plain `for x, y in dataset` terminate,
        #and it prevents silently returning empty / mismatched tensors.
        if not 0 <= idx < n_chunks:
            raise IndexError("chunk index out of range")

        #Resolve the vocabulary lazily so the global can be (re)built after construction
        vocab = self.vocab if self.vocab is not None else vocab2indx

        #Compute the starting position for the idx'th chunk
        start = idx*self.MAX_CHUNK
        end = start + self.MAX_CHUNK
        #convert the input sub-string into integers based on our vocab
        x = [vocab[c] for c in self.doc[start:end]]
        #the label sub-string is the input shifted over by 1, converted the same way
        y = [vocab[c] for c in self.doc[start+1:end+1]]
        #convert both integer lists into int64 tensors
        return torch.tensor(x, dtype=torch.int64), torch.tensor(y, dtype=torch.int64)

In [7]:
class AutoRegressive(nn.Module):
    """
    Token-level autoregressive model: an embedding layer, a stack of GRU cells (each followed by
    layer normalization), and a small fully connected head that produces one score per
    vocabulary item for the next token.
    """

    def __init__(self, num_embeddings, embd_size, hidden_size, layers=1):
        """
        num_embeddings: the vocabulary size (number of distinct token ids).
        embd_size: the dimension of each token embedding.
        hidden_size: the number of hidden units in every GRU layer.
        layers: how many GRU layers to stack (must be at least 1).
        """
        super(AutoRegressive, self).__init__()
        if layers < 1:
            raise ValueError("layers must be at least 1")
        self.hidden_size = hidden_size
        self.embd = nn.Embedding(num_embeddings, embd_size)
        #The first cell consumes embeddings, every later cell consumes the hidden state below it
        self.layers = nn.ModuleList([nn.GRUCell(embd_size, hidden_size)] + 
                                     [nn.GRUCell(hidden_size, hidden_size) for i in range(layers-1)])
        #One LayerNorm per GRU layer, applied to that layer's new hidden state
        self.norms = nn.ModuleList([nn.LayerNorm(hidden_size) for i in range(layers)])
        
        self.pred_class = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),# (B, *, D)
            nn.LeakyReLU(),
            nn.LayerNorm(hidden_size), # (B, *, D)
            nn.Linear(hidden_size, num_embeddings) #(B, *, D) -> (B, *, VocabSize)
        )
        
    def initHiddenStates(self, B):
        """
        Creates an initial hidden state list for the RNN layers. 
        
        B: the batch size for the hidden states. 

        Returns a list with one all-zero (B, self.hidden_size) tensor per layer. The tensors are
        created on the same device and with the same dtype as the model's own parameters, so the
        model does not depend on any notebook-level `device` variable.
        """
        ref = self.embd.weight
        return [torch.zeros(B, self.hidden_size, device=ref.device, dtype=ref.dtype) for _ in self.layers]
        
    def step(self, x_in, h_prevs=None):
        """
        Advances the model by a single time step.

        x_in: the input for this current time step and has shape (B) if the values need 
            to be embedded, and (B, D) if they have already been embedded. 

        h_prevs: a list of hidden state tensors each with shape (B, self.hidden_size) for each 
            layer in the network. These contain the current hidden state of the RNN layers and 
            will be updated by this call (the list is modified in place). If None, fresh zero
            states are created; they are then not visible to the caller.

        Returns the un-normalized scores (logits) for the next token, shape (B, VocabSize).
        """
        #Prep both arguments to be in the final form
        if x_in.dim() == 1: #(B), we need to embed it
            x_in = self.embd(x_in) #now (B, D)

        if h_prevs is None:
            h_prevs = self.initHiddenStates(x_in.shape[0])
        
        #Process the input, one layer at a time
        for l, (cell, norm) in enumerate(zip(self.layers, self.norms)):
            h = norm(cell(x_in, h_prevs[l]))
            #Store the new state for the next time step and feed it to the layer above
            h_prevs[l] = h
            x_in = h
        #Make predictions about the token
        return self.pred_class(x_in)
    
    def forward(self, input):
        """
        input: token ids with shape (B, T).

        Returns the next-token logits for every time step, shape (B, T, VocabSize).
        """
        #What is the batch size?
        B = input.size(0)
        #What is the max number of time steps?
        T = input.size(1)
        
        x = self.embd(input) #(B, T, D)
        
        #Initial hidden states, shared (and updated in place) across all time steps
        h_prevs = self.initHiddenStates(B)
        
        last_activations = []
        for t in range(T):
            x_in = x[:,t,:] #(B, D)
            last_activations.append(self.step(x_in, h_prevs))
        
        last_activations = torch.stack(last_activations, dim=1) #(B, T, VocabSize)
        
        return last_activations

In [8]:
# Cut the corpus into 250-character chunks and serve them in shuffled batches of 128.
autoRegData = AutoRegressiveDataset(shakespear_100k, MAX_CHUNK=250)
autoReg_loader = DataLoader(autoRegData, batch_size=128, shuffle=True)

# One embedding row per vocabulary entry, 32-d embeddings, two GRU layers of width 128.
autoReg_model = AutoRegressive(len(vocab2indx), 32, 128, layers=2).to(device)


def _clamp_gradient(grad):
    """Clip every entry of a gradient tensor into the range [-2, 2]."""
    return grad.clamp(-2, 2)


# Gradient clipping: the hook runs whenever a gradient is computed for the parameter.
for param in autoReg_model.parameters():
    param.register_hook(_clamp_gradient)

In [9]:
def CrossEntLossTime(x, y):
    """
    Cross-entropy loss for a whole sequence: at every time step the cross-entropy is averaged
    over the batch, and the per-step averages are then summed over time.

    x: output with shape (B, T, V)
    y: labels with shape (B, T), holding class indices in [0, V)

    Returns a scalar tensor with the summed loss.
    """
    B, T, V = x.shape
    #Score every (batch, time) position in a single call instead of looping over T in Python
    per_token = nn.functional.cross_entropy(x.reshape(B*T, V), y.reshape(B*T), reduction='none')
    #Mean over the batch for each time step, then sum of the per-step prediction errors
    return per_token.view(B, T).mean(dim=0).sum()

In [14]:
# Train the autoregressive model for 50 epochs on the chunked corpus, with
# CrossEntLossTime as the loss function.
# NOTE(review): train_network is imported from the local `utils` module and is not shown
# here; presumably its return value is the per-epoch results table displayed below — confirm.
train_network(autoReg_model, CrossEntLossTime, autoReg_loader, epochs=50, device=device)

Epoch: 100%|██████████| 50/50 [01:15<00:00,  1.50s/it]


Unnamed: 0,epoch,total time,train loss
0,0,1.749971,757.020416
1,1,3.376275,742.32399
2,2,5.062328,723.743301
3,3,6.682607,704.701569
4,4,8.317533,684.282745
5,5,9.941523,664.779068
6,6,11.367311,647.298874
7,7,12.898011,633.062164
8,8,14.341451,617.980743
9,9,15.834649,605.811142


In [15]:
# Put the model in evaluation mode before generating text (eval() returns the module itself).
autoReg_model = autoReg_model.eval()
# Buffer for one generated sequence of 500 token ids, all zero to begin with.
sampling = torch.zeros(1, 500, dtype=torch.long, device=device)

In [16]:
# Prompt that generation starts from; lower-cased because the corpus was lower-cased.
seed = "EMILIA:".lower()
seed_ids = [vocab2indx[ch] for ch in seed]
cur_len = len(seed_ids)
# Write the encoded prompt into the first positions of the sampling buffer.
sampling[0, :cur_len] = torch.tensor(seed_ids)

In [17]:
# Naive generation: for every new position the model re-reads the entire prefix.
with torch.no_grad():
    for pos in tqdm(range(cur_len, sampling.size(1))):
        logits = autoReg_model(sampling[:, :pos])[:, -1, :]  # scores from the last time step only
        probs = F.softmax(logits, dim=1)  # turn scores into probabilities
        next_tokens = torch.multinomial(probs, 1)  # draw the next token at random
        sampling[:, pos] = next_tokens  # store it in the buffer
        # track how much of the buffer has been filled
        cur_len += 1

100%|██████████| 493/493 [00:52<00:00,  9.35it/s]


In [18]:
# Turn the generated token ids back into characters and show the text.
token_ids = sampling.cpu().numpy().flatten()
s = [indx2vocab[token_id] for token_id in token_ids]
print("".join(s))

emilia:
a felliabir ullings: ahait ocessices fulled
and thy suybir printries it love givented mermight
truse indeed denity emass kingdariand
dukess shear pary my him.

craindus:
faind up mountiof they tov joingty,
couds poof
the many shund but's hour the will:
as for ga hat dewardy.

suchiress:
feed fate,
but i hert caod:
tentiniady, good inetsulf,
that love: that age,yu lord, let of their of here?

churublet:
i camen you have oft could breenute of foierduphter you bruve hears undands ronielit a


In [19]:
# Regenerate everything after the seed, this time with temperature-scaled sampling.
cur_len = len(seed)
temperature = 0.75  # the scores are divided by this value before the softmax
with torch.no_grad():
    for pos in tqdm(range(cur_len, sampling.size(1))):
        logits = autoReg_model(sampling[:, :pos])[:, -1, :]  # scores from the last time step
        probs = F.softmax(logits / temperature, dim=1)  # temperature-scaled probabilities
        next_tokens = torch.multinomial(probs, 1)
        sampling[:, pos] = next_tokens

        cur_len += 1

100%|██████████| 493/493 [00:53<00:00,  9.16it/s]


In [20]:
# Decode the buffer: look up the character for every token id, then join and print.
s = []
for token_id in sampling.cpu().numpy().flatten():
    s.append(indx2vocab[token_id])
print("".join(s))

emilia:
he supp
and with the sport then good you know.

pratester:
now a whiles but me light'd heart;
and the such would god polas, and a with hath peater: i have sfeory their you here thruafft commed this. in that prevery unlention stain make with my faurter:
whouth entrans and simble,
and whire thy dank
thene woy.

king secontion:
ge wordd tarry three me as of as hath with thee him addveres, and stand then the fullaght and sight, he forethink when.

piciness:
a must to was boul partanous soule


In [21]:
# Same generation loop with a very low temperature, which sharpens the distribution
# so the most likely token is picked almost every time.
cur_len = len(seed)
temperature = 0.05
with torch.no_grad():
    for pos in tqdm(range(cur_len, sampling.size(1))):
        logits = autoReg_model(sampling[:, :pos])[:, -1, :]  # scores from the last time step
        probs = F.softmax(logits / temperature, dim=1)  # temperature-scaled probabilities
        next_tokens = torch.multinomial(probs, 1)
        sampling[:, pos] = next_tokens

        cur_len += 1
# Decode the generated ids back into text and print it.
token_ids = sampling.cpu().numpy().flatten()
s = [indx2vocab[token_id] for token_id in token_ids]
print("".join(s))

100%|██████████| 493/493 [00:44<00:00, 11.14it/s]

emilia:
the soul shall and the seem the sould to the soul the soul the soul the soul shall heart the prainter the seen the sould to the soul the soul the soul the soul the soul the soul the soul shall and the seen the soul the soul the sould to the soul the soul shall and the soul the soul and the soul the soul the soul the soul the soul the soul the soul the seen the soul the sould to the soul the soul the soul the soul the soul the soul the soul the soul the soul shall and the soul and the sou





In [22]:
#Set up our seed and the location to store the generated content
seed = "EMILIA:".lower()
cur_len = len(seed)
sampling = torch.zeros((1, 500), dtype=torch.int64, device=device)
sampling[0,0:cur_len] = torch.tensor([vocab2indx[x] for x in seed])

#pick a temperature
temperature = 0.75
with torch.no_grad():
    #initialize the hidden state to avoid redundant work
    h_prevs = autoReg_model.initHiddenStates(1)
    #push the seed through
    for i in range(0, cur_len):
        h = autoReg_model.step(sampling[:,i], h_prevs=h_prevs)

    #generate new text one character at a time
    for i in tqdm(range(cur_len, sampling.size(1))):
        h = F.softmax(h/temperature, dim=1) #make probabilities
        next_tokens = torch.multinomial(h, 1)
        sampling[:,i] = next_tokens
        cur_len += 1
        #now push only the new sample into the model
        h = autoReg_model.step(sampling[:,i], h_prevs=h_prevs)

100%|██████████| 493/493 [00:00<00:00, 1098.45it/s]


In [23]:
# Map each generated id to its character and print the resulting text.
s = list(map(indx2vocab.__getitem__, sampling.cpu().numpy().flatten()))
print("".join(s))

emilia:
i have gracose we my lord, of the pears of thou in dead hamberoughs
maled in eyeray, to know nower, we all be pitent queen in suffir and brach hearth be lught in offence.
good my lord, sir, and made that are and the begnitlus had my husband, and i her.

repralus:
and her such dives this ampleader thy opwill be for well
and the subter seent with see.

valunioo:
and the fellew their well now swear but lagy:
if, if my ambarrower's their that soue thee
fare and are my mound and hillods the c
