<a href="https://colab.research.google.com/github/AhmedZeer/ml.py/blob/master/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
!pip install torchdata
!pip install portalocker

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2->torchdata)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=2->torchdata)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=2->torchdata)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch>=2->torchdat

In [17]:
from torchtext.datasets import IMDB
train_data = IMDB( split = "train" )
test_data = IMDB( split = "test" )

In [18]:
import torch

In [19]:
from torch.utils.data.dataset import random_split

In [20]:
torch.manual_seed(0)

train_set, valid_set = random_split( list(train_data), [20000,5000])

In [21]:
text_ = "<image> MY PERSonAL oBino ! :D"

In [22]:
import re

In [23]:
re.sub("<[^>]*>", '', text_)

' MY PERSonAL oBino ! :D'

In [25]:
def tokenizer( text_ ):
  emotes = re.findall("(?::|=|;)(?:-)?(?:\(|\)|p|d)",text_.lower())
  text_ = re.sub("(?::|=|;)(?:-)?(?:\(|\)|p|d)","",text_.lower())
  text = re.sub('[\W+]',' ', text_.lower()) + ' '.join(emotes).replace('-', '')
  return text.split()

In [26]:
tokenizer(text_)

['image', 'my', 'personal', 'obino', ':d']

In [27]:
re.sub('[\W+]',' ', text_.lower())

' image  my personal obino    d'

In [28]:
from collections import Counter, OrderedDict

In [29]:
token_count = Counter()
for label, line in train_set:
  tokenized = tokenizer(line)
  token_count.update(tokenized)

In [30]:
token_count.__len__()

68451

In [31]:
from torchtext.vocab import vocab

In [32]:
sorted_tuple_freq = sorted( token_count.items(), key = lambda x:x[1], reverse = True  )

In [33]:
ordered_dict = OrderedDict(sorted_tuple_freq)

In [34]:
vocab = vocab(ordered_dict)

In [35]:
vocab.insert_token("<unk>",0)
vocab.insert_token("<pad>",1)

In [36]:
vocab.set_default_index(0)

In [37]:
vocab['osaudfh']

0

In [38]:
text_pipeline = lambda x: [ vocab[token] for token in tokenizer(x) ]
label_pipeline = lambda x: True if x == 'pos' else False

In [39]:
def collate_batch(batch):

  label_list, text_list, lengths = [],[],[]

  for label_, text_ in batch:
    label_list.append(label_pipeline(label_))
    text_ = torch.tensor(text_pipeline(text_), dtype = torch.int64)
    text_list.append(text_)
    lengths.append(text_.size(0))

  label_list = torch.tensor(label_list).float()
  lengths = torch.tensor(lengths)
  text_list = nn.utils.rnn.pad_sequence(text_list,batch_first = True)

  return text_list, label_list, lengths

In [40]:
from torch.utils.data import DataLoader

In [41]:
batch_size = 32
train_dl = DataLoader( train_set, batch_size = batch_size, shuffle = False, collate_fn = collate_batch )
valid_dl = DataLoader( valid_set, batch_size = batch_size, shuffle = False, collate_fn = collate_batch )
test_dl = DataLoader( test_data, batch_size = batch_size, shuffle = False, collate_fn = collate_batch )

In [42]:
import torch.nn as nn

In [43]:
next(iter(train_dl))

(tensor([[   2,  299,  446,  ...,   30,  547, 2865],
         [  12,    7,    2,  ...,    0,    0,    0],
         [  12,   19,   15,  ...,    0,    0,    0],
         ...,
         [  11,  116,  649,  ...,    0,    0,    0],
         [   6,  273,    9,  ...,    0,    0,    0],
         [  12,   19,   15,  ...,    0,    0,    0]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]),
 tensor([625, 116, 137, 108, 160, 140,  68,  51, 155,  88, 104, 130, 134, 459,
         579, 126, 154, 309, 331,  58, 220, 199, 160, 489, 145, 184, 188, 175,
         172, 155, 147, 125]))

In [44]:
text_list, label_list, lengths = next(iter(train_dl))
print(text_list.shape)
print(label_list.shape)
print(lengths.shape)

torch.Size([32, 625])
torch.Size([32])
torch.Size([32])


In [45]:
embedding = nn.Embedding( num_embeddings = 10,
                         embedding_dim = 3,
                         padding_idx = 0 )

In [46]:
class RNN(nn.Module):
  def __init__(self, input_layer, hidden_layer):
    super().__init__()
    self.rnn = nn.RNN( input_layer, hidden_layer, num_layers = 2, batch_first = True )
    self.fc = nn.Linear(hidden_layer, 1)

  def forward(self, x):
    _,hidden = self.rnn(x)
    out = hidden[-1,:,:]
    return self.fc(out)

In [47]:
rnn = RNN(64,32)
print(rnn)
tensor = torch.randn(5,3,64)
rnn(tensor)

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


tensor([[ 0.5131],
        [ 0.2918],
        [ 0.3124],
        [-0.4557],
        [-0.2058]], grad_fn=<AddmmBackward0>)

In [48]:
class RNN(nn.Module):
  def __init__(self, vocab_size, embed_size, rnn_hidden, fc_hidden):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, embed_size, padding_idx = 0 )
    self.rnn = nn.LSTM( embed_size, rnn_hidden, batch_first = True )
    self.fc1 = nn.Linear( rnn_hidden, fc_hidden )
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(  fc_hidden, 1 )
    self.sigmoid = nn.Sigmoid()

  def forward( self, text, lengths ):
    out = self.embed(text)
    out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), batch_first = True, enforce_sorted = False )
    out, (hidden,cell) = self.rnn(out)
    out = hidden[-1,:,:]
    out = self.fc1(out)
    out = self.fc2(out)
    out = self.relu(out)
    return self.sigmoid(out)

In [49]:
vocab_size = len(vocab)
vocab_size

68453

In [50]:
embed_size = 32
fc_hidden = 64
rnn_hidden = 64
torch.manual_seed(0)
rnn = RNN(vocab_size, embed_size=embed_size, rnn_hidden = rnn_hidden, fc_hidden = fc_hidden )

In [51]:
rnn

RNN(
  (embed): Embedding(68453, 32, padding_idx=0)
  (rnn): LSTM(32, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [52]:
optim = torch.optim.Adam(rnn.parameters(), lr = 0.001)
loss_fn = nn.BCELoss()
def train(dataloader):
  total_acc = 0
  total_loss = 0
  rnn.train()
  for text_batch, label_batch, lengths in dataloader:
    rnn.zero_grad()
    pred = rnn(text_batch, lengths).squeeze()
    # print(pred)
    # print(label_batch)
    cost = loss_fn(pred,label_batch)
    cost.backward()
    optim.step()
    total_acc  += (( pred >= 0.5 ).float() == label_batch).float().sum()
    total_loss += cost.item() * label_batch.size(0)
  return( total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset) )

In [53]:
def evaluate(dataloader):
  with torch.no_grad:
    for text_batch, label_batch, lengths in dataloader:
      pred = rnn(text_batch, lengths).squeeze()
      cost = loss_fn(pred,label_batch)
      total_acc  += (( pred >= 0.5 ).float() == label_batch).float().sum()
      total_loss += cost.item() * label_batch.size(0)

  return( total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset) )

In [54]:
!curl -O https://www.gutenberg.org/files/1268/1268-0.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 1143k  100 1143k    0     0  4547k      0 --:--:-- --:--:-- --:--:-- 4557k


In [55]:
import numpy as np

In [56]:
with open("/content/1268-0.txt", "r") as fp:
  text = fp.read()

start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')

text = text[start_indx: end_indx]

In [57]:
char_text = set(text)

In [58]:
print("Unique Chars:",len(char_text))
print("Chars:",len(text))

Unique Chars: 85
Chars: 1130711


In [59]:
char_sorted = sorted(char_text)

In [60]:
char2int = {ch:i for i,ch in enumerate(char_sorted)}

In [61]:
char_array = np.array(char_sorted)

In [62]:
encoded_text = np.array([ char2int[ch] for ch in text ])

In [63]:
for ex in encoded_text[:10]:
  print("{} -> {}".format(ex,char_array[ex]))

48 -> T
36 -> H
33 -> E
1 ->  
41 -> M
53 -> Y
47 -> S
48 -> T
33 -> E
46 -> R


In [64]:
import torch
from torch.utils.data import Dataset

In [65]:
seq_len = 40
chunk_size = seq_len + 1
text_chunks = [encoded_text[i:chunk_size+i] for i in range( len(encoded_text) - chunk_size )]

In [66]:
class TextDataset(Dataset):
  def __init__(self, text_chunks):
    self.text_chunks = text_chunks

  def __len__(self):
    return len(self.text_chunks)

  def __getitem__(self,idx):
    text_chunk = self.text_chunks[idx]
    return ( text_chunk[:-1].long(), text_chunk[1:].long() )

seq_dataset = TextDataset(torch.tensor(text_chunks))

  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [67]:
from torch.utils.data import DataLoader

In [68]:
train_dl = DataLoader( seq_dataset, batch_size = 32, shuffle = True, drop_last = True )

In [69]:
import torch.nn as nn

In [70]:
class RNN( nn.Module ):

  def __init__(self, vocab_size, embed_dim, rnn_hidden_num ):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, embed_size)
    self.rnn_hidden_size = rnn_hidden_num
    self.rnn = nn.LSTM( embed_size, self.rnn_hidden_size, batch_first = True )
    self.fc = nn.Linear(self.rnn_hidden_size, vocab_size)

  def forward(self, x, hidden, cell):
    out = self.embed(x).unsqueeze(1)
    out, (hidden,cell) = self.rnn(out,(hidden,cell))
    out = self.fc(out).reshape(out.size(0),-1)
    return out, hidden, cell

  def init_hidden(self, batch_size):
    hidden = torch.zeros( 1, batch_size, self.rnn_hidden_size )
    cell   = torch.zeros( 1, batch_size, self.rnn_hidden_size )
    return (hidden, cell)

In [71]:
vocab_size = len(char_array)
embed_size = 256
rnn_hidden_num = 512
model = RNN(vocab_size, embed_size, rnn_hidden_num)

In [72]:
model

RNN(
  (embed): Embedding(85, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)

In [73]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RNN(
  (embed): Embedding(85, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)

In [74]:
loss_fn = nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(),0.001)

In [75]:
n_epochs = 10000
batch_size = 32
for e in range(n_epochs):
  hidden,cell = model.init_hidden(batch_size)
  hidden,cell = hidden.to(device), cell.to(device)
  seq_batch, target_batch = next(iter(train_dl))
  seq_batch, target_batch = seq_batch.to(device), target_batch.to(device)
  loss = 0
  optim.zero_grad()
  for c in range(seq_len):
    pred, hidden, cell = model(seq_batch[:,c], hidden, cell)
    loss += loss_fn(pred,target_batch[:,c])
  loss.backward()
  optim.step()
  loss = loss.item() / seq_len
  if( e % 100 == 0 ):
    print(loss)

4.422719955444336


KeyboardInterrupt: 

In [76]:
from torch.distributions.categorical import Categorical

In [91]:
def sample(model, starting_str, len_generated = 200, scale_factor = 1 ):

  encoded_input = torch.tensor([char2int[s] for s in starting_str])
  encoded_input = torch.reshape(encoded_input, (1,-1))
  generated_str = starting_str

  model.eval()
  hidden,cell = model.init_hidden(1)
  for c in range(len(starting_str)-1):
    pred, hidden, cell = model(encoded_input[:,c].view(1),hidden,cell)

  last_char = encoded_input[:,-1]
  for i in range(len_generated):
    logits,hidden,cell = model( last_char.view(1), hidden, cell )
    logits = torch.squeeze(logits, 0)
    scaled_logits = logits * scale_factor
    m = Categorical(logits = logits)
    last_char = m.sample()
    generated_str += str(char_array[last_char])

  return generated_str
# print(sample(model,"Hello"))

Hellow might be so,” replied the engineer, might have taken a moments he has all his putrated where their fury, eyes improve from
the clear on a spars right as far in a mercy, striking the necessary
peccar


In [78]:
model.eval()

RNN(
  (embed): Embedding(85, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)

In [79]:
with open("lstm.pt", "wb") as f:
  torch.save(model,f)

In [80]:
# prompt: load torch model

with open("/content/drive/MyDrive/lstm.pt", "rb") as f:
  model = torch.load(f,map_location=torch.device('cpu'))


In [81]:
# prompt: change device of model

device = torch.device("cpu")
model.to(device)


RNN(
  (embed): Embedding(85, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)

In [95]:
print(sample(model,"Hello", scale_factor = 2))

Hellow, when it was to be feet, was thus
living two granite! Spilett
remarked that the cotcembers tust, but he observed frigates.

On the 7th of October, the rapid part had not served to
have an abundance 
