# 01_Library

## install

In [1]:
try:
    import torchtext
except ImportError:
    ! pip install -q torchtext==0.17.0
    import torchtext

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m799.2 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## import

In [18]:
import os
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset#, random_split

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

----------

# 02_Utils

## system

In [3]:
# Which environment the notebook runs in: "local" or "colab".
system = "colab"

# Reject unknown environment names before any path is derived.
if system not in ("local", "colab"):
    raise ValueError("Invalid system")

if system == "colab":
    # Colab: the project lives on the mounted Google Drive.
    root_path = '/content/'
    project_path = r"/content/drive/MyDrive/Catalist/1_language modeling/"
    dataset_path = os.path.join(project_path, r'dataset/wikitext-2/')
else:
    # Local: paths are relative to the working directory.
    project_path = r"./"
    dataset_path = './wikitext-2/'


## device

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

## drive mount

In [5]:
# Mount Google Drive only when running on Colab; in the "colab" configuration
# `project_path` points below '/content/drive'.
if system== "colab":
    # Imported here because google.colab is only needed on the Colab branch.
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


## number of params fn

In [None]:
def num_trainable_params(model):
  """Return the number of trainable parameters of `model`, in millions.

  Only parameters with `requires_grad` set are counted. The raw element count
  is divided by 1e6, so the result is a float.
  """
  trainable_count= 0
  for param in model.parameters():
    if param.requires_grad:
      trainable_count += param.numel()
  return trainable_count/ 1e6

--------

# 03_Data

In [6]:
class WikiTextDataset:
    """Re-iterable view of a UTF-8 text file that yields one stripped line at a time."""

    def __init__(self, file_path):
        # Only the path is stored; the file is opened when iteration starts.
        self.file_path= file_path

    def __iter__(self):
        # Each call opens a fresh handle, so the object can be iterated again
        # from the first line.
        with open(self.file_path, mode= 'r', encoding= 'utf-8') as handle:
            yield from map(str.strip, handle)

# One re-iterable line source per WikiText-2 split.
train_iter, valid_iter, test_iter= (
    WikiTextDataset(os.path.join(dataset_path, file_name))
    for file_name in ("wiki.train.tokens", "wiki.valid.tokens", "wiki.test.tokens")
)

# Peek at the first two training lines as a quick sanity check.
train_iter_= iter(train_iter)
print(next(train_iter_), next(train_iter_), sep= "\n")


= Valkyria Chronicles III =


-------

# 04_Tokenize & Vocab

In [7]:
# torchtext's 'basic_english' tokenizer, used for every split.
tokenizer= get_tokenizer('basic_english')

# The vocabulary is built from the training split only; '<unk>' is the single
# special token and is also the index returned for out-of-vocabulary words.
vocab= build_vocab_from_iterator(
    (tokenizer(line) for line in train_iter),
    specials= ['<unk>'],
)
vocab.set_default_index(vocab['<unk>'])

# Save the vocabulary to the current working directory.
torch.save(obj= vocab, f= 'vocab.pt')

# Spot check: look up the ids of a few words.
vocab(['amir', 'hi', 'rookie', 'fouladi'])

[0, 9206, 6358, 0]

-----

# 05_EDA

## mean sentence length

In [8]:
# Running totals over every non-empty "sentence" in the training split.
total_sentence_count= 0
total_sentence_length= 0

for line in train_iter:
    # A sentence is approximated as the text between two '.' characters.
    for sentence in line.split('.'):
        n_tokens= len(sentence.strip().split())
        if n_tokens == 0:
            # Empty fragments (blank lines, trailing '.') are not sentences.
            continue
        total_sentence_count += 1
        total_sentence_length += n_tokens

mean_sentence_length= total_sentence_length/ total_sentence_count

print(f'Mean sentence length in Wikitext-2: {mean_sentence_length:.2f}')

Mean sentence length in Wikitext-2: 21.69


## most common and least common words

In [9]:
# Token frequencies over the whole training split, counted in reading order.
freqs= Counter(token for line in train_iter for token in tokenizer(line))

In [10]:
# The 20 most frequent tokens, most frequent first.
freqs.most_common(20)

[('the', 130768),
 (',', 102615),
 ('.', 83397),
 ('of', 57030),
 ('<unk>', 54625),
 ('and', 50735),
 ('in', 45015),
 ('to', 39521),
 ('a', 36523),
 ('=', 29570),
 ('was', 21008),
 ("'", 18484),
 ('@-@', 16906),
 ('on', 15140),
 ('as', 15058),
 ('s', 14936),
 ('that', 14351),
 ('for', 13794),
 ('with', 13012),
 ('by', 12718)]

In [11]:
# The last 20 entries of the frequency-sorted list, i.e. the rarest tokens
# (in the recorded output below, each of them occurs 3 times).
freqs.most_common()[-20:]

[('yellowwoods', 3),
 ('tomentum', 3),
 ('secretion', 3),
 ('spruces', 3),
 ('lewenthal', 3),
 ('caprices', 3),
 ('étude', 3),
 ('mineurs', 3),
 ('sonatine', 3),
 ('chants', 3),
 ('philipp', 3),
 ('prefaced', 3),
 ('kreutzer', 3),
 ('forrester', 3),
 ('zoromski', 3),
 ('roundabouts', 3),
 ('tuscola', 3),
 ('northeasterly', 3),
 ('intergrades', 3),
 ('gallinae', 3)]

-------

# 06_Preprocessing

In [12]:
def data_process(raw_text_iter, seq_len, vocab_fn=None, tokenizer_fn=None):
  """Turn raw text lines into (inputs, targets) matrices for next-token prediction.

  Every line is tokenized and mapped to token ids, the ids of all lines are
  joined into one flat stream, and the stream is cut into rows of `seq_len`
  ids. `targets` is the same stream shifted by one position, so
  `targets[i, j]` is the token that follows `inputs[i, j]`.

  Args:
    raw_text_iter: iterable of text lines.
    seq_len: number of tokens per row.
    vocab_fn: callable mapping a list of tokens to a list of ids. Defaults to
      the notebook-level `vocab`.
    tokenizer_fn: callable mapping a line to a list of tokens. Defaults to
      the notebook-level `tokenizer`.

  Returns:
    A tuple `(inputs, targets)` of int64 tensors, both of shape
    `(M, seq_len)` with `M = total_tokens // seq_len`. Leftover tokens that
    do not fill a whole row are dropped. When there are fewer than `seq_len`
    tokens (including none at all) both tensors have shape `(0, seq_len)`.
  """
  # Look up the notebook globals only when no override is given.
  to_tokens= tokenizer if tokenizer_fn is None else tokenizer_fn
  to_ids= vocab if vocab_fn is None else vocab_fn

  # Build one flat id list and a single tensor; this also copes with an empty
  # iterator, on which torch.cat over per-line tensors would raise.
  token_ids= [idx for line in raw_text_iter for idx in to_ids(to_tokens(line))]
  data= torch.tensor(token_ids, dtype= torch.long)

  M, r= divmod(len(data), seq_len)
  if r == 0:
    # The stream ends exactly on a row boundary, so the last target would be
    # missing; append id 0 as a filler for that final position.
    data= torch.cat((data, data.new_zeros(1)))

  # An explicit row count (not -1) keeps reshape valid when M == 0.
  inputs= data[:M* seq_len].reshape(M, seq_len)
  targets= data[1: M*seq_len +1].reshape(M, seq_len)

  return inputs, targets

In [13]:
# Number of tokens per training sequence.
seq_len= 35

# Convert each split into (inputs, targets) matrices, in train/valid/test order.
(x_train, y_train), (x_valid, y_valid), (x_test, y_test)= (
    data_process(split_iter, seq_len)
    for split_iter in (train_iter, valid_iter, test_iter)
)

# Show the shapes: inputs and targets of a split must match.
tuple(t.shape for t in (x_train, y_train, x_valid, y_valid, x_test, y_test))

(torch.Size([58571, 35]),
 torch.Size([58571, 35]),
 torch.Size([6126, 35]),
 torch.Size([6126, 35]),
 torch.Size([6910, 35]),
 torch.Size([6910, 35]))

-------

# 07_Custom Dataset

In [14]:
class CustomDataset(Dataset):
  """Map-style dataset that pairs each input sequence with its target sequence.

  Args:
    inputs: indexable collection of input sequences.
    targets: indexable collection of target sequences, aligned with `inputs`.

  Raises:
    ValueError: if `inputs` and `targets` do not have the same length.
  """

  def __init__(self, inputs, targets):
    # A length mismatch would otherwise only surface later as an IndexError
    # (or go unnoticed) while batches are being drawn.
    if len(inputs) != len(targets):
      raise ValueError(
          f"inputs and targets must have the same length, "
          f"got {len(inputs)} and {len(targets)}")
    self.inputs= inputs
    self.targets= targets

  def __len__(self):
    # Number of sequences (first dimension of `inputs`).
    return len(self.inputs)

  def __getitem__(self, idx):
    # Return the aligned (input, target) pair at position `idx`.
    return self.inputs[idx], self.targets[idx]

In [15]:
# Wrap each split's (inputs, targets) matrices in a dataset object.
train_set, valid_set, test_set= (
    CustomDataset(inputs, targets)
    for inputs, targets in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test))
)

--------

# 08_DataLoader

In [16]:
# Training batch size; the evaluation loaders use twice this value.
batch_size= 20

# Only the training loader shuffles its sequences.
train_loader= DataLoader(dataset= train_set, batch_size= batch_size, shuffle= True)

# Validation and test loaders keep the original order.
valid_loader, test_loader= (
    DataLoader(dataset= eval_set, batch_size= 2*batch_size, shuffle= False)
    for eval_set in (valid_set, test_set)
)

In [17]:
# Draw one training batch and check that inputs and targets have the same
# (batch_size, seq_len) shape.
x_batch, y_batch= next(iter(train_loader))
x_batch.shape, y_batch.shape

(torch.Size([20, 35]), torch.Size([20, 35]))

-------

# 09_Model

In [19]:
class LanguageModel(nn.Module):
  """LSTM language model: embedding -> dropout -> LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: number of tokens; size of the embedding table and of the
      output layer.
    embedding_dim: size of each token embedding.
    hidden_dim: size of the LSTM hidden state.
    num_layers: number of stacked LSTM layers.
    dropout_embd: dropout probability applied to the embeddings.
    dropout_rnn: dropout probability passed to `nn.LSTM`; it is ignored
      when `num_layers` is 1.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers,
                dropout_embd= 0.5, dropout_rnn= 0.5):
    super().__init__()
    self.num_layers= num_layers
    self.hidden_dim= hidden_dim
    self.embedding_dim= embedding_dim

    self.embedding= nn.Embedding(vocab_size, embedding_dim)
    # Overwrite the default initialization with a uniform one in [-0.1, 0.1];
    # nn.init does this without going through `.data`.
    nn.init.uniform_(self.embedding.weight, -0.1, 0.1)
    self.dropout= nn.Dropout(p=dropout_embd)

    # nn.LSTM applies dropout only between stacked layers and warns when a
    # non-zero value is given for a single layer, so pass 0 in that case.
    lstm_dropout= dropout_rnn if num_layers > 1 else 0.0
    self.lstm= nn.LSTM(embedding_dim, hidden_dim, num_layers= num_layers,
                        dropout= lstm_dropout, batch_first= True)

    self.fc= nn.Linear(hidden_dim, vocab_size)

  def forward(self, src):
    """Return one score per vocabulary entry for every position of `src`.

    `src` holds token ids with the batch dimension first (the LSTM is built
    with `batch_first=True`). The LSTM state is not passed in, so every call
    starts from the default initial state. The output has `vocab_size`
    entries in its last dimension.
    """
    embedded= self.dropout(self.embedding(src))
    # The final (h, c) state is not needed for per-position predictions.
    output, _= self.lstm(embedded)
    return self.fc(output)

In [20]:
# Build the model; the vocabulary size comes from the vocabulary built on the
# training split.
model= LanguageModel(
    vocab_size= len(vocab),
    embedding_dim= 300,
    hidden_dim= 512,
    num_layers= 2,
    dropout_embd= 0.65,   # dropout on the embeddings
    dropout_rnn= 0.5,     # dropout passed to the LSTM
)
model

LanguageModel(
  (embedding): Embedding(28782, 300)
  (dropout): Dropout(p=0.65, inplace=False)
  (lstm): LSTM(300, 512, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=512, out_features=28782, bias=True)
)

In [None]:
# Trainable parameters in millions, one per line: whole model, embedding,
# LSTM, output layer.
print(
    *map(num_trainable_params, (model, model.embedding, model.lstm, model.fc)),
    sep= "\n",
)

----------

# 10_