# Богданов Александр Иванович, Б05-003

## Модель автокодировщика

In [23]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import torch
from torch.utils.data import TensorDataset
from torch.utils.tensorboard import SummaryWriter

from nltk.tokenize import RegexpTokenizer
from prettytable import PrettyTable

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

### Вспомогательные функции

In [25]:
class Tokenizer(object):
    def __init__(self, word_to_ind, tokenizer):
        self.word_to_ind = word_to_ind
        self.tokenizer = tokenizer
    def __call__(self, sentences, max_length = 10, pad_to_max_length = False):
        tokens = self.tokenizer.tokenize_sents(sentences)
        if not pad_to_max_length:
            max_length = min(max_length, max(map(len, tokens)))
        tokens = [['[CLS]'] + s + ['[SEP]'] + ['[PAD]']*(max_length-len(s)) \
                  if len(s) < max_length \
                  else ['[CLS]'] + s[:max_length] + ['[SEP]'] \
                  for s in tokens ]
        ids = [[self.word_to_ind.get(w, self.word_to_ind['[UNK]']) for w in sent] for sent in tokens]
        return torch.tensor(ids)

In [26]:
def word_dict(dataset_train, trashhold=1):
    helps = {}
    for sent in tqdm(dataset_train.values[:, 1]):
        for word in RegexpTokenizer('[a-zA-Z]+|[^\w\s]|\d+').tokenize(sent):
            if word in helps:
                helps[word] += 1
            else:
                helps[word] = 1
    
    word2idx = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3}
    idx2word = {0: '[PAD]', 1: '[UNK]', 2: '[CLS]', 3: '[SEP]'}
    for elem, number in helps.items():
        if number >= trashhold and elem not in word2idx:
            word2idx[elem] = len(word2idx)
            idx2word[len(idx2word)] = elem
    
    return word2idx, idx2word

In [27]:
def check(batch_size, dataset, model, loss_function, idx2word):
    
    model.eval()
    
    batch_generator = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size)
            
    test_loss = 0
    for it, (x_batch, y_batch) in enumerate(batch_generator):
        x_batch = x_batch.to(model.device)
        y_batch = y_batch.to(model.device)
                
        output = model(x_batch)

        test_loss += loss_function(output.transpose(1,2), y_batch).cpu().item()*len(x_batch)
      
    test_loss /= len(dataset)

    print(f'loss: {test_loss}')
    
    dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
    
    x, y = next(iter(dataloader))
    x = x.to(device)
    y = y.to(device)

    outputs = model(x)
    
    one_x = x[0].cpu().numpy()
    one_output = outputs[0].argmax(dim=-1).cpu().numpy()
    
    words = [idx2word[idx] for idx in one_x]
    pred_words = [idx2word[idx] for idx in one_output]

    table = PrettyTable(["Word", "Predict"])
    table.align["Word"], table.align["Predict"] = "l", "l"

    for word, pred in zip(words, pred_words):
        if word != idx2word[word2idx['[PAD]']]:
            table.add_row([word, pred])

    print(table)
    
    return test_loss

In [28]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    model.train()
    optimizer.zero_grad()
    
    output = model(x_batch.to(model.device))

    loss = loss_function(output.transpose(1,2), y_batch.to(device))
    loss.backward()

    optimizer.step()
    return loss.cpu().item()

In [29]:
def train_epoch(train_generator, model, loss_function, optimizer, callback = None):
    epoch_loss = 0
    total = 0
    for it, (batch_of_x, batch_of_y) in enumerate(train_generator):
        batch_loss = train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function)
        
        if callback is not None:
            with torch.no_grad():
                callback(model, batch_loss)
            
        epoch_loss += batch_loss*len(batch_of_x)
        total += len(batch_of_x)
    
    return epoch_loss/total

In [30]:
def trainer(count_of_epoch, 
            batch_size, 
            dataset,
            model, 
            loss_function,
            optimizer,
            lr = 0.001,
            callback = None):

    optima = optimizer(model.parameters(), lr=lr)
    
    iterations = tqdm(range(count_of_epoch), desc='epoch')
    iterations.set_postfix({'train epoch loss': np.nan})
    for it in iterations:
        batch_generator = tqdm(
            torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True), 
            leave=False, total=len(dataset)//batch_size+(len(dataset)%batch_size> 0))
        
        epoch_loss = train_epoch(train_generator=batch_generator, 
                    model=model, 
                    loss_function=loss_function, 
                    optimizer=optima, 
                    callback=callback)
        
        iterations.set_postfix({'train epoch loss': epoch_loss})

In [31]:
class callback():
    def __init__(self, writer, dataset, loss_function, delimeter = 300, batch_size=64):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size

        self.dataset = dataset

    def forward(self, model, loss):
        model.eval()
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)
        
        if self.step % self.delimeter == 0:
            
            model.eval()
            
            batch_generator = torch.utils.data.DataLoader(dataset=self.dataset, batch_size=self.batch_size)
            
            test_loss = 0
            for it, (x_batch, y_batch) in enumerate(batch_generator):
                x_batch = x_batch.to(model.device)
                y_batch = y_batch.to(model.device)

                output = model(x_batch)

                test_loss += self.loss_function(output.transpose(1,2), y_batch).cpu().item()*len(x_batch)
            
            test_loss /= len(self.dataset)

            print(f'\t\tstep={self.step}, train_loss={loss}, val_loss={test_loss}')
            
            self.writer.add_scalar('LOSS/test', test_loss, self.step)
          
    def __call__(self, model, loss):
        return self.forward(model, loss)

## Модель

In [175]:
class Encoder(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device

    def __init__(self, vocab_dim, emb_dim, latent_dim, num_layers=3, dropout=0, batch_norm=False):
        super(type(self), self).__init__()

        self.emb = torch.nn.Embedding(vocab_dim, emb_dim)
        self.lstm = torch.nn.LSTM(emb_dim, latent_dim, num_layers, dropout=dropout, batch_first=True)
        if batch_norm:
            self.batch_norm = torch.nn.BatchNorm1d(latent_dim)
        else:
            self.batch_norm = None

    def forward(self, x):
        
        out = self.emb(x)
        _, (h, c) = self.lstm(out)
            
        if self.batch_norm is not None:
            out = self.batch_norm(out.transpose(1,2)).transpose(1,2)
                
        out = torch.cat([h, c], dim=-1).transpose(0, 1)[:, -1, :]
        return out   

In [177]:
class Decoder(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device

    def __init__(self, vocab_dim, latent_dim, emb_dim, hidden_dim, num_layers=3, dropout=0, batch_norm=False):
        super(type(self), self).__init__()
        
        self.num_layers = num_layers
        
        self.h0 = torch.nn.Linear(latent_dim, hidden_dim)
        self.c0 = torch.nn.Linear(latent_dim, hidden_dim)
        
        self.emb = torch.nn.Embedding(1, emb_dim)
        
        self.lstm = torch.nn.LSTM(emb_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        if batch_norm:
            self.batch_norm = nn.BatchNorm1d(emb_dim)
        else:
            self.batch_norm = None
        self.linear = torch.nn.Linear(hidden_dim, vocab_dim)

    def forward(self, latent_vector):
        
        h = self.h0(latent_vector).unsqueeze(0).repeat(self.num_layers, 1, 1)
        c = self.c0(latent_vector).unsqueeze(0).repeat(self.num_layers, 1, 1)
        
        emb = self.emb(torch.zeros(len(latent_vector), 1).long())
        
        logits = []
        for i in range(12):
            out, (h, c) = self.lstm(emb, (h, c))
                
            if self.batch_norm is not None:
                out = self.batch_norm(out.transpose(1,2)).transpose(1,2)
                
            logits.append(out[:,-1,:])
        
        out = torch.stack(logits, 1)
        out = self.linear(out)
        return out

In [178]:
class Autoencoder(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device

    def __init__(self, vocab_dim, emb_dim, latent_dim, hidden_dim, num_layers=3, dropout=0, batch_norm=False):
        super(type(self), self).__init__()
        
        self.encoder = Encoder(vocab_dim, emb_dim, latent_dim, num_layers, dropout, batch_norm)
        self.decoder = Decoder(vocab_dim, 2 * latent_dim, emb_dim, hidden_dim, num_layers, dropout, batch_norm)
        
    def forward(self, x):
        return self.decoder(self.encoder(x))

## Подключим tensorboard

In [15]:
%load_ext tensorboard
%tensorboard --logdir tensorboard_3/

## Скачаем данные

In [181]:
dataset = pd.read_csv('twitter.csv')

In [182]:
dataset = dataset[dataset[['tag', 'message']].notnull().all(1)]
dataset = dataset.sample(125000, random_state=42)
train_mask = np.random.rand(len(dataset), ) < 0.8
dataset_train = dataset[train_mask]
dataset_test = dataset[~train_mask]

In [171]:
word2idx, idx2word = word_dict(dataset_train)

  0%|          | 0/40131 [00:00<?, ?it/s]

In [172]:
tokenizer = Tokenizer(word2idx, RegexpTokenizer('[a-zA-Z]+|[^\w\s]|\d+'))

In [173]:
train_data_sent = tokenizer(dataset_train.values[:, 1])
test_data_sent = tokenizer(dataset_test.values[:, 1])

In [174]:
dataset_train_pt = TensorDataset(train_data_sent, train_data_sent)
dataset_test_pt = TensorDataset(test_data_sent, test_data_sent)

## Обучение

In [19]:
loss_function = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam

In [20]:
dim_list = [10, 30, 50]
num_layers_list = [3, 5, 7]
dropout_list = [0, 0.3, 0.5]
batch_norm_list = [False, True]

In [462]:
for dim in dim_list:
    print(f'dim = {dim}')
    
    model = Autoencoder(vocab_dim=len(word2idx), emb_dim=dim, latent_dim=dim, hidden_dim=dim)
    model.to(device)
    
    writer = SummaryWriter(log_dir=f'tensorboard_3/hidden_dim_{dim}')
    call = callback(writer, dataset_test_pt, loss_function)
    
    check(64, dataset_test_pt, model, loss_function, idx2word)
    trainer(count_of_epoch=1, 
            batch_size=64, 
            dataset=dataset_train_pt,
            model=model, 
            loss_function=loss_function,
            optimizer = optimizer,
            lr=0.001,
            callback = call)
    check(64, dataset_test_pt, model, loss_function, idx2word)

dim = 10
loss: 11.563268907107823, acc: 0.0
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | LOVATO  |
| @       | LOVATO  |
| Lottie  | LOVATO  |
| does    | LOVATO  |
| have    | LOVATO  |
| but     | LOVATO  |
| I       | LOVATO  |
| Secrets | LOVATO  |
| enough  | LOVATO  |
| ,       | LOVATO  |
| media   | LOVATO  |
| [SEP]   | LOVATO  |
+---------+---------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=7.299795627593994, val_loss=7.12644046951143, val_acc=0.06318147970852614
		step=600, train_loss=6.401655197143555, val_loss=6.434572429316113, val_acc=0.08333333333333333
		step=900, train_loss=6.4732232093811035, val_loss=6.395022167968507, val_acc=0.08333333333333333
		step=1200, train_loss=6.331718921661377, val_loss=6.372049239371521, val_acc=0.08333333333333333
		step=1500, train_loss=6.281586170196533, val_loss=6.34330854717198, val_acc=0.08333333333333333
loss: 6.333392953804203, acc: 0.16666666666666666
+-------+---------+
| Word  | Predict |
+-------+---------+
| [CLS] | [CLS]   |
| @     | [SEP]   |
| Your  | [SEP]   |
| tird  | [SEP]   |
| left  | [SEP]   |
| -     | [SEP]   |
| -     | [SEP]   |
| -     | [SEP]   |
| from  | [SEP]   |
| had   | [SEP]   |
| beest | [SEP]   |
| [SEP] | [SEP]   |
+-------+---------+
dim = 30
loss: 11.603923020607093, acc: 0.0
+-----------+----------------+
| Word      | Predict        |
+-----------+----------------+
| 

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=6.270175933837891, val_loss=6.423658425145615, val_acc=0.08333333333333333
		step=600, train_loss=6.227050304412842, val_loss=6.244577080586776, val_acc=0.1661879687250678
		step=900, train_loss=6.067775726318359, val_loss=6.078313124242612, val_acc=0.16664007233657785
		step=1200, train_loss=5.929986953735352, val_loss=5.932060687188538, val_acc=0.16730825488005957
		step=1500, train_loss=5.965603351593018, val_loss=5.8387442323719085, val_acc=0.21379514387532578
loss: 5.81597448007825, acc: 0.2166839529812244
+----------+---------+
| Word     | Predict |
+----------+---------+
| [CLS]    | [CLS]   |
| contact  | @       |
| Axwy     | @       |
| HDQ      | @       |
| Griffin  | -       |
| 2        | -       |
| morning  | -       |
| spamming | -       |
| exam     | [SEP]   |
| tweeties | [SEP]   |
| better   | [SEP]   |
| [SEP]    | [SEP]   |
+----------+---------+
dim = 50
loss: 11.618581306959085, acc: 3.3242912611031327e-06
+----------+---------+
| Word

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=6.355953693389893, val_loss=6.237808837224103, val_acc=0.1661879687250678
		step=600, train_loss=5.834472179412842, val_loss=5.964174136716783, val_acc=0.1666866124142333
		step=900, train_loss=5.958387851715088, val_loss=5.8269660668033, val_acc=0.2138150896228924
		step=1200, train_loss=5.636808395385742, val_loss=5.769651941159743, val_acc=0.23065594915164087
		step=1500, train_loss=5.630359649658203, val_loss=5.7268549810476195, val_acc=0.2333585979469177
loss: 5.7233956420518615, acc: 0.23084543375352376
+--------------+---------+
| Word         | Predict |
+--------------+---------+
| [CLS]        | [CLS]   |
| issues       | @       |
| better       | [CLS]   |
| cheerleaderr | I       |
| plastered    | -       |
| hurting      | -       |
| SouthernBets | -       |
| spamming     | -       |
| and          | -       |
| controll     | [PAD]   |
| jaz          | [SEP]   |
| [SEP]        | [SEP]   |
+--------------+---------+


Скорее всего очень простая модель для этой задачи, качество не очень. Но, чем больше размерность, тем лучше.

In [465]:
for num_layers in num_layers_list:
    print(f'num_layers = {num_layers}')
    
    model = Autoencoder(vocab_dim=len(word2idx), emb_dim=128, latent_dim=128, hidden_dim=128, num_layers=num_layers)
    model.to(device)
    
    writer = SummaryWriter(log_dir=f'tensorboard_3/num_layers_{num_layers}')
    call = callback(writer, dataset_test_pt, loss_function)
    
    check(64, dataset_test_pt, model, loss_function, idx2word)
    trainer(count_of_epoch=1, 
            batch_size=64, 
            dataset=dataset_train_pt,
            model=model, 
            loss_function=loss_function,
            optimizer = optimizer,
            lr=0.001,
            callback = call)
    check(64, dataset_test_pt, model, loss_function, idx2word)

num_layers = 3
loss: 11.578179473870867, acc: 0.0
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | JMMcCoy |
| @       | JMMcCoy |
| Elena   | JMMcCoy |
| Park    | JMMcCoy |
| -       | JMMcCoy |
| Feet    | JMMcCoy |
| gallito | ewi     |
| [SEP]   | ewi     |
+---------+---------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=7.106370449066162, val_loss=7.126648493849415, val_acc=0.08333333333333333
		step=600, train_loss=6.496007442474365, val_loss=6.4371417085767195, val_acc=0.08333333333333333
		step=900, train_loss=6.489595890045166, val_loss=6.398349907897735, val_acc=0.08333333333333333
		step=1200, train_loss=6.308361530303955, val_loss=6.3605779610088, val_acc=0.08333333333333333
		step=1500, train_loss=6.266372203826904, val_loss=6.327354115619602, val_acc=0.16666666666666666
loss: 6.324021055231601, acc: 0.16666666666666666
+----------+---------+
| Word     | Predict |
+----------+---------+
| [CLS]    | [CLS]   |
| @        | [SEP]   |
| flyziks  | [SEP]   |
| house    | [SEP]   |
| halliday | [SEP]   |
| working  | [SEP]   |
| ICE      | [SEP]   |
| pains    | [SEP]   |
| -        | [SEP]   |
| nieuwste | [SEP]   |
| -        | [SEP]   |
| [SEP]    | [SEP]   |
+----------+---------+
num_layers = 5
loss: 11.620166636738306, acc: 0.0
+-----------+---------------+
| Word     

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=7.0511016845703125, val_loss=7.159190635200282, val_acc=0.08333333333333333
		step=600, train_loss=6.418607711791992, val_loss=6.449256312548496, val_acc=0.08333333333333333
		step=900, train_loss=6.348978519439697, val_loss=6.4002232479292065, val_acc=0.08333333333333333
		step=1200, train_loss=6.201016902923584, val_loss=6.364319235091196, val_acc=0.08333333333333333
		step=1500, train_loss=6.471507549285889, val_loss=6.331271079961089, val_acc=0.16666666666666666
loss: 6.323718598418893, acc: 0.16666666666666666
+-------+---------+
| Word  | Predict |
+-------+---------+
| [CLS] | [CLS]   |
| 2     | [SEP]   |
| Some  | [SEP]   |
| and   | [SEP]   |
| Fell  | [SEP]   |
| -     | [SEP]   |
| -     | [SEP]   |
| -     | [SEP]   |
| Jeff  | [SEP]   |
| hendy | [SEP]   |
| hendy | [SEP]   |
| [SEP] | [SEP]   |
+-------+---------+
num_layers = 7
loss: 11.579867020736799, acc: 0.0
+--------------+------------+
| Word         | Predict    |
+--------------+----------

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=7.214671611785889, val_loss=7.196025737986451, val_acc=0.014646827296420403
		step=600, train_loss=6.351457595825195, val_loss=6.434693607432584, val_acc=0.08333333333333333
		step=900, train_loss=6.39912748336792, val_loss=6.3942550520244845, val_acc=0.08333333333333333
		step=1200, train_loss=6.149362087249756, val_loss=6.363613786205866, val_acc=0.08333333333333333
		step=1500, train_loss=6.497238636016846, val_loss=6.3277928188160235, val_acc=0.16666666666666666
loss: 6.322121474623813, acc: 0.16666666666666666
+--------------+---------+
| Word         | Predict |
+--------------+---------+
| [CLS]        | [CLS]   |
| @            | [SEP]   |
| [UNK]        | [SEP]   |
| Hoping       | [SEP]   |
| unwired      | [SEP]   |
| Gaga         | [SEP]   |
| practically  | [SEP]   |
| csellmybelle | [SEP]   |
| wants        | [SEP]   |
| [SEP]        | [SEP]   |
+--------------+---------+


При увеличении количества слоев, модель становится тяжелее и ей нужно будет больше эпох, чтобы обучиться. Но скорее всего она обучится до лучших результатов.

In [466]:
for dropout in dropout_list:
    print(f'dropout = {dropout}')
    
    model = Autoencoder(vocab_dim=len(word2idx), emb_dim=128, latent_dim=128, hidden_dim=128, dropout=dropout)
    model.to(device)
    
    writer = SummaryWriter(log_dir=f'tensorboard_3/dropout_{dropout}')
    call = callback(writer, dataset_test_pt, loss_function)
    
    check(64, dataset_test_pt, model, loss_function, idx2word)
    trainer(count_of_epoch=1, 
            batch_size=64, 
            dataset=dataset_train_pt,
            model=model, 
            loss_function=loss_function,
            optimizer = optimizer,
            lr=0.001,
            callback = call)
    check(64, dataset_test_pt, model, loss_function, idx2word)

dropout = 0
loss: 11.63650400345265, acc: 0.0
+-------------+---------------+
| Word        | Predict       |
+-------------+---------------+
| [CLS]       | reedalexander |
| @           | reedalexander |
| pepsicans   | reedalexander |
| promogeorge | reedalexander |
| old         | newLeaks      |
| had         | newLeaks      |
| are         | newLeaks      |
| media       | newLeaks      |
| m           | newLeaks      |
| [SEP]       | newLeaks      |
+-------------+---------------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=7.339432239532471, val_loss=7.270501530453864, val_acc=0.08333333333333333
		step=600, train_loss=6.399893283843994, val_loss=6.454840982551873, val_acc=0.08333333333333333
		step=900, train_loss=6.462089538574219, val_loss=6.40022532708226, val_acc=0.08333333333333333
		step=1200, train_loss=6.333353519439697, val_loss=6.3793331764523105, val_acc=0.08333333333333333
		step=1500, train_loss=6.421374797821045, val_loss=6.342741963204079, val_acc=0.08333333333333333
loss: 6.3379026471246505, acc: 0.08333333333333333
+----------+---------+
| Word     | Predict |
+----------+---------+
| [CLS]    | [SEP]   |
| @        | [SEP]   |
| Sdh      | [SEP]   |
| hiks     | [SEP]   |
| hendy    | [SEP]   |
| HOMETOWN | [SEP]   |
| hopefull | [SEP]   |
| use      | [SEP]   |
| on       | [SEP]   |
| Duty     | [SEP]   |
| hendy    | [SEP]   |
| [SEP]    | [SEP]   |
+----------+---------+
dropout = 0.3
loss: 11.625426612593483, acc: 0.0
+-----------+---------+
| Word      | Pr

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=7.194221496582031, val_loss=7.211782546216114, val_acc=0.08333333333333333
		step=600, train_loss=6.362119197845459, val_loss=6.4310295197197895, val_acc=0.08333333333333333
		step=900, train_loss=6.431772232055664, val_loss=6.388828406859685, val_acc=0.08333333333333333
		step=1200, train_loss=6.312370777130127, val_loss=6.354995869973905, val_acc=0.08333333333333333
		step=1500, train_loss=6.258993625640869, val_loss=6.32839735018041, val_acc=0.16666666666666666
loss: 6.3249012066147365, acc: 0.16666666666666666
+--------+---------+
| Word   | Predict |
+--------+---------+
| [CLS]  | [CLS]   |
| @      | [SEP]   |
| awl    | [SEP]   |
| louise | [SEP]   |
| final  | [SEP]   |
| f      | [SEP]   |
| had    | [SEP]   |
| are    | [SEP]   |
| keep   | [SEP]   |
| m      | [SEP]   |
| I      | [SEP]   |
| [SEP]  | [SEP]   |
+--------+---------+
dropout = 0.5
loss: 11.638147595473905, acc: 0.0
+----------+------------+
| Word     | Predict    |
+----------+--------

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=7.387423992156982, val_loss=7.238711191627499, val_acc=0.03973857773522685
		step=600, train_loss=6.224098205566406, val_loss=6.445171303168541, val_acc=0.08333333333333333
		step=900, train_loss=6.300529956817627, val_loss=6.395610649239147, val_acc=0.08333333333333333
		step=1200, train_loss=6.27586030960083, val_loss=6.371978462950282, val_acc=0.08333333333333333
		step=1500, train_loss=6.527946949005127, val_loss=6.3252047692757385, val_acc=0.16666666666666666
loss: 6.322092158687501, acc: 0.16666666666666666
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | [CLS]   |
| @       | [SEP]   |
| [UNK]   | [SEP]   |
| Secrets | [SEP]   |
| what    | [SEP]   |
| -       | [SEP]   |
| [SEP]   | [SEP]   |
+---------+---------+


Как и ожидалось, Dropout не улучшает качество обучения модели - он нужен для регуляризации.

In [467]:
for batch_norm in batch_norm_list:
    print(f'batch_norm = {batch_norm}')
    
    model = Autoencoder(vocab_dim=len(word2idx), emb_dim=128, latent_dim=128, hidden_dim=128, batch_norm=batch_norm)
    model.to(device)
    
    writer = SummaryWriter(log_dir=f'tensorboard_3/batch_norm_{batch_norm}')
    call = callback(writer, dataset_test_pt, loss_function)
    
    check(64, dataset_test_pt, model, loss_function, idx2word)
    trainer(count_of_epoch=1, 
            batch_size=64, 
            dataset=dataset_train_pt,
            model=model, 
            loss_function=loss_function,
            optimizer = optimizer,
            lr=0.001,
            callback = call)
    check(64, dataset_test_pt, model, loss_function, idx2word)

batch_norm = False
loss: 11.592061482416264, acc: 0.0
+------------+----------+
| Word       | Predict  |
+------------+----------+
| [CLS]      | hostages |
| sell       | hostages |
| better     | jsyk     |
| vegetarian | juror    |
| s          | juror    |
| better     | juror    |
| Paso       | juror    |
| hurting    | juror    |
| bff        | juror    |
| put        | juror    |
| -          | juror    |
| [SEP]      | juror    |
+------------+----------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=7.217648029327393, val_loss=7.180011480452247, val_acc=0.08333333333333333
		step=600, train_loss=6.3778395652771, val_loss=6.430727429950715, val_acc=0.08333333333333333
		step=900, train_loss=6.323089599609375, val_loss=6.384221692432984, val_acc=0.08333333333333333
		step=1200, train_loss=6.261751651763916, val_loss=6.362088769213024, val_acc=0.08333333333333333
		step=1500, train_loss=6.411561489105225, val_loss=6.334267057184779, val_acc=0.16666666666666666
loss: 6.329276874530671, acc: 0.16666666666666666
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | [CLS]   |
| @       | [SEP]   |
| CE      | [SEP]   |
| gone    | [SEP]   |
| nap     | [SEP]   |
| 2       | [SEP]   |
| life    | [SEP]   |
| journal | [SEP]   |
| I       | [SEP]   |
| WE      | [SEP]   |
| ,       | [SEP]   |
| [SEP]   | [SEP]   |
+---------+---------+
batch_norm = True
loss: 11.585652138031477, acc: 0.0
+------------+-------------+
| Word       | Predict    

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=7.847625255584717, val_loss=7.751521928850232, val_acc=0.27943659911706825
		step=600, train_loss=5.582111835479736, val_loss=5.552311638125548, val_acc=0.30870365937982025
		step=900, train_loss=5.070997714996338, val_loss=5.287894881549627, val_acc=0.3286228126163502
		step=1200, train_loss=5.294387340545654, val_loss=5.210344064857698, val_acc=0.3330740386149673
		step=1500, train_loss=4.986623764038086, val_loss=5.155622458028056, val_acc=0.3425515930003723
loss: 5.116979391738067, acc: 0.3371296739535131
+------------+---------+
| Word       | Predict |
+------------+---------+
| [CLS]      | [CLS]   |
| I          | @       |
| random     | I       |
| jessv      | ,       |
| studying   | -       |
| Elliptical | -       |
| german     | -       |
| this       | ,       |
| will       | ,       |
| ran        | ,       |
| on         | wants   |
| [SEP]      | [SEP]   |
+------------+---------+


Добавление BatchNorm сильно улучшило качество модели, что ожидаемо.

In [477]:
for trashhold in [1, 2, 3]:
    print(f'trashhold = {trashhold}')
    
    word2idx, idx2word = word_dict(dataset_train, trashhold=trashhold)
    tokenizer = Tokenizer(word2idx, RegexpTokenizer('[a-zA-Z]+|[^\w\s]|\d+'))
    train_data_sent = tokenizer(dataset_train.values[:, 1])
    test_data_sent = tokenizer(dataset_test.values[:, 1])
    dataset_train_pt = TensorDataset(train_data_sent, train_data_sent)
    dataset_test_pt = TensorDataset(test_data_sent, test_data_sent)
    
    model = Autoencoder(vocab_dim=len(word2idx), emb_dim=128, latent_dim=128, hidden_dim=128,)
    model.to(device)
    
    writer = SummaryWriter(log_dir=f'tensorboard_3/trashhold_{trashhold}')
    call = callback(writer, dataset_test_pt, loss_function)
    
    check(64, dataset_test_pt, model, loss_function, idx2word)
    trainer(count_of_epoch=1, 
            batch_size=64, 
            dataset=dataset_train_pt,
            model=model, 
            loss_function=loss_function,
            optimizer = optimizer,
            lr=0.001,
            callback = call)
    check(64, dataset_test_pt, model, loss_function, idx2word)

trashhold = 1


  0%|          | 0/99932 [00:00<?, ?it/s]

loss: 11.641769719005906, acc: 0.0
+------------+-----------+
| Word       | Predict   |
+------------+-----------+
| [CLS]      | Dominos   |
| @          | Dominos   |
| ailynonyou | Dominos   |
| i          | Dominos   |
| cant       | hotpocket |
| sleep      | hotpocket |
| !          | hotpocket |
| keep       | pined     |
| me         | pined     |
| company    | pined     |
| [UNK]      | pined     |
| [SEP]      | pined     |
+------------+-----------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=7.452328205108643, val_loss=7.32226697097883, val_acc=0.08333333333333333
		step=600, train_loss=6.661736965179443, val_loss=6.5412748315335465, val_acc=0.0
		step=900, train_loss=6.441305160522461, val_loss=6.495719717548322, val_acc=0.08333333333333333
		step=1200, train_loss=6.388115406036377, val_loss=6.457355269941394, val_acc=0.16666666666666666
		step=1500, train_loss=6.325118541717529, val_loss=6.42464259193144, val_acc=0.16666666666666666
loss: 6.419886765343846, acc: 0.16666666666666666
+--------+---------+
| Word   | Predict |
+--------+---------+
| [CLS]  | [CLS]   |
| I      | [SEP]   |
| '      | [SEP]   |
| m      | [SEP]   |
| soooo  | [SEP]   |
| bored  | [SEP]   |
| .      | [SEP]   |
| just   | [SEP]   |
| fuckin | [SEP]   |
| got    | [SEP]   |
| a      | [SEP]   |
| [SEP]  | [SEP]   |
+--------+---------+
trashhold = 2


  0%|          | 0/99932 [00:00<?, ?it/s]

loss: 10.418727183733829, acc: 2.327003882772193e-05
+---------+---------+
| Word    | Predict |
+---------+---------+
| [CLS]   | Danny   |
| Lol     | Danny   |
| no      | Danny   |
| problem | Danny   |
| !       | Danny   |
| K       | Danny   |
| talk    | Danny   |
| 2       | Danny   |
| u       | Danny   |
| manana  | Danny   |
| !       | Danny   |
| [SEP]   | Danny   |
+---------+---------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=6.464632034301758, val_loss=6.348317198649117, val_acc=0.08333333333333333
		step=600, train_loss=5.85308837890625, val_loss=5.821488175377694, val_acc=0.08333333333333333
		step=900, train_loss=6.065928936004639, val_loss=5.778623581769574, val_acc=0.08333333333333333
		step=1200, train_loss=5.964488983154297, val_loss=5.755351783480963, val_acc=0.08333333333333333
		step=1500, train_loss=5.809302806854248, val_loss=5.728145563071741, val_acc=0.16666666666666666
loss: 5.723182030646296, acc: 0.16666666666666666
+-------------+---------+
| Word        | Predict |
+-------------+---------+
| [CLS]       | [CLS]   |
| @           | [SEP]   |
| clarasdiary | [SEP]   |
| i           | [SEP]   |
| live        | [SEP]   |
| in          | [SEP]   |
| #           | [SEP]   |
| [UNK]       | [SEP]   |
| i           | [SEP]   |
| know        | [SEP]   |
| .           | [SEP]   |
| [SEP]       | [SEP]   |
+-------------+---------+
trashhold = 3


  0%|          | 0/99932 [00:00<?, ?it/s]

loss: 10.000449587211895, acc: 0.0
+-------+---------+
| Word  | Predict |
+-------+---------+
| [CLS] | Sue     |
| ND    | Sue     |
| TO    | kaffy   |
| GET   | kaffy   |
| READY | kaffy   |
| FOR   | kaffy   |
| WORK  | kaffy   |
| [SEP] | kaffy   |
+-------+---------+


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1562 [00:00<?, ?it/s]

		step=300, train_loss=6.173194408416748, val_loss=6.167616246818298, val_acc=0.08333333333333333
		step=600, train_loss=5.7625203132629395, val_loss=5.714588714935609, val_acc=0.08333333333333333
		step=900, train_loss=5.7790608406066895, val_loss=5.67896885177213, val_acc=0.08333333333333333
		step=1200, train_loss=5.850861072540283, val_loss=5.662266826979639, val_acc=0.08333333333333333
		step=1500, train_loss=5.795910358428955, val_loss=5.639554495329832, val_acc=0.08333333333333333
loss: 5.6348826999734305, acc: 0.08333333333333333
+-------+---------+
| Word  | Predict |
+-------+---------+
| [CLS] | [SEP]   |
| @     | [SEP]   |
| [UNK] | [SEP]   |
| [UNK] | [SEP]   |
| what  | [SEP]   |
| up    | [SEP]   |
| I     | [SEP]   |
| leave | [SEP]   |
| July  | [SEP]   |
| 7     | [SEP]   |
| th    | [SEP]   |
| [SEP] | [SEP]   |
+-------+---------+


При уменьшении размера словаря модель лучше улавливает ключевые слова.

Были проведены эксперименты над автоинкодером с подбором параметров.