In [1]:
import numpy as np
import pandas as pd
import csv
import re

import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm

In [2]:
def preprocess(text):
    text = text.lower()
    text = ''.join(re.findall(r'[ а-я]', text))
    return text.split()

In [25]:
import urllib.request
from bs4 import BeautifulSoup

def get_page(url):
    page = urllib.request.urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(page, 'html.parser')
    return soup

def parse_2ch():
    posts = get_page('https://2ch.hk/b/').select('.post_type_oppost a')
    
    links = set()
    for post in posts:
        idx = post.get('id')
        if isinstance(idx, str):
            links.add(idx)
            
    messages = []
    for idx in tqdm(links):
        print(idx)
        raw_messages = get_page(f'https://2ch.hk/b/res/{idx}.html').select('.post__message')
        for message in raw_messages:
            message = preprocess(message.text)
            if len(message) > 10:
                messages.append(message)
    
    return messages

dwach = parse_2ch()

  5%|▍         | 1/22 [00:00<00:02,  7.56it/s]

206076403
206062060


  9%|▉         | 2/22 [00:01<00:08,  2.27it/s]

206073010


 18%|█▊        | 4/22 [00:01<00:05,  3.17it/s]

206076829
206076655


 23%|██▎       | 5/22 [00:01<00:04,  3.72it/s]

206050137


 27%|██▋       | 6/22 [00:02<00:07,  2.22it/s]

206064673


 32%|███▏      | 7/22 [00:03<00:06,  2.16it/s]

206076211


 36%|███▋      | 8/22 [00:08<00:26,  1.87s/it]

206071706


 45%|████▌     | 10/22 [00:09<00:13,  1.11s/it]

206075749
206050630


 55%|█████▍    | 12/22 [00:10<00:08,  1.19it/s]

206076298
206068597


 59%|█████▉    | 13/22 [00:10<00:05,  1.55it/s]

206069753


 64%|██████▎   | 14/22 [00:15<00:13,  1.72s/it]

206065715


 68%|██████▊   | 15/22 [00:15<00:09,  1.31s/it]

206071176


 73%|███████▎  | 16/22 [00:15<00:06,  1.09s/it]

206062608


 82%|████████▏ | 18/22 [00:17<00:03,  1.22it/s]

206076478
206062712


 86%|████████▋ | 19/22 [00:18<00:02,  1.25it/s]

206071000


 91%|█████████ | 20/22 [00:18<00:01,  1.54it/s]

206063953


100%|██████████| 22/22 [00:18<00:00,  1.16it/s]

206073682





In [26]:
bibl = []

with open('bibl.txt', 'r', encoding="windows-1251") as file:
    for line in file:
        line = preprocess(line)
        if len(line) > 10:
            bibl.append(line)

In [41]:
tolstoy = []

with open('tolst.txt', 'r', encoding="windows-1251") as file:
    for line in file:
        line = preprocess(line)
        if len(line) > 10:
            tolstoy.append(line)
tolstoy=tolstoy[:4000]

In [28]:
class Vocab:
    def __init__(self, corpora, tokenizer, max_len=20, vocab_size=5000):
        self.tokenizer = tokenizer
        self.max_len = max_len
        
        counts = {}
        
        for sentence in corpora:
            for token in sentence:
                counts[token] = counts.get(token, 0) + 1
        
        l = sorted(counts.items(), key=lambda x: -x[1])[:vocab_size-2]
        
        print('least used token:', l[-1])
        print('vocab size:', len(l), '(+2)')
        
        self.t2i = {"<pad>" : 0, "<unk>" : 1}
        self.i2t = {0 : "<pad>", 1 : "<unk>"}
              
        for token, _ in l:
            self.i2t[len(self.i2t)] = token
            self.t2i[token] = len(self.t2i)
    
    def tokenize(self, sentence, pad=True):
        if pad:
            sentence = sentence[:self.max_len]
            while len(sentence) < self.max_len:
                sentence.append('<pad>')
        
        indices = []
        for token in sentence:
            if token in self.t2i:
                indices.append(self.t2i[token])
            else:
                indices.append(1)
        
        return indices

In [42]:
# me: mom can we have oversampling?
# mom: but we have oversampling at home
# oversampling at home:
dataset_a = dwach+dwach+dwach+dwach
dataset_b = tolstoy
dataset_c = bibl

print(len(dataset_a))
print(len(dataset_b))
print(len(dataset_c))

vocab = Vocab(dataset_a + dataset_b + dataset_c, preprocess)

4392
4000
30733
least used token: ('худого', 19)
vocab size: 4998 (+2)


In [43]:
class StyleDataset(Dataset):
    def __init__(self, corpora, vocab):
        self.vocab = vocab
        self.samples = []
        self.labels = []
        
        for i, dataset in enumerate(corpora):
            self.samples += dataset
            self.labels += [i] * len(dataset)
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        X = self.vocab.tokenize(self.samples[idx])
        y = [self.labels[idx]] * len(X)
        return torch.LongTensor(X), torch.LongTensor(y)

In [44]:
dataset = StyleDataset([dataset_a, dataset_b, dataset_c], vocab)

# todo: this should be done before oversampling
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [len(dataset)-512, 512])

train = DataLoader(train_dataset, batch_size=64, shuffle=True)
test = DataLoader(test_dataset, batch_size=64)

dataset[5]

(tensor([  13,  400, 1929,    1, 1540, 1688,   10,  829,   13, 1431, 1861,  862,
         3725,   15,   13, 1540,    0,    0,    0,    0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [45]:
def fetch_embeddings(embeddings, vocab, embedding_dim=300):
    weights = torch.randn(len(vocab.t2i), embedding_dim) / 10
    
    print('reading file', embeddings)
    print('this may take a while...')
    
    with open(embeddings) as file:
        for line in file:
            data = line.split()
            if len(data) == embedding_dim + 1:
                token = data[0]
                vector = torch.Tensor([float(x) for x in data[1:]])
                if token in vocab.t2i:
                    weights[vocab.t2i[token]] = vector
    
    return weights


class StyleClassifier(nn.Module):
    def __init__(
        self,
        embeddings,
        embedding_dim=300,
        num_classes=3,
        hidden_dim=50,
        num_layers=1,
        rnn_dropout=0,
        bidirectional=False):
        
        super().__init__()
        
        self.embed = nn.Embedding.from_pretrained(embeddings)
        
        self.rnn = nn.GRU(
            embedding_dim,
            hidden_dim, 
            num_layers=num_layers, 
            bidirectional=bidirectional, 
            dropout=rnn_dropout,
            batch_first=True
        )
        
        self.head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, num_classes),
            nn.LogSoftmax(dim=2)
        )
        
    def forward(self, X):
        X = self.embed(X)
        X, _ = self.rnn(X)
        X = self.head(X)
        return X

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ru.vec

In [46]:
!touch empty.vec

In [47]:
embeddings = fetch_embeddings('wiki.ru.vec', vocab, embedding_dim=300)
model = StyleClassifier(embeddings)

reading file wiki.ru.vec
this may take a while...


In [48]:
test_batch = dataset[0][0].view(1, -1)
model(test_batch).exp()

tensor([[[0.3150, 0.3388, 0.3462],
         [0.2650, 0.3187, 0.4163],
         [0.2723, 0.3418, 0.3859],
         [0.2940, 0.3608, 0.3452],
         [0.2852, 0.3137, 0.4011],
         [0.3079, 0.3679, 0.3241],
         [0.2975, 0.2985, 0.4040],
         [0.2827, 0.3084, 0.4089],
         [0.3227, 0.3279, 0.3494],
         [0.2923, 0.3249, 0.3828],
         [0.2706, 0.2990, 0.4303],
         [0.3042, 0.3463, 0.3494],
         [0.2902, 0.3347, 0.3751],
         [0.3122, 0.2991, 0.3887],
         [0.3190, 0.3401, 0.3408],
         [0.2953, 0.3212, 0.3835],
         [0.2925, 0.3257, 0.3818],
         [0.2706, 0.3565, 0.3729],
         [0.2758, 0.3369, 0.3873],
         [0.2738, 0.3294, 0.3968]]], grad_fn=<ExpBackward>)

In [None]:
lr = 5e-2
num_epochs = 10
device = torch.device('cpu')

#model = StyleClassifier(len(vocab.i2t), 'fasttext.vec')
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.NLLLoss()

In [None]:
train_losses = []
test_losses = []

for epoch in range(num_epochs):
    
    for X, y in tqdm(train):
        X.to(device)
        y.to(device)
        
        optimizer.zero_grad()
        
        preds = model(X)
        
        loss = criterion(preds.view(-1, 3), y.view(-1))
        
        loss.backward()
        optimizer.step()
        
        train_losses.append(loss.item())
    
    test_loss = 0
    for X, y in tqdm(test):
        X.to(device)
        y.to(device)
        
        preds = model(X)
        loss = criterion(preds.view(-1, 3), y.view(-1))
        
        test_loss += loss.item()
    
    test_losses.append(test_loss)
    print('test loss:', test_loss)

100%|██████████| 604/604 [00:14<00:00, 42.49it/s]
100%|██████████| 8/8 [00:00<00:00, 102.40it/s]
  1%|          | 5/604 [00:00<00:14, 41.86it/s]

test loss: 3.1913817822933197


100%|██████████| 604/604 [00:14<00:00, 42.27it/s]
100%|██████████| 8/8 [00:00<00:00, 103.31it/s]
  1%|          | 5/604 [00:00<00:12, 48.16it/s]

test loss: 3.2933852672576904


100%|██████████| 604/604 [00:12<00:00, 49.20it/s]
100%|██████████| 8/8 [00:00<00:00, 115.95it/s]
  1%|          | 5/604 [00:00<00:12, 47.82it/s]

test loss: 3.126163125038147


100%|██████████| 604/604 [00:12<00:00, 48.49it/s]
100%|██████████| 8/8 [00:00<00:00, 117.16it/s]
  1%|          | 5/604 [00:00<00:12, 47.93it/s]

test loss: 3.1038631349802017


100%|██████████| 604/604 [00:14<00:00, 41.22it/s]
100%|██████████| 8/8 [00:00<00:00, 86.21it/s]
  1%|          | 4/604 [00:00<00:16, 36.74it/s]

test loss: 3.0791245698928833


100%|██████████| 604/604 [00:13<00:00, 46.44it/s]
100%|██████████| 8/8 [00:00<00:00, 113.21it/s]
  1%|          | 5/604 [00:00<00:12, 46.48it/s]

test loss: 3.259096711874008


100%|██████████| 604/604 [00:15<00:00, 40.09it/s]
100%|██████████| 8/8 [00:00<00:00, 86.79it/s]
  1%|          | 4/604 [00:00<00:16, 35.36it/s]

test loss: 3.1410126984119415


 77%|███████▋  | 468/604 [00:14<00:03, 35.55it/s]

In [None]:
plt.plot(train_losses)
plt.show()

plt.plot(test_losses)
plt.show()

In [None]:
from IPython.core.display import display, HTML

def print_colored(sequence, intensities, delimeter=' '):
    html = delimeter.join([
        # https://en.wikipedia.org/wiki/Subtractive_color
        f'<span style="background: rgb({255*(1-x[1]-x[2])}, {255*(1-x[0]-x[2])}, {255*(1-x[0]-x[1])})">{c}</span>'
        for c, x in zip(sequence, intensities) 
    ])
    display(HTML(html))

In [None]:
samples = [
    'Сап б, есть одна тян. Двачую однаапррвавапвпв анон',
    'князь болконский с удивлением увидел',
    'засмеялся проиграл тред не нашел создал',
    'хахаха ну ты даешь',
    'Сап двач, есть однаапррвавапвпв хэштег тян, она как будто со вниманием слушала рассказ князя Василья',
    ' '.join(dataset_a[111]), # Двач, должен быть красным
    ' '.join(dataset_b[111]), # bible, должна быть зелёной
    ' '.join(dataset_c[333]), # Tolstoy, должен быть синим
    ' '.join(dataset_a[6] + dataset_b[6] + dataset_c[6])
]

model = model.cpu()

t = 1 # температура, для лучшей визуализации

for sample in samples:
    sentence = preprocess(sample)
    X = torch.LongTensor(vocab.tokenize(sample, pad=False)).view(-1, 1)
    scores = model(X).view(-1, 3).mul(t).softmax(dim=1)
    maxes, _ = torch.max(scores, dim=1)
    scores -= ((1-maxes)/2).view(-1, 1) # так будет ровно один "полный" цвет
    #print(scores)
    print_colored(sentence, scores)