In [2]:
! pip install datasets
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch

from datasets import load_dataset
from nltk.tokenize import ToktokTokenizer
from sklearn.metrics import f1_score
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader


In [47]:
from IPython.core.display import TextDisplayObject


In [18]:
from sklearn.metrics import f1_score


In [3]:
!wget  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
!unzip  "wiki-news-300d-1M.vec.zip" -d "."

--2022-12-28 22:33:15--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2022-12-28 22:33:34 (35.6 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]

Archive:  wiki-news-300d-1M.vec.zip
  inflating: ./wiki-news-300d-1M.vec  


In [4]:
def load_embeddings(file_path, pad_token='PAD', unk_token='UNK', max_words=100_000, verbose=True):
    
    vocab = dict()
    embeddings = list()

    with open(file_path) as file_object:

        vocab_size, embedding_dim = file_object.readline().strip().split()

        vocab_size = int(vocab_size)
        embedding_dim = int(embedding_dim)

        # в файле 1 000 000 слов с векторами, давайте ограничим для простоты этот словарь
        max_words = vocab_size if max_words <= 0 else max_words
        vocab: Dict[str, int] = dict()
        # добавим пад токен и эмбеддинг в нашу матрицу эмбеддингов и словарь
        vocab[pad_token] = 0
        # добавим унк токен и эмбеддинг в нашу матрицу эмбеддингов и словарь
        vocab[unk_token] = 1
        
        embeddings = np.zeros((max_words, embedding_dim))
        embeddings[1,:] = (np.ones(embedding_dim))

        progress_bar = tqdm(total=max_words, disable=not verbose, desc='Reading embeddings file')

        for line in file_object:
            parts = line.strip().split()

            token = ' '.join(parts[:-embedding_dim]).lower()

            if token in vocab:
                continue

            vocab[token] = len(vocab)

            embeddings[vocab[token], :] = np.array(list(map(float, parts[-embedding_dim:])))

            progress_bar.update()

            if len(vocab) == max_words:
                break

        progress_bar.close()

    assert(len(vocab) == embeddings.shape[0])

    #embeddings = np.stack(embeddings)
    
    return vocab, embeddings

In [5]:
vocab, embeddings = load_embeddings('wiki-news-300d-1M.vec', max_words=100_000)

Reading embeddings file: 100%|█████████▉| 99998/100000 [00:08<00:00, 11246.45it/s]


In [6]:
dataset_path = "tweet_eval"
dataset_name = "sentiment"

train_dataset = load_dataset(path=dataset_path, name=dataset_name, split="train")
valid_dataset = load_dataset(path=dataset_path, name=dataset_name, split="validation")
test_dataset = load_dataset(path=dataset_path, name=dataset_name, split="test")

Downloading builder script:   0%|          | 0.00/9.72k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/30.4k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/21.8k [00:00<?, ?B/s]

Downloading and preparing dataset tweet_eval/sentiment to /root/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343...


Downloading data files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/527k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/99.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/629 [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/6 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/45615 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12284 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset tweet_eval downloaded and prepared to /root/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343. Subsequent calls will reuse this data.




In [7]:
def empty_collate(batch):
    return batch

In [8]:
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=empty_collate)
valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=True, collate_fn=empty_collate)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=True, collate_fn=empty_collate)

In [9]:
class Tokenizer:
    
    def __init__(self, base_tokenizer, token2index, pad_token, unk_token, max_length):
        
        self._base_tokenizer = base_tokenizer  
        self.token2index = token2index 
        self.pad_token = pad_token
        self.pad_index = self.token2index[self.pad_token]    
        self.unk_token = unk_token
        self.unk_index = self.token2index[self.unk_token]
        
        self.max_length = max_length

    def tokenize(self, text):
        """
        В этом методе нужно разделить строку текста на токены
        """
        return self._base_tokenizer.tokenize(text)
    
    def indexing(self, tokenized_text):
        """
        В этом методе нужно перевести список токенов в список с индексами этих токенов
        """
        tokens_indices = []
        for t in tokenized_text:
          if t in self.token2index:
            tokens_indices.append(self.token2index[t])
          else:
            tokens_indices.append(self.unk_index)

        return tokens_indices
        
    def padding(self, tokens_indices):
        """
        В этом методе нужно сделать длину tokens_indices равной self.max_length
        Опционально убрать повторяющиеся unk'и
        """
        if len(tokens_indices) > self.max_length:
          return tokens_indices[:self.max_length]
        elif len(tokens_indices) < self.max_length:
          return tokens_indices + [self.pad_index for i in range(self.max_length - len(tokens_indices))]
        
    
    def __call__(self, text):
        """
        В этом методе нужно перевести строку с текстом в вектор с индексами слов нужно размера (self.max_length)
        """
        return self.padding(self.indexing(self.tokenize(text)))
        
    def collate(self, batch):
        
        tokenized_texts = list()
        labels = list()
        
        for sample in batch:
            tokenized_texts.append(Tokenizer(base_tokenizer=self._base_tokenizer, token2index=self.token2index, pad_token=self.pad_token, unk_token=self.unk_token, max_length=self.max_length)(sample['text']))
            labels.append(sample['label'])
            
        tokenized_texts = torch.Tensor(tokenized_texts).to(torch.int64)  # перевод в torch.Tensor
        labels = torch.Tensor(labels).to(torch.int64)  # перевод в torch.Tensor
        

        return tokenized_texts, labels

In [10]:
tokenizer = Tokenizer(base_tokenizer=ToktokTokenizer(), token2index=vocab, unk_token='UNK', pad_token='PAD', max_length=64)

In [11]:
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=tokenizer.collate)
valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False, collate_fn=tokenizer.collate)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=tokenizer.collate)


In [12]:
for x, y in train_loader:
    break

In [44]:
class DeepAverageNetwork(nn.Module):
    
    def __init__(self,
                 n_embed: int,
                 d_embed: int,
                 d_hidden: int,
                 d_out: int,
                 dp: float,
                 embed_weight:np.array):
        super(DeepAverageNetwork, self).__init__()


        self.embed = nn.Embedding(n_embed, d_embed)

 
        self.dropout1 = nn.Dropout(dp)

        self.bn1 = nn.BatchNorm1d(d_embed)

        self.fc1 = nn.Linear(d_embed, d_hidden)

        self.non_linear_function = torch.nn.ReLU()




        self.dropout2 = nn.Dropout(dp)

        self.bn2 = nn.BatchNorm1d(d_hidden)

        self.fc2 = nn.Linear(d_hidden, d_out)

        self.non_linear_function = torch.nn.ReLU()




        self.dropout3 = nn.Dropout(dp)

        self.bn2 = nn.BatchNorm1d(d_hidden)


    def forward(self, x):

        x = self.embed(x)

        x = x.mean(dim=1)

        x = self.dropout1(x)

        x = self.bn1(x)

        x = self.fc1(x)
        
        x = self.non_linear_function(x)


        x = self.dropout2(x)

        x = self.bn2(x)

        x = self.fc2(x)

        return x

In [45]:
model = DeepAverageNetwork(n_embed=100_000, 
                           d_embed=len(embeddings[0]),
                           embed_weight= embeddings,
                           d_hidden=128, 
                           d_out=3, 
                           dp=0.22, 
                           )

In [28]:
def train(model, iterator, optimizer, criterion):
    
    losses = list()
    f1 = list()
    
    model.train()  

    for x, y in tqdm(iterator, total=len(iterator)):
        
        optimizer.zero_grad()   
        predictions = model(x.to(device)).squeeze() 
        loss = criterion(predictions, y.to(device))        
               
        
        loss.backward()
        optimizer.step()     
        preds = predictions.detach().cpu().numpy().argmax(1).tolist()
        y_true = y.tolist()
        
        losses.append(loss.item())
        f1.append(f1_score(y_true, preds, average="weighted"))
        
    return np.mean(f1), np.mean(losses)

def evaluate(model, iterator, criterion):
    
    losses = list()
    f1 = list()
    
    model.eval()  
    with torch.no_grad():
        for x, y in tqdm(iterator, total=len(iterator)):   

            predictions = model(x.to(device)).squeeze()  
            loss = criterion(predictions, y.to(device))        
            preds = predictions.detach().cpu().numpy().argmax(1).tolist()
            y_true = y.tolist()
            

            losses.append(loss.item())  
            f1.append(f1_score(y_true, preds, average="weighted"))
            
    return np.mean(f1), np.mean(losses) 

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = torch.nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [None]:
Epochs = 10

losses = list()
loss_on_eval = list()

f1 = list()
f1_on_eval = list()

for n_epoch in range(Epochs):
    print(n_epoch)
    f1_train, loss_train = train(model=model, iterator=train_loader, optimizer=optimizer, criterion=criterion)
    f1.append(f1_train)
    losses.append(loss_train)
    f1_valid, loss_valid = evaluate(model=model, iterator=valid_loader, criterion=criterion)
    f1_on_eval.append(f1_valid)
    loss_on_eval.append(loss_valid)

# Loss_on_train:1.2103326778118015, F1_train:0.3433361661137654,
# Loss_on_valid:1.125163808465004, F1_valid: 0.30638698333971887

# **Результат довольно неплоохй. Метрика f1 weighted подошла хорошо, так как она нужна, если существует дисбаланс классов. Это как раз наш случай.**