In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import numpy as np
from string import punctuation
from gensim.models import Word2Vec, KeyedVectors
from google.colab import drive
from tqdm.notebook import tqdm
from collections import Counter
import pandas as pd

# Mount Google Drive at /content/gdrive so the CSV files read below are reachable;
# force_remount=True asks Colab to mount again even when a mount is already present.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
class W2VDataset(Dataset):
    """Dataset yielding ``(token ids, feature vector, class id, raw text)`` per sample.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the text column named by ``text`` and a numeric ``'Class'`` column.
    word2id : dict
        Maps a token to its integer id. Id 0 is reserved for padding.
    DEVICE : str or torch.device
        Stored as ``self.device``; this class does not move any tensor itself.
    features : indexable
        ``features[i]`` is the numeric feature vector of sample ``i``.
    text : str
        Name of the text column (default ``'Text'``).
    """

    # Id used both for batch padding and as a stand-in for samples without known tokens.
    PAD_ID = 0

    def __init__(self, dataset, word2id, DEVICE, features, text='Text'):
        self.dataset = dataset[text].values
        self.word2id = word2id
        self.features = features
        self.length = dataset.shape[0]
        # Class ids are kept as integers from the start (no float round trip).
        self.target = torch.tensor(dataset['Class'].values, dtype=torch.long)
        self.device = DEVICE

    def __len__(self):
        """Number of samples (required by ``Dataset``)."""
        return self.length

    def __getitem__(self, index):
        """Return ``(ids, features, label, raw_text)`` for the sample at ``index``."""
        raw_text = self.dataset[index]
        known_ids = [
            self.word2id[token]
            for token in self.preprocess(raw_text)
            if token in self.word2id
        ]
        if not known_ids:
            # A sample with no in-vocabulary token would otherwise be a length-0
            # sequence; a batch made only of such samples collates to shape (B, 0).
            known_ids = [self.PAD_ID]
        ids = torch.tensor(known_ids, dtype=torch.long)
        return ids, self.features[index], self.target[index], raw_text

    def preprocess(self, text):
        """Lower-case, split on whitespace, strip edge punctuation, drop empty tokens."""
        stripped = (token.strip(punctuation) for token in text.lower().split())
        # After stripping, a token made only of punctuation is the empty string.
        return [token for token in stripped if token]

    def collate_fn(self, batch):
        """Merge samples into a batch for ``DataLoader``.

        Returns ``(padded_ids, features, labels, texts)`` where ``padded_ids`` is a
        batch-first LongTensor padded with ``PAD_ID``, ``features`` is float32,
        ``labels`` is a LongTensor and ``texts`` is a tuple of the raw strings.
        """
        ids, fs, y, texts = zip(*batch)
        padded_ids = pad_sequence(ids, batch_first=True, padding_value=self.PAD_ID)
        # Stack through one ndarray: building a tensor from a sequence of ndarrays is slow.
        features = torch.as_tensor(np.asarray(fs), dtype=torch.float32)
        labels = torch.stack([torch.as_tensor(label) for label in y]).long()
        return padded_ids, features, labels, texts

In [None]:
#train_dataset = pd.read_csv('/content/gdrive/MyDrive/data/train.csv')
#val_dataset = pd.read_csv('/content/gdrive/MyDrive/data/val.csv')

In [None]:
# Read the training and validation splits from the mounted Google Drive.
train_dataset = pd.read_csv('/content/gdrive/MyDrive/train.csv')
val_dataset = pd.read_csv('/content/gdrive/MyDrive/val.csv')

In [None]:
# Drop the rows labelled 'Human' from both splits.
# .copy() makes each result an independent frame: the 'Class' column is reassigned
# further down, and doing that on a filtered slice raises SettingWithCopyWarning.
train_dataset = train_dataset[train_dataset['Class'] != 'Human'].copy()
val_dataset = val_dataset[val_dataset['Class'] != 'Human'].copy()

In [None]:
def preprocess(text):
    """Tokenize ``text`` for vocabulary counting.

    The text is lower-cased and split on whitespace, and punctuation is stripped
    from both ends of every token. Tokens that are empty after stripping (they
    consisted only of punctuation) are dropped, which matches
    ``W2VDataset.preprocess`` and keeps ``''`` out of the vocabulary.

    Returns a list of non-empty token strings.
    """
    stripped = (token.strip(punctuation) for token in text.lower().split())
    return [token for token in stripped if token]

In [None]:
# Count token frequencies that the vocabulary is built from.
# NOTE(review): only the first 100 training rows are counted, which looks like a
# debugging leftover - confirm before widening to the full training set.
word_vocab = Counter()
for row_idx, row in train_dataset.iloc[:100].iterrows():
    # The text is read from column position 5 (presumably the text column - verify).
    # .iloc keeps the lookup explicitly positional: an integer key passed to [] on a
    # label-indexed Series is deprecated in recent pandas.
    word_vocab.update(preprocess(row.iloc[5]))

In [None]:
def create2id(vocab, threshold):
    """Build token <-> id lookup tables from a frequency mapping.

    Parameters
    ----------
    vocab : mapping of token -> count (for example a ``collections.Counter``)
    threshold : tokens are kept only when their count is strictly greater than this

    Returns
    -------
    (word2id, id2word) : two dicts. ``'PAD'`` always has id 0; the kept tokens get
    ids 1..N in sorted order, so the mapping is identical from run to run.
    """
    filtered_vocab = {word for word, count in vocab.items() if count > threshold}
    print(f'уникальных токенов, втретившихся больше {threshold} раз:', len(filtered_vocab))

    word2id = {'PAD': 0}
    # Sorting replaces the arbitrary set iteration order with a reproducible one.
    for word in sorted(filtered_vocab):
        # A corpus token spelled exactly 'PAD' must not take over the padding id.
        if word not in word2id:
            word2id[word] = len(word2id)
    id2word = {i: word for word, i in word2id.items()}
    return word2id, id2word

In [None]:
# Keep only the tokens counted more than 5 times and build the token <-> id tables.
word2id, id2word = create2id(word_vocab, 5)

уникальных токенов, втретившихся больше 5 раз: 51


In [None]:
# Fetch and unpack the pretrained Russian fastText vectors only when the unpacked
# file is missing, so re-running the notebook does not download 1.2 GB again or run
# gzip against an output file that already exists. wget -c resumes a partial download.
!test -f cc.ru.300.vec || (wget -c https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz && gzip -d cc.ru.300.vec.gz)

--2022-03-21 11:19:35--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 172.67.9.4, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1306357571 (1.2G) [binary/octet-stream]
Saving to: ‘cc.ru.300.vec.gz’


2022-03-21 11:20:34 (21.4 MB/s) - ‘cc.ru.300.vec.gz’ saved [1306357571/1306357571]



In [None]:
import numpy as np

In [None]:
# Build the embedding matrix for the vocabulary words that have a pretrained vector.
# word2index maps a word to its row in `vectors`; row 0 is the all-zero PAD vector.
word2index = {'PAD': 0}
vectors = []

# The context manager closes the file even if parsing fails part-way; the encoding
# is given explicitly instead of relying on the platform default.
with open('cc.ru.300.vec', encoding='utf-8') as word2vec_file:

    # Header line: "<number of words> <vector dimensionality>"
    n_words, embedding_dim = (int(value) for value in word2vec_file.readline().split())

    # Zero vector for PAD
    vectors.append(np.zeros((1, embedding_dim)))

    progress_bar = tqdm(desc='Read word2vec', total=n_words)

    for line in word2vec_file:
        progress_bar.update(1)

        current_parts = line.split()

        # Skip blank or truncated lines instead of stopping or breaking the matrix shape.
        if len(current_parts) < embedding_dim:
            continue

        # Everything before the last `embedding_dim` fields is the word itself
        # (it may contain spaces).
        current_word = ' '.join(current_parts[:-embedding_dim])

        # The second condition skips a word that already has a row (a literal 'PAD'
        # or a repeated entry). Re-assigning an existing key would append a vector
        # without growing word2index and shift every later row by one.
        if current_word in word2id and current_word not in word2index:

            word2index[current_word] = len(word2index)

            current_vectors = np.array([float(value) for value in current_parts[-embedding_dim:]])
            vectors.append(current_vectors[np.newaxis, :])

    progress_bar.close()

vectors = np.concatenate(vectors)

Read word2vec:   0%|          | 0/2000000 [00:00<?, ?it/s]

In [None]:
#vectors = KeyedVectors.load_word2vec_format('/content/gdrive/MyDrive/Копия model.bin', binary=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the class -> integer mapping from the training labels only, then apply the
# same mapping to both splits (the 'Class' columns are overwritten with the ids).
encoder = LabelEncoder().fit(train_dataset['Class'])
train_dataset['Class'] = encoder.transform(train_dataset['Class'])
val_dataset['Class'] = encoder.transform(val_dataset['Class'])

In [None]:
# Hand-crafted numeric features fed to the model next to the token ids.
# The list is defined once so the train and validation matrices cannot drift apart.
FEATURE_COLUMNS = [
    'len in symbols',
    'len in tokens',
    'vowels / symbols',
    'mean word length',
    'long words / all words',
    'small words',
    'Dale-Chall',
    'Gunning-Fog',
    'Flesch',
    'func_pos/all_words',
    'cont_pos/all_words',
    'noun_pos/all_words',
    'verb_pos/all_words',
    'noun_pos/verb_pos',
]

train_features = train_dataset[FEATURE_COLUMNS].to_numpy()
val_features = val_dataset[FEATURE_COLUMNS].to_numpy()

In [None]:
# Token ids must address rows of the pretrained `vectors` matrix, and that matrix is
# ordered by word2index (built while reading the .vec file), not by word2id.
# 'cpu' is only stored on the dataset; batches are moved to the model's device later.
train_data = W2VDataset(train_dataset, word2index, 'cpu', train_features)
val_data = W2VDataset(val_dataset, word2index, 'cpu', val_features)

# Training batches are drawn in random order, validation batches in file order.
train_sampler = RandomSampler(train_data)
train_iterator = DataLoader(train_data, collate_fn=train_data.collate_fn, sampler=train_sampler, batch_size=64)
val_sampler = SequentialSampler(val_data)
val_iterator = DataLoader(val_data, collate_fn=val_data.collate_fn, sampler=val_sampler, batch_size=64)

In [None]:
class LModel(nn.Module):
    """Bidirectional LSTM over token embeddings combined with an MLP over numeric features.

    Parameters
    ----------
    num_classes : int
        Number of output logits.
    weights : array-like of shape (n_rows, embedding_dim) or None
        Pretrained embedding vectors; row ``i`` must be the vector of token id ``i``.
        The first ``min(n_rows, vocab_size)`` rows are copied into the embedding
        table, and any remaining rows keep their random initialisation.
    vocab_size : int
        Number of rows in the embedding table.
    embedding_dim, lstm_size, num_layers, num_features : int
        Layer sizes; the defaults reproduce the previously hard-coded values.

    Raises
    ------
    ValueError
        If ``weights`` is not 2-D or its second dimension differs from ``embedding_dim``.
    """

    def __init__(self, num_classes, weights, vocab_size, embedding_dim=300,
                 lstm_size=150, num_layers=2, num_features=14):
        super(LModel, self).__init__()
        self.lstm_size = lstm_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if weights is not None:
            # nn.Embedding.from_pretrained is a classmethod that returns a new module.
            # Calling it on an instance and dropping the result loads nothing, so the
            # vectors are copied into this table explicitly (it stays trainable).
            pretrained = torch.as_tensor(weights, dtype=torch.float)
            if pretrained.dim() != 2 or pretrained.shape[1] != embedding_dim:
                raise ValueError(
                    f'weights must have shape (n, {embedding_dim}), got {tuple(pretrained.shape)}'
                )
            rows = min(vocab_size, pretrained.shape[0])
            with torch.no_grad():
                self.embedding.weight[:rows].copy_(pretrained[:rows])

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            bidirectional=True,
            # Inputs arrive as (batch, seq). Without batch_first the LSTM would treat
            # the batch axis as time and recur across different samples.
            batch_first=True,
        )
        self.h2h = nn.Linear(num_features, 300)
        self.relu = nn.ReLU()
        self.h2h_2 = nn.Linear(300, 200)
        self.dropout = nn.Dropout(p=0.5)
        # Both LSTM directions (2 * lstm_size) concatenated with the 200-d feature branch.
        self.i2o = nn.Linear(2 * self.lstm_size + 200, num_classes)

    def forward(self, x, features, prev_state=None):
        """Return ``(logits, state)``.

        ``x`` is a (batch, seq) LongTensor of token ids and ``features`` a
        (batch, num_features) float tensor. ``prev_state`` is an optional
        ``(h_0, c_0)`` pair for the LSTM; ``None`` starts from zeros. ``state`` is
        the LSTM's final ``(h_n, c_n)``.
        """
        embed = self.embedding(x)
        # nn.LSTM accepts None as the initial state, so no branching is needed.
        output, state = self.lstm(embed, prev_state)
        # Average the per-token outputs over the sequence axis.
        output = torch.mean(output, dim=1)
        features = self.relu(self.h2h(features))
        features = self.relu(self.dropout(self.h2h_2(features)))
        output = torch.cat((output, features), 1)
        logits = self.i2o(output)
        return logits, state

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean batch loss.

    Parameters
    ----------
    model : module called as ``model(ids, features)``; the first element of its
        return value is used as the logits
    iterator : iterable of ``(ids, features, labels, texts)`` batches
    optimizer : optimizer over the model's parameters
    criterion : loss called as ``criterion(logits, labels)``

    Returns
    -------
    float : sum of batch losses divided by the number of batches (``nan`` when the
    iterator yields nothing). The same value is printed.
    """
    print('Training...')
    epoch_loss = 0.0  # running sum of the per-batch losses
    n_batches = 0
    model.train()  # training mode (enables dropout)

    # Send batches to wherever the model lives instead of a hard-coded 'cuda:0'.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    for vectors, features, ys, texts in iterator:  # iterate over batches
        vectors = vectors.to(device)
        features = features.to(device)
        ys = ys.to(device)
        optimizer.zero_grad()  # reset gradients
        logits = model(vectors, features)[0]  # forward pass
        loss = criterion(logits, ys)  # loss value
        loss.backward()  # compute gradients
        optimizer.step()  # update weights
        epoch_loss += loss.item()
        n_batches += 1

    # Divide by the batch count: the last enumerate index is one short of it and is
    # zero for a single batch.
    mean_loss = epoch_loss / n_batches if n_batches else float('nan')
    print(f'Train loss: {mean_loss}')
    return mean_loss

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate the model without gradient tracking.

    Parameters
    ----------
    model : module called as ``model(ids, features)``; the first element of its
        return value is used as the logits
    iterator : iterable of ``(ids, features, labels, texts)`` batches
    criterion : loss called as ``criterion(logits, labels)``

    Returns
    -------
    (mean_loss, text, pred, label) : the mean batch loss (``nan`` when there are no
    batches) and three parallel lists with the raw texts, predicted class ids and
    true class ids. The overall accuracy is printed.
    """
    print("\nValidating...")
    epoch_loss = 0.0
    n_batches = 0
    model.eval()
    pred = []
    label = []
    text = []

    # Send batches to wherever the model lives instead of a hard-coded 'cuda:0'.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        for vectors, features, ys, texts in iterator:
            vectors = vectors.to(device)
            features = features.to(device)
            ys = ys.to(device)
            logits = model(vectors, features)[0]  # predictions for this batch
            epoch_loss += criterion(logits, ys).item()
            n_batches += 1
            text.extend(texts)
            pred.extend(logits.argmax(dim=1).cpu().tolist())
            label.extend(ys.cpu().tolist())

    # Accuracy over all samples. Averaging per-batch accuracies would give a short
    # final batch the same weight as a full one.
    n_correct = sum(int(p == t) for p, t in zip(pred, label))
    accuracy = n_correct / len(label) if label else float('nan')
    print(accuracy)
    mean_loss = epoch_loss / n_batches if n_batches else float('nan')
    return mean_loss, text, pred, label

In [None]:
# Use the first GPU when one is visible and fall back to the CPU otherwise, rather
# than failing on an unconditional .to('cuda:0').
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model = LModel(num_classes=13, weights=vectors, vocab_size=len(word2id))
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()  # multi-class cross entropy over the 13 logits (not binary)

# the model weights and the loss live on the same device as the other tensors
model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

In [None]:
# Per-epoch mean losses on the training and validation sets.
losses = []
losses_eval = []

for i in tqdm(range(10)):
    print(f'\nstarting Epoch {i}')
    epoch_loss = train(model, train_iterator, optimizer, criterion)
    losses.append(epoch_loss)

    # evaluate() returns (mean loss, texts, predictions, labels); only the loss
    # belongs in the history, otherwise every epoch stores the whole validation set.
    epoch_loss_on_test = evaluate(model, val_iterator, criterion)[0]
    losses_eval.append(epoch_loss_on_test)

  0%|          | 0/10 [00:00<?, ?it/s]


starting Epoch 0
Training...
Train loss: 2.051010639822277

Validating...
0.2449147324037739

starting Epoch 1
Training...
Train loss: 1.9432315263183957

Validating...
0.26168776613509226

starting Epoch 2
Training...
Train loss: 1.9108337753604647

Validating...
0.26670389079068213

starting Epoch 3
Training...
Train loss: 1.8973126799333457

Validating...
0.275748570176171

starting Epoch 4
Training...
Train loss: 1.8788344352678286

Validating...
0.27973146447357433

starting Epoch 5
Training...
Train loss: 1.8636385839104945

Validating...
0.28227801202304414

starting Epoch 6
Training...
Train loss: 1.8479790724783327

Validating...
0.2846001711613927

starting Epoch 7
Training...
Train loss: 1.8383729260968888

Validating...
0.28705670242965686

starting Epoch 8
Training...
Train loss: 1.831036548587075

Validating...
0.2882882295232529

starting Epoch 9
Training...
Train loss: 1.8255473342995812

Validating...
0.29197628788511315


In [None]:
# Final validation pass; class ids are mapped back to the original labels and the
# per-sample results are written to a CSV file.
loss, texts, predictions, labels = evaluate(model, val_iterator, criterion)
df = pd.DataFrame(
    {
        'texts': texts,
        'predictions': encoder.inverse_transform(predictions),
        'labels': encoder.inverse_transform(labels),
    }
)
df.to_csv('LSTM_Word2Vec.csv')


Validating...
0.29197628788511315
