In [1]:
import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to /home/allworder/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import torch

Будем использовать NDCG метрику для оценки качества модели

In [4]:
from sklearn.metrics import ndcg_score

Меткрики близости веторов:

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
import vector_space_models_utils
from vector_space_models_utils import rank_comments

# Word2vec and GloVe pretrained models vectorization

В данном разделе для перевода текстов комментариев и поста в вектора мы будем использовать уже предобученные embedding Word2vec или GloVe.

In [7]:
import gensim.downloader

Загрузим модель:

In [8]:
glove_embeddings = gensim.downloader.load('glove-wiki-gigaword-50')
glove_vectors_dim = 50

Создадим токенизатор с небольшой предобработкой текста:

In [9]:
class NLTK_Tokenizer:
    def __init__(self):
        pass
    def tokenize(self, text):
        tokens = word_tokenize(text.lower())
        return tokens
    
nltk_tokenizer = NLTK_Tokenizer()

Создадим экземпляр класса Embeddings_text_to_vector с описанными ранее embedding и tokenizer:

In [10]:
from vector_space_models_utils import Embeddings_text_to_vector

In [11]:
glove_wiki_50_vectorizer = Embeddings_text_to_vector(glove_embeddings, nltk_tokenizer, glove_vectors_dim)

In [8]:
# ВРЕМЕННАЯ ЗАГЛУШКА
train_df = pd.read_csv('ranking_train.csv')
post_text = str(train_df['text'][0])
comments_text = list(train_df['comments_text'][:5])

In [13]:
predicted_ranks = [rank_comments(post_text, comments_text, glove_wiki_50_vectorizer)]
print(f'predicted ranks: {predicted_ranks}')
true_ranks = [[0, 1, 2, 3, 4]]

print(f'NDCG@5: {ndcg_score(true_ranks, predicted_ranks, k=5)}')

predicted ranks: [[0, 3, 1, 2, 4]]
NDCG@5: 0.9547778391728274


# Tf-IDF vectorization

В данном разделе мы будем использовать Tf-IDF вектора для векторизации текста

В начале загрузим корпус документов, состоящий из текстов и постов, и комментариев:

In [8]:
# В ЯЧЕЙКЕ ВРЕМЕННО ЗАГЛУШКА СТОИТ!
train_df = pd.read_csv('ranking_train.csv')
corpus = train_df['comments_text'][:1000]

Препроцессинг документов в корпусе:

In [14]:
# ЗДЕСЬ НУЖНО ПРОВЕСТИ ЛЕММАТИЗАЦИЮ, УДАЛЕНИЕ МУСОРНЫХ ТОКЕНОВ
class NLTK_Tokenizer:
    def __init__(self):
        pass
    def tokenize(self, text):
        tokens = word_tokenize(text.lower())
        return tokens
    
nltk_tokenizer = NLTK_Tokenizer()

Далее воспользуемся библиотекой sklearn для подсчёта IDF слов в корпусе документов:

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
tf_idf_vectorizer = TfidfVectorizer(use_idf=True)
tf_idf_vectorizer.fit(list(corpus))
idf_dict = dict(zip(tf_idf_vectorizer.get_feature_names_out(), tf_idf_vectorizer.idf_))

Создадим экземпляр класса Tf_idf_vectorizer:

In [13]:
from vector_space_models_utils import Tf_idf_vectorizer

In [15]:
our_tf_idf_vectorizer = Tf_idf_vectorizer(idf_dict, nltk_tokenizer)

Оценим качество модели:

In [18]:
predicted_ranks = [rank_comments(post_text, comments_text, our_tf_idf_vectorizer)]
print(f'predicted ranks: {predicted_ranks}')
true_ranks = [[0, 1, 2, 3, 4]]

print(f'NDCG@5: {ndcg_score(true_ranks, predicted_ranks, k=5)}')

predicted ranks: [[4, 0, 2, 3, 1]]
NDCG@5: 0.6830575063802934


# BERT model

В данном разделе будем получать вектор предложения из латентных слоёв модели BERT:

In [10]:
from transformers import BertTokenizer, BertModel

In [12]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
bert_model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

Создадим экземпляр класса BERT_vectorizer:

In [13]:
from vector_space_models_utils import BERT_vectorizer

In [14]:
bert_vectorizer = BERT_vectorizer(bert_model, bert_tokenizer)

Оценим качество модели:

In [18]:
predicted_ranks = [rank_comments(post_text, comments_text, bert_vectorizer)]
print(f'predicted ranks: {predicted_ranks}')
true_ranks = [[0, 1, 2, 3, 4]]

print(f'NDCG@5: {ndcg_score(true_ranks, predicted_ranks, k=5)}')

predicted ranks: [[2, 4, 0, 3, 1]]
NDCG@5: 0.7358812399521907
