Данные

In [0]:
!pip install pymorphy2

In [0]:
import nltk
nltk.download('stopwords')

In [0]:
import json, os
import pandas as pd
from nltk.corpus import stopwords
import numpy as np
from pymorphy2 import MorphAnalyzer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
morph = MorphAnalyzer()
stops = set(stopwords.words('russian'))

In [4]:
from google.colab import files
uploaded = files.upload()

Saving ru_kw_eval_datasets-master.zip to ru_kw_eval_datasets-master.zip


In [0]:
!unzip ru_kw_eval_datasets-master.zip

In [0]:
pd.set_option('display.max_colwidth', 1000)

In [0]:
PATH_TO_DATA = 'ru_kw_eval_datasets-master/data/'

In [0]:
# os.listdir returns entries in arbitrary order; sorting keeps the file order
# (and any subset taken from it later) reproducible between runs.
files = [os.path.join(PATH_TO_DATA, file) for file in sorted(os.listdir(PATH_TO_DATA))]

In [0]:
# Only the first five files are used, so slice the path list before reading
# instead of parsing every file and discarding the rest.
data = pd.concat([pd.read_json(file, lines=True) for file in files[:5]], axis=0, ignore_index=True)

In [11]:
data.shape

(4991, 5)

In [0]:
data.head(5)

In [0]:
def evaluate(true_kws, predicted_kws):
    """Print macro-averaged precision, recall, F1 and Jaccard for keyword predictions.

    Gold and predicted keywords of each document are compared as sets. The
    per-document scores are averaged over all documents and printed rounded
    to two decimals. Nothing is returned.

    Args:
        true_kws: sequence of gold keyword collections, one per document.
        predicted_kws: sequence of predicted keyword collections, aligned
            with ``true_kws`` by position.

    Raises:
        AssertionError: if the two sequences differ in length.
    """
    assert len(true_kws) == len(predicted_kws)

    precisions = []
    recalls = []
    f1s = []
    jaccards = []

    # zip pairs documents by position, so lists and pandas Series behave the
    # same way regardless of the Series' index labels.
    for true_doc, predicted_doc in zip(true_kws, predicted_kws):
        true_kw = set(true_doc)
        predicted_kw = set(predicted_doc)

        tp = len(true_kw & predicted_kw)
        union = len(true_kw | predicted_kw)
        fp = len(predicted_kw - true_kw)
        fn = len(true_kw - predicted_kw)

        # Every ratio falls back to 0 when its denominator is 0.
        prec = tp / (tp + fp) if (tp + fp) else 0
        rec = tp / (tp + fn) if (tp + fn) else 0
        f1 = (2 * (prec * rec)) / (prec + rec) if (prec + rec) else 0
        # union is 0 when both keyword sets are empty; score it as 0 like the
        # other metrics instead of raising ZeroDivisionError.
        jac = tp / union if union else 0

        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)
        jaccards.append(jac)

    print('Precision - ', round(np.mean(precisions), 2))
    print('Recall - ', round(np.mean(recalls), 2))
    print('F1 - ', round(np.mean(f1s), 2))
    print('Jaccard - ', round(np.mean(jaccards), 2))

In [0]:
from string import punctuation
from nltk.corpus import stopwords
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize1(text):
    """Lowercase and whitespace-tokenize ``text``, then lemmatize the kept tokens.

    Tokens are stripped of the characters in ``punct``; empty tokens and
    tokens found in ``stops`` are dropped. Each remaining token is replaced
    by the normal form of its first ``morph`` parse.

    Returns:
        list of lemmas in text order.
    """
    lemmas = []
    for raw_token in text.lower().split():
        token = raw_token.strip(punct)
        # skip tokens that were pure punctuation and stop words
        if not token or token in stops:
            continue
        lemmas.append(morph.parse(token)[0].normal_form)

    return lemmas

def normalize2(text):
    """Lowercase, tokenize and lemmatize ``text``, keeping only nouns.

    Tokens are stripped of the characters in ``punct``; empty tokens and
    tokens found in ``stops`` are dropped. Each remaining token is
    lemmatized, and the lemma is kept only if its own first ``morph`` parse
    carries the ``NOUN`` grammeme.

    Returns:
        list of noun lemmas in text order.
    """
    nouns = []
    for raw_token in text.lower().split():
        token = raw_token.strip(punct)
        if not token or token in stops:
            continue
        lemma = morph.parse(token)[0].normal_form
        # The part-of-speech filter inspects the lemma's own first parse.
        # Parse it once and reuse the result for both the tag and the form.
        lemma_parse = morph.parse(lemma)[0]
        if 'NOUN' in lemma_parse.tag:  # nouns only
            nouns.append(lemma_parse.normal_form)

    return nouns
    
def normalize3(text):
    """Lowercase, tokenize and lemmatize ``text``, keeping nouns and full adjectives.

    Tokens are stripped of the characters in ``punct``; empty tokens and
    tokens found in ``stops`` are dropped. Each remaining token is
    lemmatized, and the lemma is kept only if its own first ``morph`` parse
    carries the ``NOUN`` or ``ADJF`` grammeme.

    Returns:
        list of noun/adjective lemmas in text order.
    """
    kept = []
    for raw_token in text.lower().split():
        token = raw_token.strip(punct)
        if not token or token in stops:
            continue
        lemma = morph.parse(token)[0].normal_form
        # Parse the lemma once and reuse the result for both tag checks and
        # the final normal form.
        lemma_parse = morph.parse(lemma)[0]
        if 'NOUN' in lemma_parse.tag or 'ADJF' in lemma_parse.tag:  # nouns and adjectives only
            kept.append(lemma_parse.normal_form)

    return kept

In [15]:
#проверка
evaluate(data['keywords'], data['keywords'])

Precision -  1.0
Recall -  1.0
F1 -  1.0
Jaccard -  1.0


Подготовка данных

In [0]:
data['content_norm_base'] = data['content'].apply(normalize1)

In [0]:
data['content_norm_base_str'] = data['content_norm_base'].apply(' '.join)

In [0]:
data['content_norm'] = data['content'].apply(normalize2)

In [0]:
data['content_norm_str'] = data['content_norm'].apply(' '.join)

In [0]:
data['title_norm'] = data['title'].apply(normalize2)

Tfidf-baseline

In [0]:
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=5)

In [0]:
tfidf.fit(data['content_norm_base_str'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=5, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [0]:
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back for older versions.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
# column index of the tf-idf matrix -> vocabulary term
id2word = dict(enumerate(feature_names))
texts_vectors = tfidf.transform(data['content_norm_base_str'])

In [0]:

# For every document take up to 10 terms with the highest tf-idf weight.
keywords_base = []

for row in range(texts_vectors.shape[0]):
    weights = texts_vectors.getrow(row).toarray()[0]
    # indices of the 10 largest weights, in descending order
    top_inds = weights.argsort()[:-11:-1]
    # A zero weight means the term does not occur in this document, so it
    # must not be reported as a keyword (matters for short documents).
    keywords_base.append([id2word[w] for w in top_inds if weights[w] > 0])

In [0]:
#здесь бэйзлайн
evaluate(data['keywords'], keywords_base)

Precision -  0.1
Recall -  0.13
F1 -  0.11
Jaccard -  0.06


Некоторые улучшения

In [0]:
# most frequent tokens of the normalized text
evaluate(
    data['keywords'],
    data['content_norm'].apply(
        lambda tokens: [token for token, _count in Counter(tokens).most_common(10)]
    ),
)

Precision -  0.11
Recall -  0.13
F1 -  0.12
Jaccard -  0.07


Самые частотные слова из текста после нормализации (приведение к нижнему регистру, лемматизация, удаление стоп-слов и всех частей речи, кроме существительных). Есть небольшой прирост по сравнению с tf-idf бейзлайном.

In [67]:
# most frequent tokens of the normalized title
evaluate(
    data['keywords'],
    data['title_norm'].apply(
        lambda tokens: [token for token, _count in Counter(tokens).most_common(10)]
    ),
)

Precision -  0.14
Recall -  0.09
F1 -  0.11
Jaccard -  0.06


Если извлекать частотные слова из заголовков, то падает полнота (что, в принципе, понятно), зато растёт точность. 

Графы

In [0]:
import networkx as nx

In [0]:
def build_matrix(text, window_size=5):
    """Build a symmetric word co-occurrence matrix from a token list.

    The token list is cut into consecutive, non-overlapping chunks of
    ``window_size`` tokens. Every pair of positions inside a chunk adds one
    co-occurrence in both directions, so the matrix describes an undirected
    graph.

    Args:
        text: list of tokens.
        window_size: number of tokens per chunk.

    Returns:
        (m, id2word): ``m`` is a square numpy array of co-occurrence counts;
        ``id2word`` maps a row/column index to its token.
    """
    # Imported locally: no visible notebook cell imports `combinations`, so
    # the function would raise NameError on a fresh kernel.
    from itertools import combinations

    # First-occurrence order keeps the index assignment deterministic.
    # Iterating a set of strings can differ between kernel sessions.
    vocab = list(dict.fromkeys(text))
    word2id = {w: i for i, w in enumerate(vocab)}
    id2word = {i: w for i, w in enumerate(vocab)}
    # convert tokens to indices for convenience
    ids = [word2id[word] for word in text]

    # co-occurrence matrix
    m = np.zeros((len(vocab), len(vocab)))

    # walk over the text chunk by chunk
    for start in range(0, len(ids), window_size):
        window = ids[start:start + window_size]
        # add one to every pair of tokens in this chunk
        for j, k in combinations(window, 2):
            # update both directions so the graph is undirected
            m[j, k] += 1
            m[k, j] += 1

    return m, id2word

def some_centrality_measure(text, window_size=5, topn=5):
    """Return the ``topn`` tokens ranked highest by degree centrality.

    A co-occurrence matrix is built from ``text`` with ``build_matrix``,
    turned into a networkx graph, and the nodes are ranked by degree
    centrality in descending order.

    Args:
        text: list of tokens.
        window_size: chunk size passed to ``build_matrix``.
        topn: number of tokens to return.

    Returns:
        list of at most ``topn`` tokens.
    """
    cooccurrence, index_to_word = build_matrix(text, window_size)
    graph = nx.from_numpy_array(cooccurrence)
    # any other centrality measure can be swapped in here
    centrality = dict(nx.degree_centrality(graph))

    ranked = sorted(centrality.items(), key=lambda item: item[1], reverse=True)
    return [index_to_word[node] for node, _score in ranked[:topn]]

In [89]:
%%time
keyword_nx = data['content_norm'].apply(lambda x: some_centrality_measure(x, 10, 10))

CPU times: user 46.3 s, sys: 31 ms, total: 46.3 s
Wall time: 46.4 s


In [90]:
evaluate(data['keywords'], keyword_nx)


Precision -  0.11
Recall -  0.13
F1 -  0.11
Jaccard -  0.07


С графом на networkx есть небольшой прирост относительно бейзлайна (по точности и мере Жаккара), но не относительно частотных слов. Пробовал другие метрики (closeness_centrality), но исходная оказалась лучше.

RAKE

In [54]:
!pip install rake_nltk



In [0]:
import rake_nltk

In [0]:
# Only the `rake_nltk` module is imported, so the class must be reached
# through it; a bare `rake` name is undefined.
r = rake_nltk.Rake(stops)


In [0]:
keywords_rk = []
for text in data['content_norm_str']:  # a stemmer could be tried here
    # The ranked phrases are read back from `r` below, so the return value
    # of extract_keywords_from_text is not kept.
    r.extract_keywords_from_text(text)
    keywords_rk.append(r.get_ranked_phrases())

In [76]:
evaluate(data['keywords'], keywords_rk)

Precision -  0.01
Recall -  0.0
F1 -  0.0
Jaccard -  0.0


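С RAKE что-то совсем не вышло. Вероятная причина: на вход подаётся уже нормализованный текст без стоп-слов и пунктуации, а RAKE использует их как границы фраз.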