In [1]:
from datasets import load_dataset
import nltk
from tqdm import tqdm
import re
import spacy
from spacy.tokenizer import _get_regex_pattern

  import pynvml  # type: ignore[import]


Создание датасета из файлов папки SemEval2010

In [19]:
import os
import glob
import re
from datasets import Dataset

def create_dataset(text_dir, keyphrases_dir):
    text_files = sorted(glob.glob(os.path.join(text_dir, "*.txt")))
    data = []
    for text_file in text_files:
        base_name = os.path.splitext(os.path.basename(text_file))[0]
        key_file = os.path.join(keyphrases_dir, f"{base_name}.key")
        with open(text_file, 'r', encoding='utf-8') as f:
            text = f.read()
        text = re.sub(r'[\n\r\t]+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        with open(key_file, 'r', encoding='utf-8') as f:
            keyphrases = []
            for line in f:
                line = line.strip()
                if line:
                    cleaned_line = re.sub(r'[\n\r\t]+', ' ', line)
                    cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
                    if cleaned_line:
                        keyphrases.append(cleaned_line)     
        data.append({
            'id': base_name,
            'text': text,
            'keyphrases': keyphrases
        })
    dataset = Dataset.from_list(data)
    return dataset

dataset = create_dataset(r".\SemEval2010\docsutf8", r".\SemEval2010\keys")

In [20]:
dataset['text'][0]

'Scalable Grid Service Discovery Based on UDDI* * Authors are listed in alphabetical order. Sujata Banerjee$ , Sujoy Basu$ , Shishir Garg , Sukesh Garg , Sung-Ju Lee$ , Pramila Mullan , Puneet Sharma$ $ HP Labs 1501 Page Mill Road Palo Alto, CA, 94304 USA +1-650-857-2137 {sujata.banerjee,sujoy.basu,sungju.lee,puneet.sharma}@hp.com France Telecom R&D Division 801 Gateway Blvd, # 500 South San Francisco, CA, 94080 USA +1 650 -875-1500 {shishir.garg,sukesh.garg,pramila.mullan}@francetelecom.com ABSTRACT Efficient discovery of grid services is essential for the success of grid computing. The standardization of grids based on web services has resulted in the need for scalable web service discovery mechanisms to be deployed in grids Even though UDDI has been the de facto industry standard for web-services discovery, imposed requirements of tight-replication among registries and lack of autonomous control has severely hindered its widespread deployment and usage. With the advent of grid compu

Импортирование и редактирование языковой модели под слова с дефисами

In [4]:
nlp = spacy.load("en_core_web_sm")
re_token_match = _get_regex_pattern(nlp.Defaults.token_match)
re_token_match = f"({re_token_match}|\w+-\w+)"
nlp.tokenizer.token_match = re.compile(re_token_match).match

  re_token_match = f"({re_token_match}|\w+-\w+)"


Обработка данных: стеммизация ключевых слов. Текст на лишние символы не обрабатывался

In [5]:
def preprocessing(dataset, nlp):
    keyphrases = []
    text = []
    for sample in tqdm(dataset):
        sample_keyphrases = []
        text.append(sample['text'])
        for keyphrase in sample["keyphrases"]:
            tokens = [token.text for token in nlp(keyphrase)]
            stemmed = [nltk.stem.snowball.SnowballStemmer('porter').stem(token.lower()) for token in tokens]
            sample_keyphrases.append(" ".join(stemmed))
        keyphrases.append(sample_keyphrases)
    return text, keyphrases

In [6]:
text, references = preprocessing(dataset, nlp)

  0%|          | 0/243 [00:00<?, ?it/s]

100%|██████████| 243/243 [00:23<00:00, 10.18it/s]


In [7]:
def extraction(model, text, references, stemmer, vectorizer=None):
    output = []
    for doc, reference in tqdm(zip(text, references)):
        n = len(reference)
        match model.__class__.__name__:
            case 'KeyBERT':
                keywords = model.extract_keywords(
                    doc,
                    keyphrase_ngram_range=(1, 3),
                    vectorizer=vectorizer,
                    use_mmr=True,
                    diversity=0.7,
                    top_n=n * 2)
                stemmed = []
                for kw, score in keywords:
                    words = kw.split()
                    stemmed_words = [stemmer.stem(word.lower()) for word in words]
                    stemmed_phrase = " ".join(stemmed_words)    
                    stemmed.append(stemmed_phrase)
                output.append(stemmed[:n])
            case 'Rake':
                model.extract_keywords_from_text(doc)
                keywords = model.get_ranked_phrases()
                stemmed = []
                for kw in keywords:
                    words = kw.split()
                    stemmed_words = [stemmer.stem(word.lower()) for word in words]
                    stemmed_phrase = " ".join(stemmed_words)    
                    stemmed.append(stemmed_phrase)
                output.append(stemmed[:n])
            case _:
                model.load_document(input=doc)
                model.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")
                model.candidate_weighting()
                output.append([keyword for keyword, rank in model.get_n_best(n=n, stemming=True)])
    return output

Извлечение слов. Максимальная длина n-грама 3. KeyBERT использует пользовательский векторизатор, учитывающий грамматическую связь слов

In [8]:
from pke.unsupervised import TextRank, TopicRank, YAKE
from CustomVectorizer import CustomVectorizer
from nltk.corpus import stopwords
from keybert import KeyBERT
from rake_nltk import Rake
nltk.download('stopwords')
nltk.download('punkt')
stemmer = stemmer = nltk.stem.snowball.SnowballStemmer('porter')
nlp = spacy.load("en_core_web_sm")
vectorizer = CustomVectorizer(
    nlp_model=nlp,
    ngram_range=(1, 3),
    stop_words='english')
outputs = {}
for model in [KeyBERT, Rake, TextRank, TopicRank, YAKE]:
    extractor = model()
    if extractor.__class__.__name__ == 'Rake':
        extractor = model(stopwords=stopwords.words('english'), language='english', max_length=3)
    outputs[extractor.__class__.__name__] = extraction(extractor, text, references, stemmer, vectorizer=vectorizer)
outputs




[nltk_data] Downloading package stopwords to C:\Users\Maxim
[nltk_data]     Zubarev\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Maxim
[nltk_data]     Zubarev\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
243it [20:27,  5.05s/it]
243it [00:20, 11.89it/s]
243it [07:25,  1.83s/it]
243it [22:10,  5.47s/it]
243it [07:38,  1.89s/it]


{'KeyBERT': [['grid servic discoveri',
   'sap',
   'ieee intern srd',
   'web servic permiss',
   'iptp',
   'uddi deploy',
   'simpl replic',
   'other distant node',
   'qo framework',
   'xmln',
   'ogsi wg',
   'franc telecom',
   'http://www.planet-lab.org',
   'acm middlewar',
   'comput repair',
   'v2 specif',
   'new local registri',
   'scale issu',
   'hash tabl data'],
  ['sensor deploy strategi',
   'network target detect',
   'path exposur',
   'drive madison',
   'data fusion',
   'alarm',
   'dijkstra"',
   'consensu decis',
   'awgn nois',
   'mobil radio servic',
   'cost function',
   'virtual point a',
   'larger region',
   '≤ n'],
  ['voip audio conferenc',
   'activ client',
   'address alloc architectur',
   'media handl solut',
   'variou confer size',
   'scalabl framework',
   'transport protocol',
   'network delay',
   'system mix',
   'session descript',
   'multi parti',
   'ieee itr',
   'realiti',
   'bangalor',
   'loud number',
   'top nmax stream'],

In [9]:
import pickle

with open('outputs_semeval2010.pkl', 'wb') as f:
    pickle.dump(outputs, f)

In [12]:
import pickle

with open('outputs_semeval2010.pkl', 'rb') as f:
    outputs = pickle.load(f)

Оценка моделей F1-метрикой

In [13]:
def evaluation(predictions, references):
    precision = len(set(predictions) & set(references)) / len(set(predictions))
    recall = len(set(predictions) & set(references)) / len(set(references))
    F1_metrics = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
    return F1_metrics

Плохой результат из-за недостаточной обработки текста

In [14]:
import numpy as np
evals = {}
for model in outputs:
    scores = []
    for output, reference in zip(outputs[model], references):
        scores.append(evaluation(output, reference))
    avg_score = np.mean(scores, axis = 0)
    evals[model] = round(avg_score, 2)
evals

{'KeyBERT': np.float64(0.03),
 'Rake': np.float64(0.01),
 'TextRank': np.float64(0.02),
 'TopicRank': np.float64(0.13),
 'YAKE': np.float64(0.02)}

In [18]:
outputs['YAKE'][233]

['tclose f − ts c ≤ lc ≤ vc',
 'ˆdi ≥ ˆli = ⇒ ei',
 '≥ li proof',
 'i ∈ avail',
 '∀ ˆli ≥ li',
 'v1 ≥ √ k',
 '+ lf = df',
 '− t − li',
 'df = df +',
 '+ δ = k',
 'time t ∈',
 'least li',
 'vi li',
 'θ1 ×',
 '∈ o',
 'ri ≤ t.',
 'k ≥',
 'te =']