In [None]:
#libraries installation:

In [None]:
#From RAKE-nltk page:

In [None]:
from rake_nltk import Rake

# Sample input -- replace with the document you want to analyse.
text = (
    "Keyword extraction identifies the most relevant phrases in a document. "
    "RAKE ranks candidate phrases by word frequency and co-occurrence."
)
# The same input as a list of strings where each string is one sentence.
sentences = [
    "Keyword extraction identifies the most relevant phrases in a document.",
    "RAKE ranks candidate phrases by word frequency and co-occurrence.",
]

# Uses stopwords for English from NLTK, and all punctuation characters by
# default.
r = Rake()

# Extraction given the text.
r.extract_keywords_from_text(text)

# Extraction given the list of strings where each string is a sentence.
r.extract_keywords_from_sentences(sentences)

# To get keyword phrases ranked highest to lowest.
r.get_ranked_phrases()

# To get keyword phrases ranked highest to lowest with scores.
# (Last expression of the cell, so this is the value the notebook displays.)
r.get_ranked_phrases_with_scores()

In [None]:
# From vc.ru:

In [None]:
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally.
nltk.download("stopwords")
# De-duplicated list of Russian stop words.
stops = list(set(stopwords.words("russian")))

# rake_nltk's Rake caps the phrase length (in words) with `max_length`;
# it has no `max_words` argument and no `apply()` method.
r = Rake(stopwords=stops, max_length=3)
# `text` must already hold the document to analyse.
r.extract_keywords_from_text(text)
# Ranked (score, phrase) pairs -- displayed as the cell output.
r.get_ranked_phrases_with_scores()

In [None]:
# From towardsdatascience:

In [None]:
def rake_extractor(text, top_n=5):
    """
    Uses Rake to extract the top-ranked keyword phrases from a text.

    Arguments:
        text (str): document to extract keywords from
        top_n (int | None): maximum number of phrases to return, taken from
            the top of Rake's ranking; defaults to 5. ``None`` returns every
            ranked phrase.
    Returns: list of keyword phrases, highest ranked first (list)
    """
    r = Rake()
    r.extract_keywords_from_text(text)
    # Slicing (rather than indexing) also copes with texts that yield fewer
    # than `top_n` phrases.
    return r.get_ranked_phrases()[:top_n]

In [None]:
# Spacy match function

In [None]:
def match(keyword):
    """Tell whether a keyword phrase contains an accepted part-of-speech sequence.

    Arguments: keyword (str) -- the phrase, parsed with the global ``nlp`` pipeline
    Returns: True when the Matcher reports at least one match, else False (bool)
    """
    # Every tuple is one accepted sequence of POS tags, in token order.
    pos_sequences = (
        ("PROPN", "VERB", "VERB"),
        ("NOUN", "VERB", "NOUN"),
        ("VERB", "NOUN"),
        ("ADJ", "ADJ", "NOUN"),
        ("NOUN", "VERB"),
        ("PROPN", "PROPN", "PROPN"),
        ("PROPN", "PROPN", "NOUN"),
        ("ADJ", "NOUN"),
        ("ADJ", "NOUN", "NOUN", "NOUN"),
        ("PROPN", "PROPN", "PROPN", "ADV", "PROPN"),
        ("PROPN", "PROPN", "PROPN", "VERB"),
        ("PROPN", "PROPN"),
        ("NOUN", "NOUN"),
        ("ADJ", "PROPN"),
        ("PROPN", "ADP", "PROPN"),
        ("PROPN", "ADJ", "NOUN"),
        ("PROPN", "VERB", "NOUN"),
        ("NOUN", "ADP", "NOUN"),
        ("PROPN", "NOUN", "PROPN"),
        ("VERB", "ADV"),
        ("PROPN", "NOUN"),
    )
    # Expand each tag sequence into the token-dict form the Matcher expects.
    patterns = [[{'POS': tag} for tag in sequence] for sequence in pos_sequences]

    pos_matcher = Matcher(nlp.vocab)
    pos_matcher.add("pos-matcher", patterns)

    # Parse the keyword and run all patterns over it in one go.
    hits = pos_matcher(nlp(keyword))
    return len(hits) > 0

In [None]:
# Extractor function

In [None]:
def extract_keywords_from_corpus(extractor, corpus):
    """Run one keyword extractor over every document of a corpus and time it.

    Args:
        extractor: callable that takes a single document and returns its
            keywords. Its ``__name__`` is used in the log output and in the
            returned record.
        corpus: iterable of documents, processed in iteration order.

    Returns:
        dict with the keys ``"algorithm"`` (the extractor's ``__name__``),
        ``"corpus_kws"`` (``{document index: extractor output}``) and
        ``"elapsed_time"`` (duration of the extraction loop as ``HH:MM:SS``).
    """
    extractor_name = extractor.__name__.replace("_extractor", "")
    logging.info(f"Starting keyword extraction with {extractor_name}")
    corpus_kws = {}
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()
    # logging.info(f"Timer initiated.") <-- uncomment this if you want to output start of timer
    # tqdm wraps the corpus itself (not the enumerate object) so that it can
    # read the corpus length and show a total / ETA.
    for idx, text in enumerate(tqdm(corpus, desc="Extracting keywords from corpus...")):
        corpus_kws[idx] = extractor(text)
    elapsed_seconds = time.perf_counter() - start
    # logging.info(f"Timer stopped.") <-- uncomment this if you want to output end of timer
    # Formatted to whole seconds; going through gmtime means the hour field
    # wraps around after 24 hours.
    elapsed = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds))
    logging.info(f"Time elapsed: {elapsed}")

    return {"algorithm": extractor.__name__,
            "corpus_kws": corpus_kws,
            "elapsed_time": elapsed}

In [None]:
    # NOTE(review): stray fragment -- these lines repeat the closing lines of
    # benchmark(), which is defined further down in this notebook. On their own
    # they are indented with no enclosing function, `results` is not defined in
    # this cell, and `return` sits outside any `def`, so the cell cannot run as
    # written. Looks like copy-paste residue; confirm and delete the cell.
    # create results dataframe
    df = pd.DataFrame(results)
    df.to_csv("results.csv", index=False)
    logging.info("Benchmark finished. Results saved to results.csv")
    return df

In [None]:
# Benchmark function

In [None]:
def benchmark(corpus, shuffle=True, extractors=None, output_path="results.csv"):
    """Benchmark keyword extractors on a corpus and save a summary table.

    Every extractor is first run over the whole corpus with
    ``extract_keywords_from_corpus``; afterwards each extracted keyword is
    checked with ``match`` and per-extractor averages are computed.

    Args:
        corpus: iterable of documents. It is copied into a new list, so the
            caller's object is never reordered.
        shuffle: when True, the documents are processed in random order.
        extractors: extractor callables to compare. Defaults to the seven
            ``*_extractor`` functions listed in the body.
        output_path: path of the CSV file the summary is written to.

    Returns:
        pandas.DataFrame with one row per extractor and the columns
        ``algorithm``, ``elapsed_time``, ``avg_keywords_per_document``,
        ``avg_matched_keywords_per_document``,
        ``avg_percentage_matched_keywords`` (a ratio, rounded to 2 decimals)
        and ``performance_score``.
    """
    logging.info("Starting benchmark...\n")

    # Work on a copy: random.shuffle reorders in place and would otherwise
    # scramble the list owned by the caller.
    corpus = list(corpus)
    if shuffle:
        random.shuffle(corpus)

    if extractors is None:
        extractors = [
            rake_extractor,
            yake_extractor,
            topic_rank_extractor,
            position_rank_extractor,
            single_rank_extractor,
            multipartite_rank_extractor,
            keybert_extractor,
        ]

    # Run (and time) every extractor before any matching work starts.
    results = [extract_keywords_from_corpus(extractor, corpus) for extractor in extractors]

    for result in results:
        # The raw keywords are only needed for the statistics below and are
        # left out of the final table.
        corpus_kws = result.pop("corpus_kws")

        # Keywords extracted per document, and how many of them pass match().
        kw_counts = [len(kws) for kws in corpus_kws.values()]
        matched_counts = [sum(1 for kw in kws if match(kw)) for kws in corpus_kws.values()]

        # An empty corpus yields 0.0 instead of numpy's nan + RuntimeWarning.
        avg_kws = np.mean(kw_counts) if kw_counts else 0.0
        avg_matched = np.mean(matched_counts) if matched_counts else 0.0

        result["avg_keywords_per_document"] = avg_kws
        result["avg_matched_keywords_per_document"] = avg_matched
        # Share of extracted keywords that matched, as a ratio rounded to
        # 2 decimals; 0.0 when nothing was extracted (avoids dividing by zero).
        result["avg_percentage_matched_keywords"] = round(avg_matched / avg_kws, 2) if avg_kws else 0.0

        # Average matched keywords per document divided by the elapsed time.
        # get_sec presumably converts the HH:MM:SS string to seconds -- it is
        # not defined in this part of the notebook; the + 0.1 keeps the
        # divisor above zero for runs that round down to 0 seconds.
        elapsed_seconds = get_sec(result["elapsed_time"]) + 0.1
        result["performance_score"] = round(avg_matched / elapsed_seconds, 2)

    # create results dataframe
    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)
    logging.info(f"Benchmark finished. Results saved to {output_path}")
    return df

In [None]:
# Navec

In [9]:
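# Download the Navec embeddings archive (run once). In a notebook the shell command needs a
# leading "!" -- a bare `wget ...` line is parsed as Python and raises SyntaxError.
#!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar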

SyntaxError: invalid syntax (454629916.py, line 1)

In [4]:
from navec import Navec
# Location of the Navec archive, relative to the notebook's working directory.
path = 'data/navec_hudlit_v1_12B_500K_300d_100q.tar'
# NOTE(review): the alternative below is identical to the active path. If the
# archive is fetched with wget into the current directory, it presumably
# should drop the 'data/' prefix -- confirm.
#path = 'data/navec_hudlit_v1_12B_500K_300d_100q.tar' uncomment if wget is used
# Load Navec from the archive at `path`.
navec = Navec.load(path)