### Preparation

In [None]:
prepare = False
if prepare == True:
    !pip install pytextrank
    !python -m spacy download pl_core_news_sm #or lg

In [None]:
import pandas as pd
import numpy as np
import pytextrank
import spacy
from icecream import ic

In [None]:
# Polish sample text; it is fed to the Polish pytextrank and summa cells below.
text_pl = "Wikipedia – wielojęzyczna encyklopedia internetowa działająca zgodnie z zasadą otwartej treści. Funkcjonuje w oparciu o oprogramowanie MediaWiki (haw. wiki – „szybko”, „prędko”), wywodzące się z koncepcji WikiWikiWeb, umożliwiające edycję każdemu użytkownikowi odwiedzającemu stronę i aktualizację jej treści w czasie rzeczywistym. Słowo Wikipedia jest neologizmem powstałym w wyniku połączenia wyrazów wiki i encyklopedia. Slogan Wikipedii brzmi: „Wolna encyklopedia, którą każdy może redagować”. Serwis był notowany w rankingu Alexa na miejscu 13[1]. "
# English sample text; it is fed to the English pytextrank and summa cells below.
text_en = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."

## Pytextrank
https://pypi.org/project/pytextrank/

### English pytextrank

In [None]:
# English version from textrank example
if prepare == True:
    !python -m spacy download en_core_web_sm
    
# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")


# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")
nlp.pipe_names



In [None]:
# Run the English pipeline (spaCy model + textrank component) on the sample text.
doc = nlp(text_en)

# Phrases exposed by the textrank component; reused later for the comparison with summa.
pytextrank_result_en = doc._.phrases

# examine top-ranked phrases in the document
for phrase in pytextrank_result_en[:5]:
    ic(phrase)

# Keep a handle on the textrank extension object (only the commented-out plot below uses it).
tr = doc._.textrank

# Optional visualisation, disabled; it needs the extra packages installed first.
# if prepare == True:
#     !pip install "altair"
#     !pip install 'pytextrank[viz]'
# tr.plot_keyphrases()

In [None]:
# Print every noun chunk spaCy finds in the current `doc` (the English document when the
# notebook is run top to bottom).
for chunk in doc.noun_chunks:
    ic(chunk)

### Polski pytextrank

In [None]:
from spacy.matcher import Matcher
from spacy.attrs import POS
from spacy.tokens import Doc

def get_chunks(doc):
    """Yield ``(start, end, label)`` triples for adjective(s) + noun(s) sequences.

    A rule-based matcher looks for one or more ``ADJ`` tokens followed by one or more
    ``NOUN``/``PROPN`` tokens and reports each match as a span labelled ``"NP"``.

    Args:
        doc: The object to search; it must expose ``.vocab`` and be accepted by
            ``spacy.matcher.Matcher`` (a spaCy ``Doc`` in this notebook).

    Yields:
        Tuples ``(start, end, np_label)`` where ``start``/``end`` are the token offsets
        reported by the matcher and ``np_label`` is the string-store ID of ``"NP"``.
    """
    # TODO: the pattern was originally written for Slovak; adapt it to Polish.
    np_label = doc.vocab.strings.add("NP")
    # A Matcher has to share its vocabulary with the documents it is applied to, so build
    # it from doc.vocab. This also avoids loading a spaCy model from disk on every call.
    matcher = Matcher(doc.vocab)
    pattern = [{POS: 'ADJ', "OP": "+"}, {POS: {"IN": ["NOUN", "PROPN"]}, "OP": "+"}]
    matcher.add("Adjective(s), (p)noun", [pattern])

    for _match_id, start, end in matcher(doc):
        yield start, end, np_label

In [None]:
# © https://github.com/explosion/spaCy/discussions/7006
def is_np_root(word, np_deps, conj):
    """Decide whether ``word`` can act as the root of a noun phrase.

    Args:
        word: Token-like object exposing ``dep``, ``head`` and ``i``.
        np_deps: Collection of dependency IDs that mark a noun-phrase root.
        conj: Dependency ID of the ``conj`` relation.

    Returns:
        ``True`` when ``word.dep`` is one of ``np_deps``. For a ``conj`` token the answer
        is taken from the head reached by climbing the chain of ``conj`` heads for as long
        as each next head sits to the left. ``False`` for every other dependency.
    """
    if word.dep in np_deps:
        return True
    if word.dep != conj:
        return False

    # Conjunct: follow the chain of left-pointing conj heads and judge by where it ends.
    ancestor = word.head
    while ancestor.dep == conj and ancestor.head.i < ancestor.i:
        ancestor = ancestor.head
    return ancestor.dep in np_deps

def noun_chunks_pl(doclike):
    """Yield noun-phrase spans found in a dependency-parsed Polish document.

    Every ``NOUN``/``PROPN`` token accepted by ``is_np_root`` becomes the core of a
    chunk. The chunk is then widened to the left and to the right over adjacent
    ``amod``/``nmod`` tokens whose head is either the neighbouring chunk token or the
    core token itself.

    Args:
        doclike: Object exposing ``.doc`` and iterating over tokens (this notebook
            registers the function as the ``noun_chunks`` syntax iterator).

    Yields:
        Tuples ``(start, end, np_label)`` with document-level token offsets (``end``
        exclusive) and the string-store ID of ``"NP"``.

    Raises:
        ValueError: If the document carries no ``DEP`` annotation.
    """
    labels = [
        "ROOT",
        "nsubj",
        "appos",
        "nsubjpass",
        "iobj",
        "obj",
        "obl",
        "obl:arg",
    ]
    mod_labels = [
        "amod",
        "nmod"
    ]
    doc = doclike.doc

    if not doc.has_annotation("DEP"):
        # `Errors` is not imported anywhere in the notebook; import it here so this path
        # raises the intended ValueError instead of a NameError.
        from spacy.errors import Errors

        raise ValueError(Errors.E029)

    np_deps = [doc.vocab.strings.add(label) for label in labels]
    conj = doc.vocab.strings.add("conj")
    mod_deps = [doc.vocab.strings.add(label) for label in mod_labels]
    np_label = doc.vocab.strings.add("NP")
    doc_len = len(doc)
    prev_end = 0
    for word in doclike:
        if word.pos_ not in ("NOUN", "PROPN"): #TODO PRONs are mostly stop words, should I include it?
            continue
        if not is_np_root(word, np_deps, conj):
            continue
        start = word.i
        end = start + 1
        # Widen to the left. `start > prev_end` keeps start - 1 outside the previous chunk
        # and, because prev_end starts at 0, stops doc[start - 1] from wrapping around to
        # the last token when the chunk begins at token 0.
        while (
            start > prev_end
            and doc[start - 1].head in (doc[start], word)
            and doc[start - 1].dep in mod_deps
        ):
            start -= 1
        # Widen to the right; the bound check avoids an IndexError when the chunk reaches
        # the final token of the document.
        while (
            end < doc_len
            and doc[end].head in (doc[end - 1], word)
            and doc[end].dep in mod_deps
        ):
            end += 1
        prev_end = end
        yield start, end, np_label

In [None]:
# Build the Polish pipeline: custom noun-chunk iterator + PyTextRank.

# Import the Polish language module explicitly so `spacy.lang.pl` resolves without
# loading the model a first time just for that side effect.
import spacy.lang.pl

# Replace the noun_chunks iterator; this is done before the model is loaded, in the same
# order the notebook relied on previously.
spacy.lang.pl.PolishDefaults.syntax_iterators = {"noun_chunks": noun_chunks_pl}

nlp_pl = spacy.load('pl_core_news_sm')  # or pl_core_news_lg

# Alternatives that were tried and left disabled:
# spacy_udpipe.download("pl") # download model
# nlp_pl = spacy_udpipe.load("pl")
# for doc in nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):

# Optional stop-word configuration for textrank: config={ "stopwords": { "strona": ["NOUN"] } }
nlp_pl.add_pipe("textrank")
nlp_pl.pipe_names


# https://derwen.ai/docs/ptr/sample/#scrubber

In [None]:
# Run the Polish pipeline on the sample text. NOTE: this rebinds the notebook-level `doc`.
doc = nlp_pl(text_pl)

# Print the noun chunks; PolishDefaults was pointed at noun_chunks_pl before nlp_pl was loaded.
for chunk in doc.noun_chunks:
    print(chunk)

# for word in doc:
#     print(word, type(word.pos_))

# Display the phrases produced by the textrank component.
doc._.phrases 

In [None]:
# Look the textrank component up by name through the pipeline's accessor (no need to
# rebuild a dict from the whole pipeline), then list its attributes.
textrank_object = nlp_pl.get_pipe("textrank")
dir(textrank_object)

# textrank_object.token_lookback

In [None]:
# Preview the text, lemma, tag, POS and sentiment attributes of the first token only.
for token in doc[:1]:
    print(token.text, token.lemma_, token.tag_, token.pos_, token.sentiment)

## Summa
https://pypi.org/project/summa/

In [None]:
if prepare == True:
    !pip install summa

from summa import keywords

### English summa – comparison with pytextrank

In [None]:
# Top-5 keywords for the raw English text according to summa, returned with their scores.
summa_result_en = keywords.keywords(text_en, words=5, split=True, scores=True)

# Reduce the pytextrank phrases to (text, rank) pairs so both results can be compared.
pytextrank_result_en_list = [(phrase.text, phrase.rank) for phrase in pytextrank_result_en] 

In [None]:
# Display summa's keywords for the English text.
summa_result_en

In [None]:
# Display the first ten (text, rank) pairs from pytextrank for comparison.
pytextrank_result_en_list[:10]

In [None]:
# Lemmatise the English text with a freshly loaded spaCy model and let summa rank the
# lemmas instead of the surface forms.
# NOTE: this rebinds the notebook-level names `nlp` and `doc`.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text_en)

lemmatized_text_en = " ".join(token.lemma_ for token in doc)
summa_result_en = keywords.keywords(
    lemmatized_text_en,
    words=15,
    split=True,
    scores=True,
)
summa_result_en

### Polski – summa

In [None]:
# Lemmatise the Polish text with a freshly loaded spaCy model (small one; "lg" also works).
# NOTE: this rebinds the notebook-level names `nlp` and `doc`.
nlp = spacy.load("pl_core_news_sm")
doc = nlp(text_pl)

lemmatized_text_pl = " ".join(token.lemma_ for token in doc)
# Show the original and the lemmatised text one after the other, separated by a blank line.
print(text_pl, lemmatized_text_pl, sep="\n\n")


In [None]:
# Top-10 summa keywords for the lemmatised Polish text. split=True makes summa return the
# list itself (as in the English cell), so an empty result is [] rather than [''].
keywords.keywords(lemmatized_text_pl, language="polish", words=10, split=True)

In [None]:
# Top-10 summa keywords for the raw (non-lemmatised) Polish text. split=True makes summa
# return the list itself (as in the English cell), so an empty result is [] rather than [''].
keywords.keywords(text_pl, language="polish", words=10, split=True)

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import load_dataset

In [None]:
# Load the "hotels_text" configuration of the dataset stored under data/polemo2-official/
# (path is relative to the notebook's working directory).
polemo_official = load_dataset("data/polemo2-official/", "hotels_text")
# Turn the "train" split into a DataFrame; the cells below rely on its "text" and "target" columns.
df_polemo_official = pd.DataFrame(polemo_official["train"])
# Fit a TF-IDF vectorizer with default settings on the texts and keep the transformed matrix.
vectorizer = TfidfVectorizer()
tfidf_vector = vectorizer.fit_transform(df_polemo_official['text'].values)

In [None]:
# Pair every vocabulary term with its fitted weight and order the table by that weight,
# highest first.
# NOTE: the values come from vectorizer.idf_, i.e. they are per-term IDF weights even
# though the column is called "tf-idf".
tf_idf_results = pd.DataFrame(
    zip(vectorizer.get_feature_names_out(), vectorizer.idf_),
    columns=["word", "tf-idf"],
).sort_values("tf-idf", ascending=False)

In [None]:
# Display the TF-IDF matrix in dense form.
# NOTE(review): todense() materialises the whole matrix in memory; consider showing
# tfidf_vector.shape or a small slice instead.
tfidf_vector.todense()

In [None]:
# Group the reviews by their "target" value and display the texts of the group whose
# target equals 1.
# NOTE(review): the meaning of target == 1 is not visible here, and this displays every
# text in the group — confirm and consider showing only a few.
grouped = df_polemo_official.groupby("target")
grouped.get_group(1)["text"].values

In [None]:
# Join the words from the first rows of tf_idf_results (head() defaults to five rows),
# i.e. the terms with the highest values in the "tf-idf" column, one per line.
"\n".join(tf_idf_results.head()["word"])