In [1]:
import spacy
import nltk
nlp = spacy.load('en_core_web_md')
nltk.download('stopwords', quiet=True)

True

# Automatyczna sumaryzacja

In [3]:
from nltk import word_tokenize, sent_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
from string import punctuation

stopwords = set(stopwords.words('english') + list(punctuation)) # stworzenie listy ignorowanych tokenów

def compute_frequencies(word_sent): # funkcja generująca słownik mapowane słowo -> liczność  wystąpień w dokumencie
    dict = {}
    for i in word_sent:
        for j in i:
            if j not in stopwords:
                if j in dict:
                    dict[j] += 1
                else:
                    dict[j] = 1
    return dict

def create_sentence_ranking(tokenized_sentences, freq):
    counter = 0
    dict = {}
    for i in tokenized_sentences:
        result = 0
        for j in i:
            if j not in stopwords:
                result += freq[j]
        dict[counter] = result
        counter += 1        
    return dict

def summarize(text, in_how_many_sentences):
    text_lowercased = text.lower() # zamiana na małe litery
    sents = sent_tokenize(text) # podział  na zdania
    sentences_with_words_tokenized = [] # podział zdania na słowa tworząc listę list
    for i in sents:
        sentences_with_words_tokenized.append(word_tokenize(i))
        
    freq = compute_frequencies(sentences_with_words_tokenized)

    ranking = create_sentence_ranking(sentences_with_words_tokenized, freq) # stworzenie rankingu zdań
    sents_idx = get_top_n(ranking, in_how_many_sentences) # wybór liczby najistotniejszych zdań
    return [sents[i] for i in sents_idx] # zamiana indeksu na tekst

def get_top_n(ranking, n):
    return sorted(range(len(ranking)), key=lambda i: ranking[i])[-n:]
    
text = '''
Washington (CNN) As preparations are underway for a US-North Korea summit, US officials are trying to solve the logistical issue of who will pay for North Korean leader Kim Jong Un's housing, according to a new report.

A week after abruptly scrapping the summit with Kim, President Donald Trump announced Friday that the historic talks were back on for June 12 in Singapore.
With its economy weakened from tough sanctions, Pyongyang is requiring that another country pay for Kim and his delegation's hotel bill, The Washington Post reported Friday.
According to the Post, Kim is demanding to stay at the luxury, five-star Fullerton hotel, where a presidential suite costs more than $6,000 a night.
America should be more at ease than this
America should be more at ease than this
White House and State Department officials declined to comment to the Post on the advance team planning details.
Citing two people familiar with the talks, the Post reported that the US is open to covering the costs, but is considering asking the host country, Singapore, to foot the bill.
The International Campaign to Abolish Nuclear Weapons also offered to pay for Kim's lodging with the cash received as part of its Nobel Peace Prize ($1.1 million) it won last year "in order to support peace in the Korean Peninsula and a nuclear-weapon-free world."
"Our movement is committed to the abolition of nuclear weapons and we recognize that this historic summit is a once in a generation opportunity to work for peace and nuclear disarmament," ICAN International Steering Group member Akira Kawasaki said in a statement.
The Post is also reporting that the US is expected to request a waiver of sanctions from the United Nations and US Treasury Department for expenses associated with North Korea's travel.
Trump is expected to stay at another five-star hotel, the Shangri-La, which has hosted high security events before, according to the Post.
Determining who will pay Kim's hotel bill is one of many logistical issues still being hammered out ahead of the summit, including the aircraft Kim will use to fly to Singapore and the venue where Trump and Kim will meet, the Post reported.
The relatively secluded Capella hotel on the island of Sentosa is being considered for the site of the summit, people familiar with the talks told the Post.
'''

for s in summarize(text, 2): # wybór 2 najlepszych zdań
    print('*', s)

* With its economy weakened from tough sanctions, Pyongyang is requiring that another country pay for Kim and his delegation's hotel bill, The Washington Post reported Friday.
* Determining who will pay Kim's hotel bill is one of many logistical issues still being hammered out ahead of the summit, including the aircraft Kim will use to fly to Singapore and the venue where Trump and Kim will meet, the Post reported.


# Ekstrakcja relacji

In [5]:
from nltk import Tree

doc = nlp("The quick brown fox jumps over the lazy dog.")

def to_nltk_tree(node): # stworzenie drzwa
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.text, [to_nltk_tree(child) for child in node.children])
    else:
        return node.text

for sent in doc.sents:
    print(sent)
    print("-----------------------------------")
    to_nltk_tree(sent.root).pretty_print() # stworzenie i wyświetlenie drzewa
    print("\n\n\n")

The quick brown fox jumps over the lazy dog.
-----------------------------------
        jumps                    
  ________|______________         
 |        |             over     
 |        |              |        
 |       fox            dog      
 |    ____|_____      ___|____    
 .  The quick brown the      lazy







## Prosta ekstrakcja relacji z wykorzystaniem drzewa zależnościowego

In [7]:
from nltk import Tree

doc = nlp("The quick brown fox jumps over the lazy dog.")

# wyświetlenie CONLL
i = 0
for sent in doc.sents:
    for word in sent:
        print(i, word, word.dep_, word.head, list(word.children));
        i += 1
    print()

0 The det fox []
1 quick amod fox []
2 brown amod fox []
3 fox nsubj jumps [The, quick, brown]
4 jumps ROOT jumps [fox, over, .]
5 over prep jumps [dog]
6 the det dog []
7 lazy amod dog []
8 dog pobj over [the, lazy]
9 . punct jumps []



## Ekstrakcja relacji 

In [9]:
from nltk import Tree

doc = nlp("The quick brown fox jumps over the lazy dog.")

def parse(sent): #funkcja ekstrahująca najważniejszą relację ze zdania wraz z jej argumentami (podmiotem i dopełnieniem) na podstawie drzewa zależnościowego
    for i in sent:
        if i.dep_ == "ROOT":
            predicate = i.text
    for i in sent:
        if i.dep_ == "nsubj" and i.head.text == predicate:
            subj = i.text
    obj = None
    for i in sent:
        if i.dep_ == "dobj" and i.head.text == predicate:
            obj = i.text
    if obj == None:
        for i in sent:
            if i.dep_ == "prep" and i.head.text == predicate:
                predicate2 = i.text
        for i in sent:
            if i.dep_ == "pobj" and i.head.text == predicate2:
                obj = i.text
        predicate += " " + predicate2
    print("{pred}({subj}, {obj})".format(pred=predicate, subj=subj, obj=obj))
    
for sent in doc.sents:
    parse(sent)

jumps over(fox, dog)
