In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Spracovanie textu v Pythone (pokračovanie)
- [NLTK](http://www.nltk.org/book/)
- gensim
- word2vec
- scikit-learn

In [None]:
import nltk
from nltk.corpus import inaugural

In [None]:
inaugural.fileids()

In [None]:
inaugural.words('1789-Washington.txt')

In [None]:
len(inaugural.words('1789-Washington.txt'))

In [None]:
inaugural.sents('1789-Washington.txt')

In [None]:
# Build one record per inaugural address. File ids have the form
# '1789-Washington.txt': four-digit year, a separator, the name, then the extension.
texts = []
for fileid in inaugural.fileids():
    year = fileid[:4]
    name = fileid[5:].partition('.')[0]
    # Re-join the corpus tokens with single spaces to get one plain string per speech.
    text = ' '.join(inaugural.words(fileid))
    texts.append(dict(name=name, year=year, text=text))

In [None]:
texts[0]

## Tokenizácia

In [None]:
text = texts[0]['text']

In [None]:
sentences = nltk.sent_tokenize(text)

In [None]:
sentences[:5]

In [None]:
sent = sentences[0]

In [None]:
tokens = nltk.word_tokenize(sent)

In [None]:
tokens

## Stemming

Stemming vráti korene slov. Napr. *ryba -> ryb*

In [None]:
porter = nltk.PorterStemmer()

In [None]:
[porter.stem(token) for token in tokens]

## Lematizácia

Lematizácia prevádza slová na ich základný slovníkový tvar. Napr. *rybe -> ryba*

In [None]:
wnl = nltk.WordNetLemmatizer()

In [None]:
[wnl.lemmatize(token) for token in tokens]

## Part-of-Speech Tagging (POS)

In [None]:
# Attach a part-of-speech tag to every token in `tokens`; the bare name on the
# last line displays the result as the cell output.
tagged = nltk.pos_tag(tokens)
tagged

In [None]:
nltk.help.upenn_tagset('IN')

In [None]:
nltk.help.upenn_tagset('NNP')

## Named entities (menné entity)

In [None]:
entities = nltk.chunk.ne_chunk(tagged)

In [None]:
# Show the chunking result through the built-in repr() rather than calling the dunder directly.
print(repr(entities))

## N-gramy

In [None]:
tokens = nltk.word_tokenize(text)

In [None]:
# Collect the bigrams of `tokens` into a list so they can be sliced here and
# counted in the next cell; show the first five.
bigrams = list(nltk.bigrams(tokens))
bigrams[:5]

In [None]:
nltk.FreqDist(bigrams).most_common(10)

In [None]:
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
stopwords[:10]

In [None]:
# Keep lower-cased alphabetic tokens that are not English stop words.
# A set gives constant-time membership tests; `stopwords` itself stays a list
# because another cell slices it for display.
stopword_set = set(stopwords)
tokens_cleared = [
    lowered
    for lowered in (token.lower() for token in tokens if token.isalpha())
    if lowered not in stopword_set
]

In [None]:
tokens_cleared[:10]

In [None]:
nltk.FreqDist(nltk.bigrams(tokens_cleared)).most_common(10)

In [None]:
nltk.FreqDist(nltk.trigrams(tokens_cleared)).most_common(10)

## WordNet

* Lexikálna databáza
* Obsahuje synsety: podstatné mená, slovesá, prídavné mená, príslovky
* Prepojenia medzi synsetmi: antonymá, hyperonymá, hyponymá, holonymá, meronymá

In [None]:
from nltk.corpus import wordnet as wn

In [None]:
wn.synsets('car')

In [None]:
car = wn.synset('car.n.01')

In [None]:
car.lemma_names()

In [None]:
car.definition()

In [None]:
car.examples()

In [None]:
car.hyponyms()

In [None]:
car.hypernyms()

In [None]:
car.part_meronyms()

In [None]:
wn.synsets('black')[0].lemmas()[0].antonyms()

# Reprezentácia textu

Textový dokument väčšinou reprezentujeme pomocou multimnožiny slov (angl. *bag-of-words*), t. j. vektorom. Zložky vektora predstavujú jednotlivé slová, resp. n-gramy zo slovníka (pre celý korpus/jazyk). Hodnotou zložiek vektora môže byť:

* početnosť
* frekvencia
* váhovaná frekvencia

Slová s vysokou frekvenciou výskytu v jazyku (spojky a pod.) sa označujú ako tzv. *stop slová* a zvyknú sa pri predspracovaní odstraňovať.

## TF-IDF
* Term frequency * inverse document frequency
* `TF` – frekvencia slova v aktuálnom dokumente
* `IDF` – záporný logaritmus pravdepodobnosti, že sa slovo vyskytuje v dokumente korpusu (hodnota je pre dané slovo rovnaká vo všetkých dokumentoch)
* Rôzne varianty (váhovacie schémy): https://en.wikipedia.org/wiki/Tf%E2%80%93idf

## Gensim
- Knižnica na modelovanie tém v dokumentoch.
- Implementuje TF-IDF, LSA, pLSA, LDA, HDP, DTM, word2vec
- https://radimrehurek.com/gensim/tutorial.html

In [None]:
from gensim import corpora, models, similarities

In [None]:
def preprocess_text(text):
    """Tokenize ``text`` and return its lower-cased, alphabetic, non-stop-word tokens.

    Args:
        text: String to tokenize with ``nltk.word_tokenize``.

    Returns:
        List of lower-cased tokens in their original order. A token is kept only
        if ``str.isalpha()`` is true for it and its lower-cased form is not in
        NLTK's English stop-word list.
    """
    tokens = nltk.word_tokenize(text)
    # Build a set once per call: testing membership against the raw list is
    # linear in the list length for every single token.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # Lower-case each surviving token exactly once.
    lowered = (token.lower() for token in tokens if token.isalpha())
    return [token for token in lowered if token not in stopwords]

In [None]:
tokenized_docs = [preprocess_text(text['text']) for text in texts]

In [None]:
tokenized_docs[4][:10]

Odstránenie slov, ktoré sa v korpuse vyskytujú len raz

In [None]:
from collections import defaultdict

# Count how often each token occurs across the whole corpus.
# The loop variables are deliberately not called `text`: that name holds the
# raw speech string used by the tokenization cells, and overwriting it with a
# token list would break those cells when they are re-run.
frequency = defaultdict(int)
for doc in tokenized_docs:
    for word in doc:
        frequency[word] += 1

# Keep only tokens that appear more than once in the corpus.
tokenized_docs = [[word for word in doc if frequency[word] > 1] for doc in tokenized_docs]

In [None]:
dictionary = corpora.Dictionary(tokenized_docs)

In [None]:
# Uncomment to inspect the full token -> id mapping (the output can be long):
# print(dictionary.token2id)

In [None]:
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

Trénovanie TF-IDF modelu

In [None]:
tfidf_model = models.TfidfModel(corpus)

In [None]:
tfidf_corpus = tfidf_model[corpus]

In [None]:
tfidf_corpus[0][:10]

Ďalšie modely: LSI, LDA, ...

Môžeme vypočítať podobnosť výsledných vektorov:

In [None]:
index = similarities.MatrixSimilarity(tfidf_corpus)

In [None]:
index[tfidf_corpus[0]]

## word2vec

Každé slovo má naučený vektor reálnych čísel, ktoré reprezentujú rôzne jeho vlastnosti a zachytávajú viaceré lingvistické pravidelnosti. Môžeme počítať podobnosť medzi slovami ako podobnosť dvoch vektorov.

vector('Paris') - vector('France') + vector('Italy') ~= vector('Rome')

vector('king') - vector('man') + vector('woman') ~= vector('queen')

https://radimrehurek.com/gensim/models/word2vec.html

https://medium.com/@mishra.thedeepak/word2vec-in-minutes-gensim-nlp-python-6940f4e00980

In [None]:
from nltk.corpus import brown

In [None]:
# Use a dedicated name so the `sentences` list produced in the tokenization
# section (sentence strings of one speech) is not replaced by Brown corpus data.
brown_sentences = brown.sents()
# A fixed seed plus a single worker thread makes training repeatable between runs.
# NOTE: gensim documents that PYTHONHASHSEED must also be fixed for runs to match
# exactly across interpreter launches.
model = models.Word2Vec(brown_sentences, min_count=1, seed=42, workers=1)

In [None]:
model.save('brown_model')

In [None]:
model = models.Word2Vec.load('brown_model')

In [None]:
print(model.wv.most_similar("mother"))

In [None]:
print(model.wv.doesnt_match("breakfast cereal dinner lunch".split()))

In [None]:
print(model.wv.doesnt_match("pizza pasta garden fries".split()))

In [None]:
model.wv['human']

## Extrakcia čŕt pomocou scikit-learn

http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
docs = [text['text'] for text in texts]

In [None]:
# Count term occurrences per document, dropping scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')
# Learn the vocabulary from `docs` and build the document-term count matrix in one step.
tf = vectorizer.fit_transform(docs)

In [None]:
# Slice first, then densify: only the first 100 columns of row 0 are converted,
# instead of materialising the whole document-term matrix just to show a preview.
tf[0, :100].toarray().ravel()

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. tolist() keeps the plain-list output.
vectorizer.get_feature_names_out()[:100].tolist()

In [None]:
# Vectorize `docs` again, this time with TF-IDF weights instead of raw counts.
# NOTE(review): `transformer` holds a TfidfVectorizer, not a TfidfTransformer — the name may mislead.
transformer = TfidfVectorizer(stop_words='english')
tfidf = transformer.fit_transform(docs)

In [None]:
# Slice first, then densify: only the first 100 columns of row 0 are converted,
# instead of materialising the whole TF-IDF matrix just to show a preview.
tfidf[0, :100].toarray().ravel()