# Packages

In [None]:
%pip install nltk spacy huspacy

# Imports

In [None]:
import pprint
import re

import nltk

# Fetch every NLTK resource the cells below rely on up front, so the notebook
# runs on a fresh environment instead of failing later with LookupError.
nltk.download('punkt')       # models behind word_tokenize / sent_tokenize
nltk.download('punkt_tab')   # tokenizer tables used by newer NLTK releases in place of 'punkt'
nltk.download('stopwords')   # needed by stopwords.words("hungarian")
nltk.download('wordnet')     # needed by WordNetLemmatizer

import huspacy
import spacy

# Download the Hungarian model, then load the pipeline shared by all cells as `nlp`.
huspacy.download()
nlp = spacy.load('hu_core_news_lg')

# Text Processing - 1

### Loading dataset

In [None]:
# Toy corpus of Hungarian words; "toll" appears twice.
data = [
    "lép",
    "ablak",
    "eszik",
    "iszik",
    "tábla",
    "toll",
    "toll",
]
pprint.pprint(data)

### Word Frequency

In [None]:
# Tally how many times each word occurs in `data`.
word_freq = {}
for word in data:
    # A missing key counts as zero, so first and repeated sightings share one path.
    word_freq[word] = word_freq.get(word, 0) + 1

pprint.pprint(word_freq)

### Common words

In [None]:
# Collect every word seen more than once, in the order the words were first counted.
common_words = [word for word, count in word_freq.items() if count > 1]

pprint.pprint(common_words)

### Word tokenization - NLTK

In [148]:
from nltk.tokenize import word_tokenize

# Split a short mixed English/Hungarian sample into word and punctuation tokens.
text = "Hello, világ! Hogy vagy?"
words = word_tokenize(text)
pprint.pprint(words)

['Hello', ',', 'világ', '!', 'Hogy', 'vagy', '?']


#### Word tokenization - spaCy

In [152]:
import spacy

# Tokenize the same sample with the spaCy pipeline bound to `nlp` in the setup cell.
text = "Hello, világ! Hogy vagy?"
doc = nlp(text)
# Keep only the surface string of each token for printing.
tokens = [token.text for token in doc]
pprint.pprint(tokens)

['Hello', ',', 'világ', '!', 'Hogy', 'vagy', '?']


### Sentence tokenization - NLTK

In [150]:
from nltk.tokenize import sent_tokenize

# Split a three-sentence sample into individual sentences with NLTK.
text = "Hello, világ! Hogy vagy? Én jól vagyok."
sents = sent_tokenize(text)
pprint.pprint(sents)

['Hello, világ!', 'Hogy vagy?', 'Én jól vagyok.']


#### Sentence tokenization - spaCy

In [158]:
# Define the sample first and only then parse it, so the pipeline sees exactly
# this text rather than whatever `text` held from a previously executed cell.
text = "Hello, világ! Hogy vagy? Én jól vagyok."
doc = nlp(text)

# Materialize the sentences found by the pipeline so they can be printed.
sents = list(doc.sents)

pprint.pprint(sents)

[Hello, világ!, Hogy vagy?, Én jól vagyok.]


### Normalization methods

#### Lowercasing

In [None]:
# Print a lowercased copy of the current sample; `text` itself is not modified.
pprint.pprint(text.lower())

#### Removing punctuation

In [None]:
import string

def remove_punctuation(text):
    """Return *text* with every character in ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    all other characters (including accented letters) pass through unchanged.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# Show the current sample with its punctuation stripped.
pprint.pprint(remove_punctuation(text))

#### Removing stopwords - NLTK

In [160]:
from nltk.corpus import stopwords

# NLTK's Hungarian stopword list, stored as a set for fast membership tests.
stop_words = set(stopwords.words("hungarian"))
# `text` is the sample left behind by the preceding cells.
words = word_tokenize(text)
# Lowercase each token before the lookup so capitalization does not affect matching.
filtered_words = [word for word in words if word.lower() not in stop_words]
pprint.pprint(filtered_words)

['Hello', ',', 'világ', '!', '?', '.']


#### Removing stopwords - spaCy

In [163]:
# Same sample, this time filtered with spaCy's per-token stopword flag.
text = "Hello, világ! Hogy vagy? Én jól vagyok."
doc = nlp(text)

# Keep the surface form of every token that is not flagged as a stopword.
filtered_words = [token.text for token in doc if not token.is_stop]
pprint.pprint(filtered_words)

['Hello', ',', 'világ', '!', '?', '.']


#### Stemming - Lemmatization - NLTK

In [165]:
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
# Snowball stemmer configured for Hungarian; a later cell reuses `stemmer`.
stemmer = SnowballStemmer("hungarian")
# NOTE(review): WordNet is an English lexical resource, so this lemmatizer
# presumably leaves Hungarian tokens as they are — confirm before relying on it.
lemmatizer = WordNetLemmatizer()

# Tokenize the current sample, then stem and lemmatize each token independently.
words = word_tokenize(text)
stemmed_words = [stemmer.stem(word) for word in words]
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

pprint.pprint(stemmed_words)
pprint.pprint(lemmatized_words)

['hell', ',', 'világ', '!', 'hogy', 'vagy', '?', 'én', 'jól', 'vagy', '.']
['Hello', ',', 'világ', '!', 'Hogy', 'vagy', '?', 'Én', 'jól', 'vagyok', '.']


#### Stemming - Lemmatization - spaCy

In [164]:
# Parse the current sample with the spaCy pipeline.
doc = nlp(text)

# Stemming still goes through the NLTK `stemmer` created in the previous cell,
# applied to spaCy's tokens; lemmas come from the pipeline itself.
stemmed_words = [stemmer.stem(token.text) for token in doc]
lemmatized_words = [token.lemma_ for token in doc]

pprint.pprint(stemmed_words)
pprint.pprint(lemmatized_words)

['hell', ',', 'világ', '!', 'hogy', 'vagy', '?', 'én', 'jól', 'vagy', '.']
['Hello', ',', 'világ', '!', 'hogy', 'vagy', '?', 'én', 'jól', 'van', '.']


#### Removing numbers

In [None]:
# Sample with a spelled-out number and a digit.
another_text = "Nyolc 8"
# Delete every run of digits; the spelled-out word and the space before the digit remain.
pprint.pprint(re.sub(r"\d+", "", another_text))

#### Removing special characters

In [None]:
# Drop everything except ASCII letters, digits, Hungarian accented letters and
# whitespace. Whitespace is kept so that neighbouring words do not run together.
# Plain e/i/E/I are already covered by the a-z / A-Z ranges.
pprint.pprint(re.sub(r"[^a-zA-Z0-9áéíóöőúüűÁÉÍÓÖŐÚÜŰ\s]", "", text))

#### Removing URLs

In [None]:
# Delete everything from "http" up to the next whitespace; the text after the URL is kept.
pprint.pprint(re.sub(r"http\S+", "", "https://www.google.com/search?q=python programming"))

#### Removing HTML tags

In [None]:
# The non-greedy match removes each tag on its own instead of everything
# between the first "<" and the last ">".
pprint.pprint(re.sub(r"<.*?>", "", "<p>hello</p>"))

#### Removing whitespace (leading and trailing whitespace removal)

In [None]:
# Sample padded with leading, trailing and repeated inner spaces.
another_text = "    Ez egy     szöveg.    "
# strip() trims both ends only; the inner run of spaces is left as is.
pprint.pprint(another_text.strip())
# Collapsing each whitespace run to one space also leaves a single space at each end.
pprint.pprint(re.sub(r"\s+", " ", another_text))

# Text Processing - 2

#### Bag of Words

In [None]:
# Bag of Words: describe each document by how often each token occurs in it.
import nltk

# Two small English documents used as the corpus.
text_data = [
    "John likes to watch movies. Mary likes movies too.",
    "John also likes to watch football games.",
]

# Every distinct lowercase token seen anywhere in the corpus.
vocab = set()

# One token -> count dictionary per document, in corpus order.
bow_model = []

for text in text_data:
    # Tokenize and lowercase in one pass so "John" and "john" share an entry.
    tokens = [token.lower() for token in nltk.word_tokenize(text)]
    vocab.update(tokens)

    # Per-document frequency table; a missing key counts as zero.
    word_freq = {}
    for token in tokens:
        word_freq[token] = word_freq.get(token, 0) + 1
    bow_model.append(word_freq)

# Report the vocabulary, then each document's frequency table.
print(f"Vocabulary: {vocab}")

for i, item in enumerate(bow_model):
    print(f"Text {i + 1} : {item}")

#### N-grams
- Speech Recognition
- Machine Translation
- Predictive Text Input
- Named Entity Recognition (NER)
- Search Engine Algorithms

In [None]:
from nltk.util import ngrams

# `words` is the token list left behind by an earlier tokenization cell.
pprint.pprint("N-grams")
# 4-grams: every window of four consecutive tokens.
n_grams = ngrams(words, 4)
for grams in n_grams:
    pprint.pprint(grams)

# Bigrams: windows of two tokens.
bigrams = ngrams(words, 2)
for grams in bigrams:
    pprint.pprint(grams)

# Trigrams: windows of three tokens.
trigrams = ngrams(words, 3)
for grams in trigrams:
    pprint.pprint(grams)

#### N-gram predict next word

In [172]:
import spacy
from collections import defaultdict, Counter
from itertools import islice
from nltk.util import ngrams

# Lowercase the headline and run it through the Hungarian pipeline.
text = "CSAK A KÜZDELEM: HOLLANDIA ÉS FRANCIAORSZÁG ÖSSZEHOZTA AZ IDEI LABDARÚGÓ EB ELSŐ GÓL NÉLKÜLI EREDMÉNYÉT".lower()
doc = nlp(text)

# Keep only alphabetic tokens that are not stopwords.
tokens = [token.text for token in doc if token.is_alpha and not token.is_stop]

# Adjacent token pairs over the filtered sequence.
bigrams = ngrams(tokens, 2)

# Count bigrams and unigrams.
bigram_counts = Counter(bigrams)
unigram_counts = Counter(tokens)

# Calculate bigram probabilities: P(w2|w1) = count(w1, w2) / count(w1).
# Nested defaultdicts let bigram_prob[w1][w2] be assigned without pre-creating rows.
bigram_prob = defaultdict(lambda: defaultdict(float))
for (w1, w2), count in bigram_counts.items():
    bigram_prob[w1][w2] = count / unigram_counts[w1]

def predict_next_word(word, bigram_prob):
    """Return the most probable successor of *word*, or None if there is none.

    *bigram_prob* maps a word to a mapping of ``next_word -> probability``.
    None is returned both when *word* has no entry and when its entry is
    empty. Ties go to the successor that was inserted first.
    """
    # Membership test first: indexing a defaultdict would create the key.
    if word not in bigram_prob:
        return None
    successors = bigram_prob[word]
    return max(successors, key=successors.get, default=None)

# Predict the word following a known token and report its conditional probability.
previous_word = 'franciaország'
predicted_word = predict_next_word(previous_word, bigram_prob)
# Fall back to 0 when no successor was found, without indexing bigram_prob.
probability = bigram_prob[previous_word][predicted_word] if predicted_word else 0
print(f'The predicted next word after "{previous_word}" is "{predicted_word}", Probability: {probability:.2f}')

The predicted next word after "franciaország" is "összehozta", Probability: 1.00


#### N-grams predict next sentence

In [171]:
import spacy
from collections import defaultdict, Counter
from itertools import islice
from nltk.util import ngrams

# Sample text data: four short Hungarian sentences, one per line.
text = """
Az ég kék.
Ma szombat van.
Az időjárás szép.
Esik az eső.
"""
# Lowercase the text, let the pipeline split it into sentences, and strip
# surrounding whitespace from each sentence string.
doc = nlp(text.lower())
sentences = [sent.text.strip() for sent in doc.sents]

# Pairs of consecutive sentences.
sentence_bigrams = ngrams(sentences, 2)

# Count sentence bigrams and unigrams.
bigram_counts = Counter(sentence_bigrams)
unigram_counts = Counter(sentences)

# Calculate bigram probabilities: P(s2|s1) = count(s1, s2) / count(s1).
bigram_prob = defaultdict(lambda: defaultdict(float))
for (s1, s2), count in bigram_counts.items():
    bigram_prob[s1][s2] = count / unigram_counts[s1]

def predict_next_sentence(sentence, bigram_prob):
    """Return the most probable sentence to follow *sentence*, or None.

    *bigram_prob* maps a sentence to a mapping of
    ``next_sentence -> probability``. None is returned when *sentence* has no
    entry or its entry is empty. Ties go to the candidate inserted first.
    """
    # Membership test first: indexing a defaultdict would create the key.
    if sentence not in bigram_prob:
        return None
    candidates = bigram_prob[sentence]
    return max(candidates, key=candidates.get, default=None)

# Predict the sentence following a known (lowercased) sentence and report its probability.
previous_sentence = 'ma szombat van.'
predicted_sentence = predict_next_sentence(previous_sentence, bigram_prob)
# Fall back to 0 when no successor was found, without indexing bigram_prob.
probability = bigram_prob[previous_sentence][predicted_sentence] if predicted_sentence else 0
print(f'The predicted next sentence after "{previous_sentence}" is "{predicted_sentence}", Probability: {probability}')

The predicted next sentence after "ma szombat van." is "az időjárás szép.", Probability: 1.0


#### Part of Speech Tagging (POS)
|**Part of Speech**|**Tag**|
|----|-|
|Noun|n|
|Verb|v|
|Adjective|a|
|Adverb|r|

In [168]:
# Hungarian sample sentence for part-of-speech tagging.
text = "Amikor a nyáj elbóklászik, a pásztornak meg kell ölnie a szakadár vezér ürüt"

doc = nlp(text)
# Print each token with both tag attributes exposed by the pipeline (pos_ and tag_).
for token in doc:
    print(token.text, token.pos_, token.tag_)

Amikor ADV ADV
a DET DET
nyáj NOUN NOUN
elbóklászik VERB VERB
, PUNCT PUNCT
a DET DET
pásztornak NOUN NOUN
meg PART PART
kell VERB VERB
ölnie VERB VERB
a DET DET
szakadár NOUN ADJ
vezér NOUN NOUN
ürüt NOUN NOUN


#### Named Entity Recognition (NER)
- Process to locate and classify named entities in text into predefined categories such as names of persons, organizations, locations, expressions of times, quantities, etc.

In [112]:
# Download two NLTK packages: the maxent named-entity chunker and the word list.
# NOTE(review): the NER cell below uses spaCy only, so these downloads look unused — confirm.
nltk.download('maxent_ne_chunker')
nltk.download("words")

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/0xbalazstoth/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/0xbalazstoth/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [140]:
import spacy
# Loads the same model name as the setup cell, rebinding `nlp`.
nlp = spacy.load("hu_core_news_lg")

# Hungarian news excerpt that mentions several people by name.
text = "Lance Stroll ezúttal sem kimagasló teljesítményével hívta fel magára figyelmet, hanem azzal, hogy egyenesen ráhúzta a kormányt a pálya külső ívén érkező Hamiltonra. A két autó kissé össze is ért, az esetet az edzést követően megvizsgálják a sportfelügyelők. Hasonló manőver játszódott le egyébként Leclerc és Norris között is a tréning legvégén."

doc = nlp(text)

# List every entity span found by the pipeline together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

# Render the same entities inline in the notebook.
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

Lance Stroll PER
Hamiltonra LOC
Leclerc PER
Norris PER


#### TF-IDF (Term Frequency-Inverse Document Frequency)
- A TF-IDF (Term Frequency-Inverse Document Frequency) egy statisztikai mérőszám, amely a dokumentumok szövegeiben szereplő szavak fontosságát méri.
- Két fő komponense van: TF (Term Frequency) és IDF (Inverse Document Frequency).

- TF (Term Frequency)
A TF egy adott szó gyakoriságát méri egy adott dokumentumban. Minél többször fordul elő egy szó egy dokumentumban, annál magasabb lesz a TF értéke.
    - TF(t, d) = (Number of times term t appears in document d) / (Total number of terms in document d)
- IDF (Inverse Document Frequency)
Az IDF egy szó fontosságát méri az egész dokumentumkorpuszban. Az IDF célja, hogy csökkentse azoknak a szavaknak a súlyát, amelyek gyakran előfordulnak a dokumentumkorpuszban (például "az", "és", "van"), mivel ezek a szavak kevés információval bírnak a dokumentumok tartalmát illetően.
    - IDF(t, D) = log(Total number of documents D / Number of documents with term t)
- TF-IDF
A TF-IDF a TF és az IDF szorzata. Ezzel a szorzattal a TF-IDF érték magas lesz, ha egy szó gyakran fordul elő egy adott dokumentumban, de ritkán az egész dokumentumkorpuszban. Ezáltal a TF-IDF képes kiemelni a dokumentumokra jellemző szavakat, miközben figyelmen kívül hagyja a gyakori, általános szavakat.
    - TF-IDF(t, d, D) = TF(t, d) * IDF(t, D)

- Miért hasznos?
  - Szövegbányászat, információkeresés
  - Dokumentum klaszterezés, osztályozás
  - Szöveganalitika

In [173]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Four short Hungarian documents with overlapping vocabulary, used as the TF-IDF corpus.
documents = [
    "Az ég kék.",
    "A nap ragyog.",
    "A nap az égen ragyog.",
    "Láthatjuk a ragyogó napot, a fényes napot."
]

def preprocess_text(text):
    """Return the lemmas of *text* joined by single spaces.

    The text is parsed with the module-level ``nlp`` pipeline; only tokens
    that are alphabetic and not flagged as stopwords contribute a lemma.
    """
    return ' '.join(
        token.lemma_
        for token in nlp(text)
        if token.is_alpha and not token.is_stop
    )

# Preprocess the documents: one space-joined lemma string per document.
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Initialize TF-IDF Vectorizer with its default settings.
vectorizer = TfidfVectorizer()

# Fit on the preprocessed documents and transform them into a TF-IDF matrix.
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

# Create a DataFrame to display the TF-IDF scores: one row per document,
# one column per feature name reported by the vectorizer.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Display the TF-IDF scores.
print(tfidf_df)

     fényes       kék    láthat       nap    ragyog   ragyogó        ég
0  0.000000  0.785288  0.000000  0.000000  0.000000  0.000000  0.619130
1  0.000000  0.000000  0.000000  0.629228  0.777221  0.000000  0.000000
2  0.000000  0.000000  0.000000  0.496816  0.613667  0.000000  0.613667
3  0.464757  0.000000  0.464757  0.593297  0.000000  0.464757  0.000000


# References
- [NLTK book](https://www.nltk.org/book/)
- [Roadmap](https://medium.com/aimonks/roadmap-to-learn-natural-language-processing-in-2023-6e3a9372b8cc)