# Warsztaty Python w Data Science

---
## Przetwarzanie języka naturalnego - część 1 z 2  
- ### Tokenizacja
- ### Statystyki tekstu 
- ### Prosta Lematyzacja
- ### Metryka TF-IDF
- ### TF-IDF dla zlematyzowanego korpusu
- ### Stoplista
---

In [None]:
import pandas as pd

# Load the tab-separated dump; the first column of the file becomes the row index.
data = pd.read_csv(
    'data/gumtree-2022-03-20.tsv',
    index_col=0,
    sep='\t',
)

In [None]:
# Display the loaded DataFrame.
data

---
# Tokenizacja

In [None]:
# Pick one sample text from the 'description' column to experiment on.
# NOTE(review): with an integer key, Series `[]` is label-based only when the
# index is integer-typed -- confirm that 6 is meant as an index label here.
opis = data['description'][6]
opis

In [None]:
import re

def no_tags(s):
    """Return *s* with every ``<...>`` markup tag removed.

    A tag is a ``<``, followed by at least one character other than ``<``
    (matched lazily), followed by ``>``.  Text outside tags is left untouched.
    """
    tag_pattern = r'<[^<]+?>'
    return re.sub(tag_pattern, '', s)

# Strip the markup tags from the sample text and show the result.
opis = no_tags(opis)
opis

In [None]:
import re

# Split on runs of non-word characters.  For str patterns `\w` is
# Unicode-aware, so Polish letters already count as word characters and do not
# have to be listed explicitly; `\W+` matches exactly what the longer
# character class `[^...\w]+` matched.
tokenizer = re.compile(r'\W+')
# re.split() returns '' when the text starts or ends with a separator;
# those empty strings are not words, so drop them.
tokenized = [token for token in tokenizer.split(opis) if token]
str(tokenized)

In [None]:
# Normalise case so that e.g. capitalised and lower-case spellings of a word
# become the same token.
tokenized = list(map(str.lower, tokenized))
str(tokenized)

In [None]:
def preprocessing(opis):
    """Turn a raw text into a list of lower-case word tokens.

    The text is first stripped of markup tags with ``no_tags`` and then split
    with the module-level ``tokenizer`` pattern.

    Args:
        opis: the raw text (a ``str``).

    Returns:
        A list of lower-cased tokens in their original order.  Empty strings
        are never included.
    """
    opis = no_tags(opis)
    # re.split() yields '' when the text begins or ends with a separator;
    # filtering them out keeps the empty string from being counted as a word.
    return [token.lower() for token in tokenizer.split(opis) if token]

In [None]:
# Preview: preprocess only the first n rows of the data set.
n = 4
corpus = []
for _, row in data.iloc[:n].iterrows():
    # Explicit positional access to the third column of the row.  A plain
    # `row[2]` on a label-indexed Series relies on the deprecated
    # integer-as-position fallback.
    # NOTE(review): presumably this is the 'description' column -- confirm.
    opis = row.iloc[2]
    corpus.append(preprocessing(opis))

# Print every tokenised text followed by an empty line.
for opis in corpus:
    print(opis)
    print()

---
# Statystyki tekstu

In [None]:
# Build the full corpus: one token list per row of the data set.
corpus = []
for _, row in data.iterrows():
    # Explicit positional access to the third column of the row.  A plain
    # `row[2]` on a label-indexed Series relies on the deprecated
    # integer-as-position fallback.
    opis = row.iloc[2]
    # Skip cells that do not hold text (preprocessing() requires a str).
    if isinstance(opis, str):
        corpus.append(preprocessing(opis))

print(f"Mamy tekstów: {len(corpus)}")

In [None]:
# Flatten the per-document token lists into one long list of words.
all_words = [word for document in corpus for word in document]

print(f"Mamy {len(all_words)} wyrazów")
all_words[:15]

In [None]:
# Map every distinct word to the number of times it occurs in all_words.
counter = {}

for word in all_words:
    if word in counter:
        counter[word] += 1
    else:
        counter[word] = 1

print(f"Mamy {len(counter)} RÓŻNYCH wyrazów")

In [None]:
# Materialise the counts as a list of (word, count) pairs so it can be sorted.
counted_words = list(counter.items())
counted_words[:4]

In [None]:
from operator import itemgetter

# Order the (word, count) pairs from the most to the least frequent word;
# the sort is stable, so ties keep their previous relative order.
counted_words = sorted(counted_words, key=itemgetter(1), reverse=True)
counted_words[:20]

In [None]:
# The 20 entries at the end of the list, i.e. the least frequent words.
counted_words[-20:]

In [None]:
# Keep only the occurrence counts, in the same order as counted_words.
counts = [cnt for _, cnt in counted_words]
len(counts)

In [None]:
# Total number of word occurrences.
sum(counts)

In [None]:
# Number of occurrences covered by the 175 most frequent words alone.
sum(counts[:175])

In [None]:
# The words ranked 166th to 175th by frequency.
counted_words[165:175]

In [None]:
# One-column DataFrame with the counts of the 120 most frequent words,
# used as input for the plot.
count_df = pd.DataFrame(counts[:120])
count_df

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Select the style BEFORE creating the figure: a figure takes its settings
# from rcParams at creation time, so styling afterwards would leave the figure
# itself with the previous style on the first run of this cell.
plt.style.use("dark_background")
plt.figure(figsize=(24,12))

# Scatter plot of the word counts held in count_df (rank on the x axis).
chart = sns.scatterplot(
                     color='purple', 
                     data=count_df
                    )

---
# Prosta Lematyzacja

## _Lematyzacja_ - sprowadzenie wyrazu do formy podstawowej, tak aby różne formy tego wyrazu (*kot*, *kota*, *kotu*) były rozpatrywane jako ten sam wyraz (*kot*) 

https://sjp.pl/
    
Słownik SJP.PL
Słownik języka polskiego, ortograficzny, wyrazów obcych i słownik do gier w jednym.

Słownik jest rozwijany z myślą o zastosowaniu do sprawdzania pisowni w programach open-source, do gier słownych (np. literaki) i do użytku online jako kilka rodzajów słowników w jednym.

Redakcją słownika zajmują się hobbyści.

Słownik jest udostępniany na otwartych licencjach (różnych w zależności od wersji).

In [None]:
import gzip
import sys
import re

# Lookup table: inflected form -> base form (both lower-cased).
dictionary = {}

# The context manager closes the archive as soon as it has been read, even if
# parsing fails half-way through.
with gzip.open('data/odm.txt.gz', 'rt', encoding='utf-8') as f:
    for line in f:
        # Each line is a comma-separated list; the first entry is used as the
        # base form and every following entry is mapped to it.
        base, *forms = (part.strip().lower() for part in line.strip().split(','))
        for form in forms:
            # A form listed on several lines ends up mapped to the last base seen.
            dictionary[form] = base


In [None]:
def lematize(w):
    """Return the base form recorded for *w* in ``dictionary``.

    Words that are not present in the dictionary are returned unchanged.
    """
    try:
        return dictionary[w]
    except KeyError:
        return w

In [None]:
# Lemmatised copy of the corpus: every token replaced by lematize(token).
corpusl = [list(map(lematize, document)) for document in corpus]
# Show the first seven documents, each followed by an empty line.
for opis in corpusl[:7]:
    print(opis, end='\n\n')

In [None]:
# Flatten the lemmatised documents into one long list of words.
all_words = []
for document in corpusl:
    all_words.extend(document)

print(f"Mamy {len(all_words)} wyrazów")
all_words[:15]

In [None]:
# Count how many times each lemmatised word occurs.
counter = {}

for word in all_words:
    try:
        counter[word] += 1
    except KeyError:
        # First occurrence of this word.
        counter[word] = 1

print(f"Mamy {len(counter)} RÓŻNYCH wyrazów")

In [None]:
from operator import itemgetter

# (word, count) pairs ordered from the most to the least frequent word;
# the sort is stable, so ties keep the dictionary's insertion order.
counted_words = sorted(counter.items(), key=itemgetter(1), reverse=True)
counted_words[:20]

In [None]:
# Keep only the occurrence counts, in the same order as counted_words.
counts = [cnt for _, cnt in counted_words]
len(counts)

In [None]:
# Total number of word occurrences in the lemmatised corpus.
sum(counts)

In [None]:
# Number of occurrences covered by the 105 most frequent lemmas alone.
sum(counts[:105])

In [None]:
# The lemmas ranked 96th to 105th by frequency.
counted_words[95:105]

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Counts of the 120 most frequent lemmas, as a one-column DataFrame.
counts = [ x[1] for x in counted_words ]
count_df = pd.DataFrame(counts[:120])

# Select the style BEFORE creating the figure: a figure takes its settings
# from rcParams at creation time, so styling afterwards would leave the figure
# itself with the previous style on the first run of this cell.
plt.style.use("dark_background")
plt.figure(figsize=(24,12))

# Scatter plot of the lemma counts held in count_df (rank on the x axis).
chart = sns.scatterplot(
        color='purple', 
        data=count_df
        )

---
## Metryka TF-IDF
ile razy występuje wyraz *i* w tekście *j*
$${n}_{ij}$$ 
 ### Term Frequency (TF)
 
 $${tf}_{ij} = \frac{{n}_{ij}}{\sum_{k}{{n}_{kj}}}$$
 
 W tekście *j* sprawdzamy, ile proporcjonalnie do całości występuje wyraz *i*
### Inverse Document Frequency (IDF)

 $$idf_i = \log \frac{|D|}{|\{ j : n_{ij} > 0 \}|}$$
 
 licznik - liczba dokumentów
 
 mianownik - liczba dokumentów, w których wystąpił wyraz *i*-ty 

---

## [Dokumentacja do `TfidfVectorizer` z biblioteki Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Toy example: a two-word vocabulary and three short English documents keyed
# "Document 1" .. "Document 3".
vocabulary = ['life', 'learning']
corpus = {
    f"Document {number}": text
    for number, text in enumerate(
        (
            "The game of life is a game of everlasting learning",
            "The unexamined life is not worth living",
            "Never stop learning",
        ),
        start=1,
    )
}


In [None]:
# Restrict the vectorizer to the fixed vocabulary and compute the TF-IDF
# matrix for the documents of the toy corpus (the dict's values).
tfidf = TfidfVectorizer(vocabulary = vocabulary)
tfs = tfidf.fit_transform(corpus.values())

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.  It returns an array, so convert
# it to a plain list to keep the list-style display and slicing.
feature_names = list(tfidf.get_feature_names_out())
feature_names

In [None]:
# Column labels: the document names (iterating a dict yields its keys).
corpus_index = list(corpus)
# Transpose so that rows are terms and columns are documents.
term_document = tfs.T.todense()
df = pd.DataFrame(term_document, index=feature_names, columns=corpus_index)
df

---
## TF-IDF dla zlematyzowanego korpusu

In [None]:
# Number of distinct words in the lemmatised corpus.
len(counted_words)

In [None]:
from operator import itemgetter

# Keep only the (word, count) pairs of words that occur more than once.
vocabulary = [pair for pair in counted_words if pair[1] > 1]
len(vocabulary)

In [None]:
# Drop the counts: from here on the vocabulary is a plain list of words.
vocabulary = [pair[0] for pair in vocabulary]
print(vocabulary[:50])

In [None]:
# Discard words of one or two characters.
vocabulary = [word for word in vocabulary if len(word) > 2]
print(vocabulary[:50])

In [None]:
# Vocabulary size after removing the short words.
len(vocabulary)

In [None]:
# Discard tokens that consist solely of numeric characters.
vocabulary = [word for word in vocabulary if not word.isnumeric()]
print(vocabulary[:50])

In [None]:
# Vocabulary size after removing the purely numeric tokens.
len(vocabulary)

In [None]:
# When the notebook is run top to bottom, `corpus` still refers to the English
# toy example at this point, so fitting on `corpus.values()` would vectorise
# the wrong texts.  Fit on the lemmatised documents instead; the vectorizer
# expects one string per document, hence the join.
tfidf = TfidfVectorizer(vocabulary = vocabulary)
tfs = tfidf.fit_transform([' '.join(document) for document in corpusl])

In [None]:
# A set gives constant-time membership tests for the filtering below.
set_vocabulary = set(vocabulary)

# Keep, in every lemmatised document, only the words that are in the vocabulary.
corpus = [
    list(filter(set_vocabulary.__contains__, document))
    for document in corpusl
]
for text in corpus[:7]:
    print(text)

In [None]:
# Turn every token list back into a single space-separated string.
corpus = list(map(' '.join, corpus))
for text in corpus[:7]:
    print(text)

In [None]:
# Compute the TF-IDF matrix of the filtered, lemmatised corpus using the
# fixed vocabulary built above.
tfidf = TfidfVectorizer(vocabulary = vocabulary)
tfs = tfidf.fit_transform(corpus)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.  It returns an array, so convert
# it to a plain list to keep the list-style printout of the slice.
feature_names = list(tfidf.get_feature_names_out())
print(feature_names[:50])

In [None]:
# Columns are labelled with the document numbers 0 .. len(corpus) - 1.
corpus_index = range(len(corpus))
# Rows are terms, columns are documents (the matrix is transposed and
# converted to a dense one before building the DataFrame).
df = pd.DataFrame(tfs.T.todense(), index=feature_names, columns=corpus_index)
df

---
## Stop words/Stoplista

In [None]:
# Use the first 50 vocabulary entries as the stop list.  The vocabulary is
# ordered by descending frequency, so these are its most frequent words.
# NOTE(review): the stop words are still part of the fixed vocabulary, so they
# presumably stay in the table as all-zero rows -- confirm.
tfidf = TfidfVectorizer(vocabulary = vocabulary, stop_words=vocabulary[:50])
tfs = tfidf.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (converted to a plain list).
feature_names = list(tfidf.get_feature_names_out())
corpus_index = range(len(corpus))
# Rows are terms, columns are document numbers.
df = pd.DataFrame(tfs.T.todense(), index=feature_names, columns=corpus_index)
df

In [None]:
# Keep only the column labelled 6, i.e. the TF-IDF values of document number 6,
# as a one-column DataFrame.
df = pd.DataFrame(df[6])
df

In [None]:
# Show only the terms that have at least one non-zero value, i.e. hide the
# rows in which every entry equals 0.
all_zero = (df == 0).all(axis=1)
df.loc[~all_zero]