In [2]:
import spacy
from spacy.lang.pl.examples import sentences
from utils import read_bills, read_bills_without_normalize
import os
import collections
from typing import List
import matplotlib.pyplot as plt
from operator import itemgetter
import tqdm
import pickle
import nltk
from collections import Counter
import pandas as pd
import math
import requests as rq
import regex

# Read bills

In [3]:
current_dir = os.getcwd()
bills_dict = read_bills(f"{current_dir}/data/first_ex_data/ustawy")

# Use SpaCy tokenizer API to tokenize the text from the law corpus.

In [4]:
# tokenized_bills = {}
# for file_id, file_content in tqdm.tqdm(bills_dict.items()):
#     tokens = nlp(file_content)
#     tokenized_bills[file_id] = [token.text for token in tokens]

In [5]:
# tokenized_bills["1999_700"]

In [6]:
# the tokenization took 15 min, dumping this dict just in case
# with open('tokenized_bills.pickle', 'wb') as handle:
#     pickle.dump(tokenized_bills, handle)

In [7]:
with open('tokenized_bills.pickle', 'rb') as handle:
    b = pickle.load(handle)

In [8]:
tokenized_bills = b

# Compute bigram counts of downcased tokens

In [9]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [10]:
tokens = [token for token in tokenized_bills.values()]
tokens = flatten(tokens)

In [11]:
bigrams = list(nltk.bigrams(tokens[:10000]))

In [12]:
counted_bigrams = dict(Counter(bigrams))

# Discard bigrams containing characters other than letters

In [13]:
clean_counted_bigrams = (
    { token : value for token, value in counted_bigrams.items() if all(elem.isalpha() for elem in list(token))}
)

In [14]:
df = pd.DataFrame(clean_counted_bigrams.items(), columns=["bigrams","count"]) 

In [15]:
df

Unnamed: 0,bigrams,count
0,"(dzu, z)",6
1,"(r, nr)",19
2,"(ustawa, z)",3
3,"(z, dnia)",10
4,"(r, o)",9
...,...,...
4410,"(do, sędziów)",1
4411,"(będącego, prokuratorem)",1
4412,"(prokuratorem, wojskowej)",1
4413,"(wojskowej, jednostki)",1


In [16]:
def calculate_pmi(tokens, ngrams):
    pmi = []
    text_length = len(tokens)
    for ngram, count in tqdm.tqdm(ngrams.items()):
        prob = count / text_length
        freq = [tokens.count(t) / text_length for t in list(ngram)]
        individual_probabilities = math.prod(freq)
        pmi.append(math.log2(prob/individual_probabilities))
    return pmi

In [17]:
# pmi = []
# text_length = len(tokens)
# for bigram, count in tqdm.tqdm(clean_counted_bigrams.items()):
#     prob = count / text_length
#     freq = [tokens.count(t) / text_length for t in list(bigram)]
#     individual_probabilities = math.prod(freq)
#     pmi.append(math.log2(prob/individual_probabilities))

In [18]:
df["pmi"] = calculate_pmi(tokens, clean_counted_bigrams)

100%|███████████████████████████████████████| 4415/4415 [04:38<00:00, 15.86it/s]


In [19]:
df

Unnamed: 0,bigrams,count,pmi
0,"(dzu, z)",6,-4.278401
1,"(r, nr)",19,-4.259480
2,"(ustawa, z)",3,-4.444596
3,"(z, dnia)",10,-5.180098
4,"(r, o)",9,-5.864568
...,...,...,...
4410,"(do, sędziów)",1,-2.848001
4411,"(będącego, prokuratorem)",1,9.812570
4412,"(prokuratorem, wojskowej)",1,7.197736
4413,"(wojskowej, jednostki)",1,-0.276717


# Sort the word pairs according to that measure in the descending order and determine top 10 entries.

In [20]:
df.sort_values(by="pmi", ascending=False).head(10)

Unnamed: 0,bigrams,count,pmi
265,"(kazeinę, kazeiniany)",1,21.961364
3046,"(doktorów, habilitowanych)",1,21.961364
259,"(bezwodny, tłuszcz)",1,19.376401
262,"(sery, maślankę)",1,19.376401
4355,"(zebrań, żołnierskich)",1,19.154009
257,"(śmietankę, masło)",1,18.961364
254,"(smakowych, śmietanę)",1,18.961364
1234,"(hurtowymi, sprzedającymi)",1,18.791439
1842,"(cukierniczych, piekarniczych)",1,18.791439
260,"(tłuszcz, mleczny)",1,18.376401


# Filter bigrams with number of occurrences lower than 5. Determine top 10 entries for the remaining dataset (>=5 occurrences).

In [21]:
df_filtered = df[df["count"] >= 5]

In [22]:
top_bigrams_spacy_df = df_filtered.sort_values(by="pmi", ascending=False).head(10)
top_bigrams_spacy_df

Unnamed: 0,bigrams,count,pmi
212,"(przetwory, mleczne)",8,15.247118
219,"(indywidualna, reprezentatywna)",5,15.177383
437,"(średniej, ważonej)",9,13.694577
464,"(dostawcy, hurtowemu)",6,13.20759
281,"(mlecznej, przeznaczonej)",28,12.940582
472,"(podmiotom, skupującym)",5,12.91697
298,"(dostawców, hurtowych)",40,12.715336
277,"(współczynnik, przydziału)",5,12.588499
342,"(indywidualnej, reprezentatywnej)",14,12.431376
233,"(dostawcę, hurtowego)",6,12.292479


# Use KRNNT or Clarin-PL API(https://ws.clarin-pl.eu/tager.shtml) to tag and lemmatize the corpus.

In [23]:
import xml.etree.ElementTree as ET

In [24]:
# bills_text = ' '.join(bills_dict.values())

In [25]:
# with open('bills_text.txt', 'w') as f:
#     f.write(bills_text)

In [26]:
with open('results.txt', 'r') as f:
    results = f.read()

In [27]:
xml_tree = ET.fromstring(results)

In [28]:
match = regex.compile(r'\w+\b')
tags = []
for tok in xml_tree.iter('tok'):
    for lex in tok.iter('lex'):
        base = lex.find("base").text
        ctag = match.findall(lex.find("ctag").text)[0]
        tags.append(f"{base}:{ctag}")

In [29]:
# <tok>
#     <orth>skupujących</orth>
#     <lex disamb="1"><base>skupować</base><ctag>pact:pl:gen:m1:imperf:aff</ctag></lex>
#     <lex disamb="1"><base>skupywać</base><ctag>pact:pl:gen:m1:imperf:aff</ctag></lex>
# </tok>
# dunno what to do in case like that

# Compute the same statistics as for the non-lemmatized words (i.e. PMI) and print top-10 entries with at least 5 occurrences.

In [40]:
bigrams_clairn = list(nltk.bigrams(tags[:10000]))

In [41]:
len(bigrams_clairn)

9999

In [42]:
counted_bigrams_clairn = dict(Counter(bigrams_clairn))

In [43]:
len(counted_bigrams_clairn)

4930

In [44]:
clean_counted_bigrams_clairn = (
    { token : value for token, value in counted_bigrams_clairn.items() 
     if all(elem.split(":")[0].isalpha() for elem in list(token))}
)

In [45]:
len(clean_counted_bigrams_clairn)

3806

In [46]:
df_clairn = pd.DataFrame(clean_counted_bigrams_clairn.items(), columns=["bigrams","count"]) 

In [47]:
df_clairn

Unnamed: 0,bigrams,count
0,"(dzu:ign, z:prep)",5
1,"(r:ign, nr:subst)",18
2,"(ustawa:subst, z:prep)",8
3,"(z:prep, dzień:subst)",12
4,"(r:ign, o:prep)",8
...,...,...
3801,"(odpowiedzialność:subst, żołnierz:subst)",1
3802,"(żołnierz:subst, za:qub)",1
3803,"(dyscyplinarny:adj, i:conj)",1
3804,"(za:qub, naruszenie:subst)",1


In [48]:
# pmi_clairn = []
# text_length = len(tags)
# for bigram, count in tqdm.tqdm(clean_counted_bigrams_clairn.items()):
#     prob = count / text_length
#     freq = [tags.count(t) / text_length for t in list(bigram)]
#     individual_probabilities = math.prod(freq)
#     pmi_clairn.append(math.log2(prob/individual_probabilities))

In [49]:
df_clairn["pmi"] = calculate_pmi(tags, clean_counted_bigrams_clairn)

100%|███████████████████████████████████████| 3806/3806 [04:11<00:00, 15.15it/s]


In [50]:
df_clairn

Unnamed: 0,bigrams,count,pmi
0,"(dzu:ign, z:prep)",5,-4.615191
1,"(r:ign, nr:subst)",18,-4.317902
2,"(ustawa:subst, z:prep)",8,-6.046463
3,"(z:prep, dzień:subst)",12,-5.578939
4,"(r:ign, o:prep)",8,-6.014913
...,...,...,...
3801,"(odpowiedzialność:subst, żołnierz:subst)",1,0.322438
3802,"(żołnierz:subst, za:qub)",1,-2.625909
3803,"(dyscyplinarny:adj, i:conj)",1,-5.900086
3804,"(za:qub, naruszenie:subst)",1,-1.645894


In [51]:
df_clairn_filtered = df_clairn[df_clairn["count"] >= 5]

In [52]:
top_bigrams_clairn_df = df_clairn_filtered.sort_values(by="pmi", ascending=False).head(10)
top_bigrams_clairn_df

Unnamed: 0,bigrams,count,pmi
428,"(skupować:pact, skupywać:pact)",33,12.399572
215,"(średnia:subst, ważyć:ppas)",14,11.825854
660,"(mleko:subst, skupiony:adj)",10,11.360357
268,"(współczynnik:subst, przydział:subst)",7,11.226653
213,"(zawartość:subst, tłuszcz:subst)",40,10.658903
178,"(przetwór:subst, mleczny:adj)",25,10.562416
225,"(dostawca:subst, hurtowy:adj)",71,10.478294
216,"(ważyć:ppas, zawartość:subst)",13,10.452452
240,"(odtłuścić:ppas, mleko:subst)",5,10.074955
2258,"(osiem:num, osiem:num)",22,9.638616


# Compute trigram counts for both corpora and perform the same filtering.

In [53]:
trigrams_clairn = list(nltk.trigrams(tags[:10000]))
trigrams_spacy = list(nltk.trigrams(tokens[:10000]))

In [54]:
counted_trigrams_clairn = dict(Counter(trigrams_clairn))
counted_trigrams_spacy = dict(Counter(trigrams_spacy))

In [55]:
clean_counted_trigrams_clairn = (
    { token : value for token, value in counted_trigrams_clairn.items() 
     if all(elem.split(":")[0].isalpha() for elem in list(token))}
)

In [56]:
clean_counted_trigrams_spacy = (
    { token : value for token, value in counted_trigrams_spacy.items() 
     if all(elem.isalpha() for elem in list(token))}
)

In [57]:
df_trigrams_spacy = pd.DataFrame(clean_counted_trigrams_spacy.items(), columns=["trigrams","count"])

In [58]:
df_trigrams_clairn = pd.DataFrame(clean_counted_trigrams_clairn.items(), columns=["trigrams","count"]) 

In [59]:
df_trigrams_spacy["pmi"] = calculate_pmi(tokens, clean_counted_trigrams_spacy)
df_trigrams_clairn["pmi"] = calculate_pmi(tags, clean_counted_trigrams_clairn)

100%|███████████████████████████████████████| 5230/5230 [08:11<00:00, 10.65it/s]
100%|███████████████████████████████████████| 4860/4860 [08:01<00:00, 10.08it/s]


In [60]:
df_trigrams_spacy_filtered = df_trigrams_spacy[df_trigrams_spacy["count"] >= 5]
top_trigrams_spacy_df = df_trigrams_spacy_filtered.sort_values(by="pmi", ascending=False).head(10)
top_trigrams_spacy_df

Unnamed: 0,trigrams,count,pmi
206,"(indywidualna, reprezentatywna, zawartość)",5,28.491289
725,"(podaniem, średniej, ważonej)",5,27.504163
460,"(średniej, ważonej, zawartości)",9,27.233876
461,"(ważonej, zawartości, tłuszczu)",9,27.068817
342,"(reprezentatywnej, zawartości, tłuszczu)",14,26.99004
268,"(współczynnik, przydziału, indywidualnej)",5,26.736081
341,"(indywidualnej, reprezentatywnej, zawartości)",14,25.970675
267,"(krajowy, współczynnik, przydziału)",5,25.946236
207,"(reprezentatywna, zawartość, tłuszczu)",5,25.9448
487,"(indywidualną, kwotę, mleczną)",13,25.881534


In [61]:
df_trigrams_clairn_filtered = df_trigrams_clairn[df_trigrams_clairn["count"] >= 5]
top_trigrams_clairn_df = df_trigrams_clairn_filtered.sort_values(by="pmi", ascending=False).head(10)
top_trigrams_clairn_df

Unnamed: 0,trigrams,count,pmi
212,"(ważyć:ppas, zawartość:subst, tłuszcz:subst)",13,25.345933
208,"(reprezentatywny:adj, zawartość:subst, tłuszcz...",21,24.343913
211,"(średnia:subst, ważyć:ppas, zawartość:subst)",13,24.143377
269,"(współczynnik:subst, przydział:subst, indywidu...",7,23.592886
675,"(podanie:subst, średnia:subst, ważyć:ppas)",5,23.426553
2860,"(osiem:num, osiem:num, osiem:num)",21,23.151566
758,"(ilość:subst, mleko:subst, skupiony:adj)",10,23.029552
473,"(podmiot:subst, skupować:pact, skupywać:pact)",33,21.900482
207,"(indywidualny:adj, reprezentatywny:adj, zawart...",21,21.816666
268,"(krajowy:adj, współczynnik:subst, przydział:su...",7,21.208301


# Devise a method for computing the values, based on the results for bigrams.

`
def calculate_pmi(tokens, ngrams):
    pmi = []
    text_length = len(tokens)
    for ngram, count in tqdm.tqdm(ngram.items()):
        prob = count / text_length
        freq = [tokens.count(t) / text_length for t in list(ngram)]
        individual_probabilities = math.prod(freq)
        pmi.append(math.log2(prob/individual_probabilities))
    return pmi
`

# Create a table comparing the results for copora without and with tagging and lemmatization (separate table for bigrams and trigrams).

In [62]:
pd.concat([top_bigrams_spacy_df.reset_index(drop=True), top_bigrams_clairn_df.reset_index(drop=True)], axis=1)

Unnamed: 0,bigrams,count,pmi,bigrams.1,count.1,pmi.1
0,"(przetwory, mleczne)",8,15.247118,"(skupować:pact, skupywać:pact)",33,12.399572
1,"(indywidualna, reprezentatywna)",5,15.177383,"(średnia:subst, ważyć:ppas)",14,11.825854
2,"(średniej, ważonej)",9,13.694577,"(mleko:subst, skupiony:adj)",10,11.360357
3,"(dostawcy, hurtowemu)",6,13.20759,"(współczynnik:subst, przydział:subst)",7,11.226653
4,"(mlecznej, przeznaczonej)",28,12.940582,"(zawartość:subst, tłuszcz:subst)",40,10.658903
5,"(podmiotom, skupującym)",5,12.91697,"(przetwór:subst, mleczny:adj)",25,10.562416
6,"(dostawców, hurtowych)",40,12.715336,"(dostawca:subst, hurtowy:adj)",71,10.478294
7,"(współczynnik, przydziału)",5,12.588499,"(ważyć:ppas, zawartość:subst)",13,10.452452
8,"(indywidualnej, reprezentatywnej)",14,12.431376,"(odtłuścić:ppas, mleko:subst)",5,10.074955
9,"(dostawcę, hurtowego)",6,12.292479,"(osiem:num, osiem:num)",22,9.638616


In [63]:
pd.concat([top_trigrams_spacy_df.reset_index(drop=True), top_trigrams_clairn_df.reset_index(drop=True)], axis=1)

Unnamed: 0,trigrams,count,pmi,trigrams.1,count.1,pmi.1
0,"(indywidualna, reprezentatywna, zawartość)",5,28.491289,"(ważyć:ppas, zawartość:subst, tłuszcz:subst)",13,25.345933
1,"(podaniem, średniej, ważonej)",5,27.504163,"(reprezentatywny:adj, zawartość:subst, tłuszcz...",21,24.343913
2,"(średniej, ważonej, zawartości)",9,27.233876,"(średnia:subst, ważyć:ppas, zawartość:subst)",13,24.143377
3,"(ważonej, zawartości, tłuszczu)",9,27.068817,"(współczynnik:subst, przydział:subst, indywidu...",7,23.592886
4,"(reprezentatywnej, zawartości, tłuszczu)",14,26.99004,"(podanie:subst, średnia:subst, ważyć:ppas)",5,23.426553
5,"(współczynnik, przydziału, indywidualnej)",5,26.736081,"(osiem:num, osiem:num, osiem:num)",21,23.151566
6,"(indywidualnej, reprezentatywnej, zawartości)",14,25.970675,"(ilość:subst, mleko:subst, skupiony:adj)",10,23.029552
7,"(krajowy, współczynnik, przydziału)",5,25.946236,"(podmiot:subst, skupować:pact, skupywać:pact)",33,21.900482
8,"(reprezentatywna, zawartość, tłuszczu)",5,25.9448,"(indywidualny:adj, reprezentatywny:adj, zawart...",21,21.816666
9,"(indywidualną, kwotę, mleczną)",13,25.881534,"(krajowy:adj, współczynnik:subst, przydział:su...",7,21.208301


# Why do we have to filter the bigrams, rather than the token sequence?
# Which method works better for the bigrams and which for the trigrams?
# What types of expressions are discovered by the methods.
# Can you devise a different type of filtering that would yield better results?
