In [1]:
import spacy
from spacy.lang.pl.examples import sentences
from utils import read_bills, read_bills_without_normalize
import os
import collections
from typing import List
import matplotlib.pyplot as plt
from operator import itemgetter
import tqdm
import pickle
import nltk
from collections import Counter
import pandas as pd
import math
import requests as rq
import regex

# Read bills

In [2]:
current_dir = os.getcwd()
bills_dict = read_bills(f"{current_dir}/data/first_ex_data/ustawy")

# Use SpaCy tokenizer API to tokenize the text from the law corpus.

In [4]:
# tokenized_bills = {}
# for file_id, file_content in tqdm.tqdm(bills_dict.items()):
#     tokens = nlp(file_content)
#     tokenized_bills[file_id] = [token.text for token in tokens]

In [5]:
# tokenized_bills["1999_700"]

In [6]:
# the tokenization took 15 min, dumping this dict just in case
# with open('tokenized_bills.pickle', 'wb') as handle:
#     pickle.dump(tokenized_bills, handle)

In [3]:
with open('tokenized_bills.pickle', 'rb') as handle:
    b = pickle.load(handle)

In [4]:
tokenized_bills = b

# Compute bigram counts of downcased tokens

In [5]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [6]:
tokens = [token for token in tokenized_bills.values()]
tokens = flatten(tokens)

In [20]:
len(tokens)

4083469

In [27]:
counted_tokens = dict(Counter(tokens))

In [28]:
counted_tokens_prob = {token : count/len(tokens) for token, count in counted_tokens.items()}

In [29]:
counted_tokens_prob

{' ': 0.00028848021131053036,
 'dzu': 0.0014120347185199642,
 'z': 0.020193125012091435,
 '2001': 0.000488310306751441,
 'r': 0.008096792212699546,
 'nr': 0.011006328197912118,
 '81': 8.056875171575932e-05,
 'poz': 0.011026898943031035,
 '874': 1.8366736713318995e-05,
 'ustawa': 0.0007922185769011592,
 'dnia': 0.004396751879345723,
 '21': 0.0006012045150826417,
 'czerwca': 0.0003594982599353638,
 'o': 0.01586028937650806,
 'zmianie': 0.00037174275107757643,
 'ustawy': 0.0032078117894368736,
 'państwowej': 0.00040161930946457536,
 'straży': 0.00040333353822448513,
 'pożarnej': 0.00012171024195359386,
 'art': 0.02047499319818517,
 '1': 0.01730073131447796,
 'w': 0.04926668966998403,
 'ustawie': 0.0013417513393636637,
 '24': 0.0006087960995908136,
 'sierpnia': 0.0002671747967230803,
 '1991': 0.00032815236261129934,
 '88': 0.00018978961270429628,
 '400': 2.865210927277763e-05,
 '1992': 0.00017264732510519854,
 '86': 0.00015795393573454335,
 'i': 0.02202024798033241,
 '54': 0.00021403370516

In [30]:
bigrams = list(nltk.bigrams(tokens))

In [31]:
counted_bigrams = dict(Counter(bigrams))

# Discard bigrams containing characters other than letters

In [32]:
clean_counted_bigrams = (
    { token : value for token, value in counted_bigrams.items() if all(elem.isalpha() for elem in list(token))}
)

In [33]:
df = pd.DataFrame(clean_counted_bigrams.items(), columns=["bigrams","count"]) 

In [34]:
df

Unnamed: 0,bigrams,count
0,"(dzu, z)",2749
1,"(r, nr)",17888
2,"(ustawa, z)",1191
3,"(z, dnia)",9527
4,"(r, o)",7099
...,...,...
640760,"(zataja, tytoń)",1
640761,"(tytoń, uzyskany)",1
640762,"(uprawy, rośliny)",1
640763,"(rośliny, tytoniowej)",1


In [73]:
def calculate_pmi(tokens, ngrams, counted_prob):
    pmi = []
    for ngram, count in tqdm.tqdm(ngrams.items()):
        prob = count / len(ngrams)
        # policzyć unigramy i wrzucić do słownika (unigramy i na nich Counter)
        freq = [counted_prob[t] for t in list(ngram)]
        individual_probabilities = math.prod(freq)
        pmi.append(math.log2(prob/individual_probabilities))
    return pmi

In [40]:
# pmi = []
# text_length = len(tokens)
# for bigram, count in tqdm.tqdm(clean_counted_bigrams.items()):
#     prob = count / text_length
#     freq = [tokens.count(t) / text_length for t in list(bigram)]
#     individual_probabilities = math.prod(freq)
#     pmi.append(math.log2(prob/individual_probabilities))

In [41]:
df["pmi"] = calculate_pmi(tokens, clean_counted_bigrams, counted_tokens_prob)

100%|██████████████████████████████| 640765/640765 [00:00<00:00, 1143949.62it/s]


In [42]:
df

Unnamed: 0,bigrams,count,pmi
0,"(dzu, z)",2749,7.233256
1,"(r, nr)",17888,8.291225
2,"(ustawa, z)",1191,6.860328
3,"(z, dnia)",9527,7.387709
4,"(r, o)",7099,6.430835
...,...,...,...
640760,"(zataja, tytoń)",1,16.003935
640761,"(tytoń, uzyskany)",1,13.177965
640762,"(uprawy, rośliny)",1,11.141439
640763,"(rośliny, tytoniowej)",1,18.048329


# Sort the word pairs according to that measure in the descending order and determine top 10 entries.

In [43]:
df.sort_values(by="pmi", ascending=False).head(10)

Unnamed: 0,bigrams,count,pmi
360066,"(recytatorów, dyrygentów)",1,24.633292
190472,"(skoczów, komorowice)",1,24.633292
580921,"(wlewowego, ścinanie)",1,24.633292
580916,"(wykańczającej, odlewy)",1,24.633292
124399,"(alfabetu, łacińskiego)",1,24.633292
580908,"(żeliwo, staliwo)",1,24.633292
625951,"(aldehydu, furfurylowego)",1,24.633292
625950,"(furfuralu, aldehydu)",1,24.633292
580895,"(drążeniu, tuneli)",1,24.633292
199082,"(skrzynka, podawcza)",1,24.633292


# Filter bigrams with number of occurrences lower than 5. Determine top 10 entries for the remaining dataset (>=5 occurrences).

In [44]:
df_filtered = df[df["count"] >= 5]

In [45]:
top_bigrams_spacy_df = df_filtered.sort_values(by="pmi", ascending=False).head(10)
top_bigrams_spacy_df

Unnamed: 0,bigrams,count,pmi
237292,"(natryskiwania, tworzywami)",5,22.311364
296208,"(obiegów, chłodzących)",5,22.311364
11613,"(ręcznego, miotacza)",5,22.311364
237463,"(młynki, młotkowe)",5,22.311364
237459,"(past, emulsyjnych)",5,22.311364
53414,"(dosiadanie, powożenie)",5,22.311364
237486,"(młyny, kulowe)",5,22.311364
237582,"(ekrany, kinowe)",5,22.311364
53517,"(stajnią, wyścigową)",5,22.311364
237552,"(nieprzejezdne, kołowroty)",5,22.311364


# Use KRNNT or Clarin-PL API(https://ws.clarin-pl.eu/tager.shtml) to tag and lemmatize the corpus.

In [46]:
import xml.etree.ElementTree as ET

In [47]:
# bills_text = ' '.join(bills_dict.values())

In [48]:
# with open('bills_text.txt', 'w') as f:
#     f.write(bills_text)

In [49]:
with open('results.txt', 'r') as f:
    results = f.read()

In [50]:
xml_tree = ET.fromstring(results)

In [78]:
match = regex.compile(r'\w+\b')
tags = []
for tok in xml_tree.iter('tok'):
    for lex in tok.iter('lex'):
        base = lex.find("base").text
        ctag = match.findall(lex.find("ctag").text)[0]
        tags.append(f"{base}:{ctag}")
        break

In [79]:
# <tok>
#     <orth>skupujących</orth>
#     <lex disamb="1"><base>skupować</base><ctag>pact:pl:gen:m1:imperf:aff</ctag></lex>
#     <lex disamb="1"><base>skupywać</base><ctag>pact:pl:gen:m1:imperf:aff</ctag></lex>
# </tok>
# dunno what to do in case like that

In [80]:
counted_tags = dict(Counter(tags))

In [81]:
counted_tags_prob = {tag : count/len(tags) for tag, count in counted_tags.items()}

# Compute the same statistics as for the non-lemmatized words (i.e. PMI) and print top-10 entries with at least 5 occurrences.

In [82]:
bigrams_clairn = list(nltk.bigrams(tags))

In [83]:
len(bigrams_clairn)

4082470

In [84]:
counted_bigrams_clairn = dict(Counter(bigrams_clairn))

In [85]:
len(counted_bigrams_clairn)

552699

In [86]:
clean_counted_bigrams_clairn = (
    { token : value for token, value in counted_bigrams_clairn.items() 
     if all(elem.split(":")[0].isalpha() for elem in list(token))}
)

In [87]:
len(clean_counted_bigrams_clairn)

436601

In [88]:
df_clairn = pd.DataFrame(clean_counted_bigrams_clairn.items(), columns=["bigrams","count"]) 

In [89]:
df_clairn

Unnamed: 0,bigrams,count
0,"(dzu:ign, z:prep)",2749
1,"(r:ign, nr:subst)",17888
2,"(ustawa:subst, z:prep)",8763
3,"(z:prep, dzień:subst)",11358
4,"(r:ign, o:prep)",7099
...,...,...
436596,"(osłabiać:fin, działanie:subst)",1
436597,"(skażać:pact, podlegać:fin)",1
436598,"(zatajać:fin, tytoń:subst)",1
436599,"(tytoń:subst, uzyskać:ppas)",1


In [90]:
# pmi_clairn = []
# text_length = len(tags)
# for bigram, count in tqdm.tqdm(clean_counted_bigrams_clairn.items()):
#     prob = count / text_length
#     freq = [tags.count(t) / text_length for t in list(bigram)]
#     individual_probabilities = math.prod(freq)
#     pmi_clairn.append(math.log2(prob/individual_probabilities))

In [91]:
df_clairn["pmi"] = calculate_pmi(tags, clean_counted_bigrams_clairn, counted_tags_prob)

100%|███████████████████████████████| 436601/436601 [00:00<00:00, 996817.41it/s]


In [92]:
df_clairn

Unnamed: 0,bigrams,count,pmi
0,"(dzu:ign, z:prep)",2749,7.692695
1,"(r:ign, nr:subst)",17888,8.844000
2,"(ustawa:subst, z:prep)",8763,7.255869
3,"(z:prep, dzień:subst)",11358,7.512643
4,"(r:ign, o:prep)",7099,6.983610
...,...,...,...
436596,"(osłabiać:fin, działanie:subst)",1,11.993311
436597,"(skażać:pact, podlegać:fin)",1,10.112636
436598,"(zatajać:fin, tytoń:subst)",1,14.760851
436599,"(tytoń:subst, uzyskać:ppas)",1,8.374792


In [93]:
df_clairn_filtered = df_clairn[df_clairn["count"] >= 5]

In [94]:
top_bigrams_clairn_df = df_clairn_filtered.sort_values(by="pmi", ascending=False).head(10)
top_bigrams_clairn_df

Unnamed: 0,bigrams,count,pmi
148853,"(Grzegorz:subst, schetyna:ign)",5,22.864138
174789,"(młynek:subst, młotkowy:adj)",5,22.864138
403732,"(teryto:ign, rialnego:ign)",5,22.864138
174785,"(pasta:subst, emulsyjny:adj)",5,22.864138
174626,"(metalizacja:subst, natryskowy:adj)",5,22.864138
323588,"(chrom:subst, sześciowartościowy:adj)",5,22.601104
90594,"(Adam:subst, Mickiewicz:subst)",6,22.601104
163609,"(łańcuchowa:subst, rozszczepienie:subst)",5,22.601104
76309,"(chrześcijanin:subst, baptysta:subst)",6,22.378712
241793,"(środa:subst, wlkp:ign)",5,22.378712


# Compute trigram counts for both corpora and perform the same filtering.

In [95]:
trigrams_clairn = list(nltk.trigrams(tags))
trigrams_spacy = list(nltk.trigrams(tokens))

In [96]:
counted_trigrams_clairn = dict(Counter(trigrams_clairn))
counted_trigrams_spacy = dict(Counter(trigrams_spacy))

In [97]:
clean_counted_trigrams_clairn = (
    { token : value for token, value in counted_trigrams_clairn.items() 
     if all(elem.split(":")[0].isalpha() for elem in list(token))}
)

In [98]:
clean_counted_trigrams_spacy = (
    { token : value for token, value in counted_trigrams_spacy.items() 
     if all(elem.isalpha() for elem in list(token))}
)

In [99]:
df_trigrams_spacy = pd.DataFrame(clean_counted_trigrams_spacy.items(), columns=["trigrams","count"])

In [100]:
df_trigrams_clairn = pd.DataFrame(clean_counted_trigrams_clairn.items(), columns=["trigrams","count"]) 

In [102]:
df_trigrams_spacy["pmi"] = calculate_pmi(tokens, clean_counted_trigrams_spacy, counted_tokens_prob)
df_trigrams_clairn["pmi"] = calculate_pmi(tags, clean_counted_trigrams_clairn, counted_tags_prob)

100%|████████████████████████████| 1270351/1270351 [00:00<00:00, 1393934.22it/s]
100%|█████████████████████████████| 1091328/1091328 [00:01<00:00, 970958.27it/s]


In [103]:
df_trigrams_spacy_filtered = df_trigrams_spacy[df_trigrams_spacy["count"] >= 5]
top_trigrams_spacy_df = df_trigrams_spacy_filtered.sort_values(by="pmi", ascending=False).head(10)
top_trigrams_spacy_df

Unnamed: 0,trigrams,count,pmi
394464,"(topienia, żużla, wielkopiecowego)",5,40.437371
406230,"(stomatologom, pielęgniarkom, położnym)",6,40.214978
1120188,"(rolety, lambrekiny, okienne)",6,39.992586
394457,"(porcelanowe, młyny, kulowe)",5,39.96344
59151,"(domowemu, czyniąc, uciążliwym)",5,39.589374
394231,"(wymienniki, przeponowe, rurowe)",5,39.562902
620297,"(niezamieszkującymi, matką, ojcem)",5,39.437371
485572,"(rozrywkowych, widowiskowych, filmowych)",5,39.340509
124087,"(wójtami, burmistrzami, prezydentami)",5,39.314514
784704,"(akumulatora, ołowiowego, kwasowego)",6,39.214978


In [104]:
df_trigrams_clairn_filtered = df_trigrams_clairn[df_trigrams_clairn["count"] >= 5]
top_trigrams_clairn_df = df_trigrams_clairn_filtered.sort_values(by="pmi", ascending=False).head(10)
top_trigrams_clairn_df

Unnamed: 0,trigrams,count,pmi
351884,"(porcelanowy:adj, młyn:subst, kulowy:adj)",5,39.696097
967584,"(roleta:subst, lambrekin:subst, okienny:adj)",6,39.473705
351891,"(topić:ger, żużel:subst, wielkopiecowy:adj)",5,39.365949
351531,"(wymiennik:subst, przeponowy:adj, rurowy:adj)",7,38.82538
966665,"(szpagat:subst, powróz:subst, linek:subst)",8,38.633087
325091,"(reakcja:subst, łańcuchowa:subst, rozszczepien...",5,37.99249
611867,"(błoto:subst, śnieg:subst, lód:subst)",7,37.970957
391891,"(brzuszny:adj, dur:subst, rzekomy:adj)",5,37.655455
746779,"(ołów:subst, kadm:subst, rtęć:subst)",5,37.655455
391890,"(dur:subst, brzuszny:adj, dur:subst)",5,37.333527


# Devise a method for computing the values, based on the results for bigrams.

`
def calculate_pmi(tokens, ngrams):
    pmi = []
    text_length = len(tokens)
    for ngram, count in tqdm.tqdm(ngram.items()):
        prob = count / text_length
        freq = [tokens.count(t) / text_length for t in list(ngram)]
        individual_probabilities = math.prod(freq)
        pmi.append(math.log2(prob/individual_probabilities))
    return pmi
`

# Create a table comparing the results for copora without and with tagging and lemmatization (separate table for bigrams and trigrams).

In [105]:
pd.concat([top_bigrams_spacy_df.reset_index(drop=True), top_bigrams_clairn_df.reset_index(drop=True)], axis=1)

Unnamed: 0,bigrams,count,pmi,bigrams.1,count.1,pmi.1
0,"(natryskiwania, tworzywami)",5,22.311364,"(Grzegorz:subst, schetyna:ign)",5,22.864138
1,"(obiegów, chłodzących)",5,22.311364,"(młynek:subst, młotkowy:adj)",5,22.864138
2,"(ręcznego, miotacza)",5,22.311364,"(teryto:ign, rialnego:ign)",5,22.864138
3,"(młynki, młotkowe)",5,22.311364,"(pasta:subst, emulsyjny:adj)",5,22.864138
4,"(past, emulsyjnych)",5,22.311364,"(metalizacja:subst, natryskowy:adj)",5,22.864138
5,"(dosiadanie, powożenie)",5,22.311364,"(chrom:subst, sześciowartościowy:adj)",5,22.601104
6,"(młyny, kulowe)",5,22.311364,"(Adam:subst, Mickiewicz:subst)",6,22.601104
7,"(ekrany, kinowe)",5,22.311364,"(łańcuchowa:subst, rozszczepienie:subst)",5,22.601104
8,"(stajnią, wyścigową)",5,22.311364,"(chrześcijanin:subst, baptysta:subst)",6,22.378712
9,"(nieprzejezdne, kołowroty)",5,22.311364,"(środa:subst, wlkp:ign)",5,22.378712


In [106]:
pd.concat([top_trigrams_spacy_df.reset_index(drop=True), top_trigrams_clairn_df.reset_index(drop=True)], axis=1)

Unnamed: 0,trigrams,count,pmi,trigrams.1,count.1,pmi.1
0,"(topienia, żużla, wielkopiecowego)",5,40.437371,"(porcelanowy:adj, młyn:subst, kulowy:adj)",5,39.696097
1,"(stomatologom, pielęgniarkom, położnym)",6,40.214978,"(roleta:subst, lambrekin:subst, okienny:adj)",6,39.473705
2,"(rolety, lambrekiny, okienne)",6,39.992586,"(topić:ger, żużel:subst, wielkopiecowy:adj)",5,39.365949
3,"(porcelanowe, młyny, kulowe)",5,39.96344,"(wymiennik:subst, przeponowy:adj, rurowy:adj)",7,38.82538
4,"(domowemu, czyniąc, uciążliwym)",5,39.589374,"(szpagat:subst, powróz:subst, linek:subst)",8,38.633087
5,"(wymienniki, przeponowe, rurowe)",5,39.562902,"(reakcja:subst, łańcuchowa:subst, rozszczepien...",5,37.99249
6,"(niezamieszkującymi, matką, ojcem)",5,39.437371,"(błoto:subst, śnieg:subst, lód:subst)",7,37.970957
7,"(rozrywkowych, widowiskowych, filmowych)",5,39.340509,"(brzuszny:adj, dur:subst, rzekomy:adj)",5,37.655455
8,"(wójtami, burmistrzami, prezydentami)",5,39.314514,"(ołów:subst, kadm:subst, rtęć:subst)",5,37.655455
9,"(akumulatora, ołowiowego, kwasowego)",6,39.214978,"(dur:subst, brzuszny:adj, dur:subst)",5,37.333527


# Why do we have to filter the bigrams, rather than the token sequence?
# Which method works better for the bigrams and which for the trigrams?
# What types of expressions are discovered by the methods.
# Can you devise a different type of filtering that would yield better results?
