In [1]:
from utils import read_bills
import regex
import os
from typing import Dict
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Data loading and preparing

In [2]:
current_dir = os.getcwd()
bills_dict = read_bills(f"{current_dir}/data/first_ex_data/ustawy")
bill_names = [regex.match(r'(\d+_\d+)', name).group(1) for name in bills_dict.keys()]
# [\w\s] in the middle in case of smth like this ustawa budżetowa na rok 1997...
years = [regex.search(r'(?<=u\s?s\s?t\s?a\s?w\s?a\s[\w\s]*?z\sdnia\s\d+\s\w+\s)(\d+)', content).group(0) for content in bills_dict.values()]
    
bills = pd.DataFrame({
  "Name": bill_names,
  "Year": years,
  "Content": bills_dict.values()
})  

In [19]:
def get_number_of_additions(content : str) -> int:
    # [eę] in case of smth like ./1998_1118.txt:2) w art. 15 po ust. 5 dodaje sie ust. 6 w brzmieniu:
    return len(regex.findall(r'\b(dodaj[eę])(?:\ssi[eę])?(\s(art)|\s(ust)|\s(pkt)|\s(lit)|\s(§)|\s(tiret))', content))

def get_number_of_removals(content: str) -> int:
    return len(regex.findall(r'(skreśla|uchyla)\ssię\s(art|ust|pkt|lit|§|tiret)', content))

def get_number_of_changes(content: str) -> int:
    # it also catch w art. 8 w ust. 1 zdanie wstępne otrzymuje brzmienie: but I assume that change of sentence also
    # apply to change of the meaning
    return len(regex.findall(r'(?:\s*\b\d*\b\s*)+(?:.?)? otrzymuje brzmienie', content))

### 1. For each bill compute the number of the following amendments present in the bill:
   * addition of a unit (e.g. **dodaje się ust. 5a**),
   * removal of a unit (e.g. **w art. 10 ust. 1 pkt 8 skreśla się**),
   * change of a unit (e.g. **art. 5 otrzymuje brzmienie**).
  
### 2. Note that other types of changes, e.g. **po wyrazach "na dofinansowanie" dodaje się wyrazy " , z zastrzeżeniem art. 21a,"**, must not be included in the result.


In [20]:
bills['Number_of_additions'] = bills.apply(lambda row: get_number_of_additions(row['Content']), axis=1)

In [21]:
bills['Number_of_removals'] = bills.apply(lambda row: get_number_of_removals(row['Content']), axis=1)

In [22]:
bills['Number_of_changes'] = bills.apply(lambda row: get_number_of_changes(row['Content']), axis=1)

In [23]:
bills.head(13)

Unnamed: 0,Name,Year,Content,Number_of_additions,Number_of_removals,Number_of_changes
0,2001_874,2001,dzu z 2001 r nr 81 poz 874 ustawa z dnia 21 c...,0,0,1
1,1996_583,1996,dzu z 1996 r nr 124 poz 583 ustawa z dnia 23 ...,0,1,2
2,2003_1853,2003,dzu z 2003 r nr 189 poz 1853 u s t a w a z dn...,13,1,28
3,1997_753,1997,dzu z 1997 r nr 117 poz 753 ustawa z dnia 21 ...,4,0,5
4,2000_440,2000,dzu z 2000 r nr 39 poz 440 ustawa z dnia 16 m...,0,0,0
5,2004_1375,2006,tekst ustawy ustalony ostatecznie po rozpatrz...,21,11,42
6,2003_597,2003,dzu z 2003 r nr 65 poz 597 u s t a w a z dnia...,1,0,0
7,2003_1884,2006,tekst ustawy ustalony ostatecznie po rozpatrz...,1,0,2
8,2001_1196,2001,dzu z 2001 r nr 111 poz 1196 ustawa z dnia 24...,0,0,0
9,1997_592,1998,dzu z 1998 r nr 98 poz 609 ustawa z dnia 2 li...,0,0,0


### 3. Plot results from point 1 showing how the percentage of amendments of a given type changed in the consecutive years.

In [24]:
bills_yearly = bills.groupby('Year')[['Number_of_additions','Number_of_removals','Number_of_changes']].sum()

In [25]:
bills_yearly.dtypes

Number_of_additions    int64
Number_of_removals     int64
Number_of_changes      int64
dtype: object

In [26]:
bills_yearly

Unnamed: 0_level_0,Number_of_additions,Number_of_removals,Number_of_changes
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1993,34,7,64
1994,85,30,77
1995,255,80,351
1996,261,131,502
1997,411,160,692
1998,480,229,923
1999,204,64,251
2000,906,314,1464
2001,1157,296,1474
2002,13,2,30


In [27]:
total = bills_yearly.sum(axis=1)
total

Year
1993     105
1994     192
1995     686
1996     894
1997    1263
1998    1632
1999     519
2000    2684
2001    2927
2002      45
2003    1044
2004     772
2006     228
2007     909
2008     151
2009     346
2010     229
2011     165
2012     110
2013     387
2014     343
2015     369
2016     143
2017     728
2018     483
dtype: int64

In [28]:
bills_yearly["Number_of_additions"] *= 100 / total
bills_yearly["Number_of_removals"] *= 100 / total
bills_yearly["Number_of_changes"] *= 100 / total

bills_yearly = bills_yearly.reset_index()
bills_yearly

Unnamed: 0,Year,Number_of_additions,Number_of_removals,Number_of_changes
0,1993,32.380952,6.666667,60.952381
1,1994,44.270833,15.625,40.104167
2,1995,37.172012,11.661808,51.166181
3,1996,29.194631,14.653244,56.152125
4,1997,32.541568,12.66825,54.790182
5,1998,29.411765,14.031863,56.556373
6,1999,39.306358,12.331407,48.362235
7,2000,33.755589,11.698957,54.545455
8,2001,39.528528,10.112743,50.358729
9,2002,28.888889,4.444444,66.666667


In [29]:
fig = px.bar(bills_yearly, 
             x="Year", 
             y=["Number_of_additions", "Number_of_removals", "Number_of_changes"], 
             title="Percentage of amendments of a given type changed in the consecutive years")
fig.show()

### 4. Compute the total number of occurrences of the word ustawa in any inflectional form (ustawa, ustawie, ustawę, etc.) and all spelling forms (ustawa, Ustawa, USTAWA), excluding other words with the same prefix (e.g. ustawić).
### 5. Compute the total number of occurrences of the same word (same conditions), followed by z dnia expression.
### 6. As above, but not followed by z dnia expression. Is the result correct (result 4 =? result 5 + result 6)?

In [30]:
def get_total_number_of_word(content: str) -> int:
    return len(regex.findall(r'\busta(?:wa|w|wy|wą|wami|wie|wach|wo|wę|wom)\b', content))

def get_total_number_of_word_followed_by_day(content: str) -> int:
    return len(regex.findall(r'\busta(?:wa|w|wy|wą|wami|wie|wach|wo|wę|wom)\b(?=\sz\sdnia\b)', content))

def get_total_number_of_word_not_followed_by_day(content: str) -> int:
    return len(regex.findall(r'\busta(?:wa|w|wy|wą|wami|wie|wach|wo|wę|wom)\b(?!\sz\sdnia\b)', content))

In [31]:
total_number_of_word = sum([get_total_number_of_word(content) for content in bills['Content']])
total_number_of_word

24880

In [32]:
total_number_of_word_followed_by_day = sum([get_total_number_of_word_followed_by_day(content) for content in bills['Content']])
total_number_of_word_followed_by_day

8590

In [33]:
total_number_of_word_not_followed_by_day = sum([get_total_number_of_word_not_followed_by_day(content) for content in bills['Content']])
total_number_of_word_not_followed_by_day

16290

### 7. Compute the total number of occurrences of the word ustawa in any inflectional form, excluding occurrences following o zmianie expression.

In [34]:
def get_total_number_of_word_without_change_following(content: str) -> int:
    return len(regex.findall(r'(?<! o zmianie)\busta(?:wa|w|wy|wą|wami|wie|wach|wo|wę|wom)\b', content))

In [35]:
total_number_of_word_without_change_following = sum([get_total_number_of_word_without_change_following(content) for content in bills['Content']])
total_number_of_word_without_change_following

24880

# 8. Plot results 4-7 using a bar chart.

In [36]:
types = ['total_number_of_word', 
         'total_number_of_word_followed_by_day', 
         'total_number_of_word_not_followed_by_day',
         'total_number_of_word_without_change_following']
values = [total_number_of_word, 
         total_number_of_word_followed_by_day, 
         total_number_of_word_not_followed_by_day,
         total_number_of_word_without_change_following]

fig = go.Figure([go.Bar(x=types, y=values)])
fig.show()