In [2]:
import pandas as pd
import glob
from tqdm.auto import tqdm
import string
from bs4 import BeautifulSoup
import re
import plotly.express as px
import string
import pyarrow as pa
import pyarrow.parquet as pq

# Register tqdm's `progress_apply` on pandas objects; the long-running
# `.progress_apply(...)` calls further down rely on this.
tqdm.pandas()

In [3]:
# Collect the parquet partitions in sorted order: glob.glob returns paths in
# arbitrary order, so sorting keeps the row order of `df` reproducible.
filelist = sorted(glob.glob("../../data/processed/pyarrow/UfR_document_relations/*.parquet"))
if not filelist:
    # pd.concat([]) would otherwise fail with "No objects to concatenate",
    # which hides the real cause (wrong working directory / missing data).
    raise FileNotFoundError(
        "No parquet files found in ../../data/processed/pyarrow/UfR_document_relations/"
    )
dfs = [pd.read_parquet(f) for f in filelist]
# ignore_index=True: a plain concat keeps every partition's own index labels,
# which can repeat across files; build one clean RangeIndex instead.
df = pd.concat(dfs, ignore_index=True)


In [4]:
def structureResponse(html:str) -> dict:
    """Turn a relation HTML fragment into a nested dictionary.

    The markup is parsed with BeautifulSoup using the "lxml" parser, and every
    <div> found in it is handed to parseHtmlDataStructure.

    Args:
        html (str): HTML markup to parse.

    Returns:
        dict: Whatever parseHtmlDataStructure builds from the <div> elements.
    """
    all_divs = BeautifulSoup(html, features="lxml").find_all("div")
    return parseHtmlDataStructure(all_divs)

def parseHtmlDataStructure(divs):
    """Build a {heading: content} mapping from a sequence of <div> tags.

    For each div the key is the first non-empty string of its <h2> (None when
    the div has no <h2>). The value is the parsed direct-child <ul class="tree">;
    when that yields nothing, the div's first direct-child <ul> is parsed as a
    document/element list instead.

    Args:
        divs: Iterable of BeautifulSoup <div> tags.

    Returns:
        dict: Mapping of heading to parsed content; empty for no divs.
    """
    parsed = {}
    for section in divs:
        heading_tag = section.find("h2")
        title = None if heading_tag is None else next(heading_tag.stripped_strings)

        content = parseHtmlTreeStructures(section.find("ul", {"class": "tree"}, recursive=False))
        if not content:
            # No tree list (or an empty one): fall back to the flat list format.
            content = parseHtmlDocumentElementStructures(section.find("ul", recursive=False))
        parsed[title] = content

    return parsed

def parseHtmlTreeStructures(ul):
    """Recursively convert a nested <ul>/<li> tree into nested dictionaries.

    Every direct <li> child of `ul` becomes a key named after the first
    non-empty string inside it. An <li> that contains a nested <ul> maps to the
    dictionary built from that list; a leaf <li> maps to None.

    When the same key occurs more than once its subtrees are merged. A leaf
    occurrence never discards a subtree collected for the same key, and a
    subtree following a leaf occurrence replaces the None placeholder
    (previously this raised AttributeError on ``None.update``).

    Args:
        ul: BeautifulSoup tag of the <ul> to walk, or None.

    Returns:
        dict | None: The nested mapping, or None when `ul` is missing.

    Raises:
        StopIteration: If an <li> contains no text at all.
    """
    if not ul: return None
    result = {}
    for li in ul.find_all("li", recursive=False):
        key = next(li.stripped_strings)
        # Use a separate name so the `ul` parameter is not rebound mid-loop.
        nested_ul = li.find("ul")
        children = parseHtmlTreeStructures(nested_ul) if nested_ul else None

        if result.get(key) is None:
            # First occurrence, or the key was only seen as a leaf so far.
            result[key] = children
        elif children is not None:
            result[key].update(children)
        # else: repeated leaf for a key that already has a subtree -> keep it.
    return result

def parseHtmlDocumentElementStructures(ul):
    """Group a flat <ul> of "document"/"element" items into a two-level dict.

    The direct <li> children are read in order. An item with rel="document"
    starts a new group named after its first non-empty string; every following
    item with rel="element" is added to that group as ``{element: None}``.
    A document that is not followed by any element does not appear in the
    result.

    Items without a `rel` attribute (or with another value) are skipped, and
    elements that appear before the first document are grouped under the key
    None instead of raising UnboundLocalError.

    Args:
        ul: BeautifulSoup tag of the <ul> to read, or None.

    Returns:
        dict | None: ``{document: {element: None, ...}, ...}``, or None when
        `ul` is missing.

    Raises:
        StopIteration: If a document/element <li> contains no text at all.
    """
    if not ul: return None
    result = {}
    document_key = None  # group for elements seen before any document item
    for li in ul.find_all("li", recursive=False):
        rel = li.get("rel")  # .get: tolerate <li> items without a rel attribute
        if rel == "document":
            document_key = next(li.stripped_strings)
        elif rel == "element":
            element_key = next(li.stripped_strings)
            result.setdefault(document_key, {})[element_key] = None

    return result

def cleanLovregister(lovregister):
    """Flatten the second level of a parsed "Lovregister" mapping.

    Every value that is itself a dict is replaced by the list of its keys;
    other values are kept as they are. A new dict is returned, so the input
    (which the notebook also keeps in other columns) is left untouched, and
    the values are plain lists rather than live ``dict_keys`` views.

    Args:
        lovregister: Parsed mapping, or any non-dict value (e.g. None).

    Returns:
        dict: ``{key: [sub_key, ...]}`` for dict input; the argument itself,
        unchanged, for anything that is not a dict.
    """
    if not isinstance(lovregister, dict):
        return lovregister
    return {
        key: list(value) if isinstance(value, dict) else value
        for key, value in lovregister.items()
    }

def storeDataframe(df, path):
    """Write a DataFrame to a parquet file through Apache Arrow.

    Args:
        df (pd.DataFrame): Frame to persist.
        path (str): Destination of the .parquet file, file name included.
    """
    pq.write_table(pa.Table.from_pandas(df), path)

# Parse every HTML payload into a nested dict; rows without HTML stay None.
# (The previous guard `if not None` was always true, so missing values were
# passed straight into the parser.)
df["relation_dict"] = df["relation_html"].progress_apply(
    lambda html: structureResponse(html) if pd.notna(html) else None
)
# Pick the "Lovregister" section; the isinstance check covers rows whose
# relation_dict is None because they had no HTML.
df["lovregister_dict"] = df["relation_dict"].progress_apply(
    lambda parsed: parsed.get("Lovregister") if isinstance(parsed, dict) else None
)
df["lovregister_dict_cleaned"] = df["lovregister_dict"].apply(cleanLovregister)
# Collect the second-level keys of every register entry ...
df["relevante_love"] = df["lovregister_dict_cleaned"].progress_apply(
    lambda register: list(register.values()) if isinstance(register, dict) else None
)
# ... and flatten them into one list per document. Entries without a second
# level (None) contribute nothing instead of breaking the iteration.
df["relevante_love"] = df["relevante_love"].progress_apply(
    lambda groups: [item for group in groups if group is not None for item in group]
    if groups is not None
    else None
)
df_org = df.copy()

100%|██████████| 41452/41452 [02:55<00:00, 236.72it/s]
100%|██████████| 41452/41452 [00:00<00:00, 781593.24it/s]
100%|██████████| 41452/41452 [00:00<00:00, 753652.21it/s]
100%|██████████| 41452/41452 [00:00<00:00, 176366.70it/s]


In [5]:
def ikkeKendelse(li):
    """Tell whether none of the given headers contains a "K." marker.

    Presumably "K." marks a kendelse (the result is stored in the
    `not_kendelse` column) - confirm against the header format.

    Args:
        li: Sequence of header strings, or None.

    Returns:
        bool | None: False as soon as one header matches ``\\.?K\\.``; True when
        the sequence is non-empty and nothing matches; None when it is empty
        or missing.
    """
    if li is None:
        return None
    for header in li:
        # Raw string: "\." in a normal literal is an invalid escape sequence.
        if re.search(r"\.?K\.", header):
            return False
    # len() rather than truthiness so array-like inputs work as well.
    return True if len(li) != 0 else None
def clean_string(s: str) -> str:
    """Normalise a law name so that spelling variants are counted together.

    ASCII punctuation (``string.punctuation``) is removed and the text is
    lower-cased; the bare name "leje" is mapped to "lejeloven".

    Args:
        s (str): Law name as found in the register. (The parameter used to be
            annotated as ``list``, but only string methods are applied to it.)

    Returns:
        str: The normalised name.
    """
    normalized = s.translate(str.maketrans('', '', string.punctuation)).lower()
    return "lejeloven" if normalized == "leje" else normalized
# Load the headers of every verdict and reduce them to a single flag column.
df_headers = pd.read_parquet("../../data/processed/pyarrow/UfR_thi_kendes_for_ret_partition.parquet")
df_headers = (
    df_headers.loc[:, ["id_verdict", "headers"]]
    # assign() adds the flag row by row; no index self-merge is needed.
    .assign(not_kendelse=lambda frame: frame["headers"].apply(ikkeKendelse))
    .drop(columns="headers")
)

# Drop a flag left over from an earlier run of this cell first; otherwise a
# re-run would produce not_kendelse_x / not_kendelse_y columns.
df_m = pd.merge(
    df.drop(columns=["not_kendelse"], errors="ignore"),
    df_headers,
    on="id_verdict",
)
# The identical merge used to be executed twice; reuse the result instead.
df = df_m.copy()
# Empty or missing law lists become None.
df["relevante_love_cleaned"] = df["relevante_love"].apply(
    lambda laws: [clean_string(law) for law in laws] if laws else None
)
df_org = df.copy()

In [35]:
# Persist each verdict id together with its cleaned list of associated laws.
(
    df_org[["id_verdict", "relevante_love_cleaned"]]
    .to_parquet("../../data/processed/pyarrow/UfR_associated_laws.parquet")
)

In [19]:
df = df_org.copy()

# Number of laws shown individually; all remaining laws are pooled as
# "Other law". Named constant instead of a bare 15 inside the expression.
top_n = 15

# FIND THE top_n MOST ASSOCIATED LAWS
df_relevante_love_top_x = (
    df.loc[:, ["id_verdict", "year", "relevante_love_cleaned"]]
    .explode("relevante_love_cleaned")
    .groupby("relevante_love_cleaned")
    .count()
    .sort_values("id_verdict", ascending=False)["id_verdict"]
    .iloc[:top_n]
)
love_top_x = list(df_relevante_love_top_x.reset_index()["relevante_love_cleaned"])

# Replace every law outside the top list by the pooled label.
df_top_x_lov = df.loc[:, ["id_verdict", "year", "relevante_love_cleaned"]].copy()
df_top_x_lov["relevante_love_cleaned"] = df_top_x_lov["relevante_love_cleaned"].apply(
    lambda love: [lov if lov in love_top_x else "Other law" for lov in love] if love is not None else None
)

# Count law mentions per (law, year); one row per mention after explode().
df_relevante_love = (
    df_top_x_lov.explode("relevante_love_cleaned")
    .groupby(["relevante_love_cleaned", "year"])
    .count()
    .sort_values("id_verdict", ascending=False)["id_verdict"]
    .reset_index()
    .rename(columns={"id_verdict": "count", "relevante_love_cleaned": "Associated law"})
)

df_relevante_love["year"] = df_relevante_love["year"].astype(int)
df_relevante_love = df_relevante_love.sort_values(["year", "count"], ascending=False)
# Share of each law within its year.
df_relevante_love["share"] = (
    df_relevante_love["count"] / df_relevante_love.groupby("year")["count"].transform("sum")
)

# NOTE(review): top_n + 1 categories may exceed the length of the Pastel
# palette, in which case plotly reuses colours - confirm the legend is unambiguous.
fig = px.bar(
    df_relevante_love,
    x="year",
    y="share",
    color="Associated law",
    template="ggplot2",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    hover_name="Associated law",
    hover_data={"count": ":.0f", "share": ":.1%", "Associated law": False},
)
fig.update_yaxes(tickformat=".0%", range=[0, 1])
fig.update_layout(
    yaxis_title="Share (pct.)",
    xaxis_title="Year",
    margin=dict(l=0, r=0, b=0, t=0),
)
fig.write_html("../../data/plots/distribution_of_court_documents_by_associated_law.html")
fig.write_image("../../data/plots/distribution_of_court_documents_by_associated_law.pdf")
fig

In [20]:
df = df_org.copy()
# Keep only rows flagged True; the explicit comparison also leaves out rows
# whose flag is None.
df = df[df["not_kendelse"] == True]

# Determine the top_n laws that are associated most often.
top_n = 15
df_relevante_love_top_x = (
    df.loc[:, ["id_verdict", "year", "relevante_love_cleaned"]]
    .explode("relevante_love_cleaned")
    .groupby("relevante_love_cleaned")
    .count()
    .sort_values("id_verdict", ascending=False)["id_verdict"]
    .head(top_n)
)
love_top_x = df_relevante_love_top_x.index.tolist()

# Every law outside that list is pooled under one label.
df_top_x_lov = df.loc[:, ["id_verdict", "year", "relevante_love_cleaned"]].copy()
df_top_x_lov["relevante_love_cleaned"] = df_top_x_lov["relevante_love_cleaned"].apply(
    lambda love: None if love is None else [lov if lov in love_top_x else "Other law" for lov in love]
)

# Mentions per (law, year), as a flat frame with readable column names.
df_relevante_love = (
    df_top_x_lov.explode("relevante_love_cleaned")
    .groupby(["relevante_love_cleaned", "year"])
    .count()
    .sort_values("id_verdict", ascending=False)["id_verdict"]
)
df_relevante_love = df_relevante_love.reset_index()
df_relevante_love = df_relevante_love.rename(
    columns={"id_verdict": "count", "relevante_love_cleaned": "Associated law"}
)

df_relevante_love["year"] = df_relevante_love["year"].astype(int)
df_relevante_love = df_relevante_love.sort_values(["year", "count"], ascending=False)
yearly_total = df_relevante_love.groupby('year')['count'].transform('sum')
df_relevante_love['share'] = df_relevante_love['count'] / yearly_total

fig = px.bar(
    df_relevante_love,
    x="year",
    y="share",
    color="Associated law",
    template="ggplot2",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    hover_name="Associated law",
    hover_data={
        "count": ":.0f",
        "share": ":.1%",
        "Associated law": False,
    },
)
fig.update_yaxes(tickformat=".0%", range=[0, 1])
fig.update_layout(
    yaxis_title="Share (pct.)",
    xaxis_title="Year",
    margin=dict(l=0, r=0, b=0, t=0),
)
fig.write_html("../../data/plots/distribution_of_judgements_by_associated_law.html")
fig.write_image("../../data/plots/distribution_of_judgements_by_associated_law.pdf")
fig

In [21]:
df = df_org.copy()
# Keep only rows flagged False; rows whose flag is None are left out as well.
df = df.loc[df["not_kendelse"] == False]

# Number of laws shown individually; all remaining laws are pooled as
# "Other law". Named constant instead of a bare 15 inside the expression.
top_n = 15

# FIND THE top_n MOST ASSOCIATED LAWS
df_relevante_love_top_x = (
    df.loc[:, ["id_verdict", "year", "relevante_love_cleaned"]]
    .explode("relevante_love_cleaned")
    .groupby("relevante_love_cleaned")
    .count()
    .sort_values("id_verdict", ascending=False)["id_verdict"]
    .iloc[:top_n]
)
love_top_x = list(df_relevante_love_top_x.reset_index()["relevante_love_cleaned"])

# Replace every law outside the top list by the pooled label.
df_top_x_lov = df.loc[:, ["id_verdict", "year", "relevante_love_cleaned"]].copy()
df_top_x_lov["relevante_love_cleaned"] = df_top_x_lov["relevante_love_cleaned"].apply(
    lambda love: [lov if lov in love_top_x else "Other law" for lov in love] if love is not None else None
)

# Count law mentions per (law, year); one row per mention after explode().
df_relevante_love = (
    df_top_x_lov.explode("relevante_love_cleaned")
    .groupby(["relevante_love_cleaned", "year"])
    .count()
    .sort_values("id_verdict", ascending=False)["id_verdict"]
    .reset_index()
    .rename(columns={"id_verdict": "count", "relevante_love_cleaned": "Associated law"})
)

df_relevante_love["year"] = df_relevante_love["year"].astype(int)
df_relevante_love = df_relevante_love.sort_values(["year", "count"], ascending=False)
# Share of each law within its year.
df_relevante_love["share"] = (
    df_relevante_love["count"] / df_relevante_love.groupby("year")["count"].transform("sum")
)

# NOTE(review): top_n + 1 categories may exceed the length of the Pastel
# palette, in which case plotly reuses colours - confirm the legend is unambiguous.
fig = px.bar(
    df_relevante_love,
    x="year",
    y="share",
    color="Associated law",
    template="ggplot2",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    hover_name="Associated law",
    hover_data={"count": ":.0f", "share": ":.1%", "Associated law": False},
)
fig.update_yaxes(tickformat=".0%", range=[0, 1])
fig.update_layout(
    yaxis_title="Share (pct.)",
    xaxis_title="Year",
    margin=dict(l=0, r=0, b=0, t=0),
)
fig.write_html("../../data/plots/distribution_of_court_decisions_by_associated_law.html")
fig.write_image("../../data/plots/distribution_of_court_decisions_by_associated_law.pdf")
fig

In [9]:
# Share of documents for which no associated law was found.
len(df_org.loc[df_org["relevante_love_cleaned"].isna()]) / len(df_org)

0.3172826401621152

In [14]:
# Number of documents that have at least one associated law.
len(df_org[df_org["relevante_love_cleaned"].notna()])

28300

In [15]:
# Share of documents with at least one associated law. The numerator is
# computed from the data instead of a count pasted in by hand, so the figure
# stays correct when the input changes.
len(df_org[df_org["relevante_love_cleaned"].notna()]) / len(df_org)

0.6827173598378847

In [12]:
# Share of documents lacking any associated law (same figure as computed
# earlier in the notebook).
len(df_org.loc[df_org["relevante_love_cleaned"].isna()]) / len(df_org)

0.3172826401621152