In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import gensim
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
import plotly.graph_objects as go

tqdm.pandas()

## Load data

In [3]:
# Verdict texts: the base table that every later cell builds on.
df = pd.read_parquet(path="../../data/processed/pyarrow/UfR_text.parquet")


In [4]:
# Load the two side tables and attach them to the verdict texts.
# No join key is passed, so pandas joins on the columns the frames share.
df_associated_laws = pd.read_parquet("../../data/processed/pyarrow/UfR_associated_laws.parquet")
df_kendelse = pd.read_parquet("../../data/processed/pyarrow/UfR_kendelse.parquet")
# Outer join keeps verdicts without associated laws; the second join is the
# default inner join, so only verdicts present in df_kendelse survive.
df_m = df.merge(df_associated_laws, how="outer").merge(df_kendelse)
# Integer years are needed for the per-year loop and the interval binning below.
df_m["year"] = df_m["year"].astype(int)

## Generate term frequency (TF) and share of documents containing each term, per year

In [5]:
# For every year, build two Series indexed by term and named after the year:
#   tf_years_count -> total occurrences of the term in that year's verdicts
#   tf_years_share -> share of that year's verdicts containing the term at least once
tf_years_count = []
tf_years_share = []
tokenizer = lambda x: gensim.utils.tokenize(x, lower=True)
# Iterate chronologically: `unique()` yields years in order of first appearance,
# which would make any later positional labelling of the results unreliable.
for year in tqdm(sorted(df_m["year"].unique())):
    vectorizer = CountVectorizer(tokenizer=tokenizer)
    X = vectorizer.fit_transform(df_m.loc[df_m["year"] == year, "verdict_text"])
    terms = vectorizer.get_feature_names_out()
    # Reduce directly on the sparse matrix; X.toarray() would allocate a dense
    # (documents x vocabulary) array for every single year.
    tf_year_count = pd.Series(np.asarray(X.sum(axis=0)).ravel(), index=terms, name=year)
    docs_with_term = np.asarray((X > 0).sum(axis=0)).ravel()
    tf_year_share = pd.Series(docs_with_term / X.shape[0], index=terms, name=year)
    tf_years_count.append(tf_year_count)
    tf_years_share.append(tf_year_share)

100%|██████████| 155/155 [25:48<00:00,  9.99s/it]


In [None]:
# Assemble term x year tables from the per-year Series. A term that does not
# occur in a given year is missing from that year's Series, hence fillna(0).
df_term_frequency = pd.concat(tf_years_count, axis=1).fillna(0)
df_term_share = pd.concat(tf_years_share, axis=1).fillna(0)
# Both lists were filled in the same loop, so column i refers to the same year
# in both tables. Reuse the year labels carried by the frequency table instead
# of a hard-coded range(1867, 2022), which silently mislabels the columns when
# the years were not processed in ascending, gap-free order.
df_term_share.columns = df_term_frequency.columns

In [None]:
# Persist both term tables so the slow per-year vectorisation need not be re-run.
df_term_frequency.to_pickle(path="../../data/processed/pyarrow/term_frequency.pkl")
df_term_share.to_pickle(path="../../data/processed/pyarrow/term_document_share.pkl")

In [None]:
# Reload the cached term tables written by the previous cell.
# Pickle files can execute code on load; only read files this notebook produced.
df_term_frequency, df_term_share = (
    pd.read_pickle("../../data/processed/pyarrow/term_frequency.pkl"),
    pd.read_pickle("../../data/processed/pyarrow/term_document_share.pkl"),
)

## Count of words over time

In [None]:

# Bar chart of the total number of words per year, saved as HTML and PDF.
# "antal_ord" (words per verdict) is otherwise only created in a later cell, so
# derive it here when it is missing; this keeps Restart & Run All working.
# The definition matches the later cell: pieces after splitting on single spaces.
if "antal_ord" not in df_m.columns:
    df_m["antal_ord"] = df_m["verdict_text"].apply(lambda text: len(text.split(" ")))

fig = px.bar(df_m.groupby("year")["antal_ord"].sum(), template="ggplot2")
# The update_* methods modify the figure in place, so no re-assignment is needed.
fig.update_yaxes(title="Count of words, millions")
fig.update_xaxes(showgrid=False)
fig.update_layout(showlegend=False, margin=dict(l=0, r=0, b=0, t=0))
fig.write_html("../../data/plots/count_of_words.html")
fig.write_image("../../data/plots/count_of_words.pdf")
fig

## Word count over custom time interval

In [None]:
def assign_interval(year, interval=(1867, 1920, 1950, 1980, 1995, 2010, 2022)):
    """Return the start year of the custom period that contains `year`.

    Parameters
    ----------
    year : int
        Year to classify.
    interval : sequence of int, optional
        Ascending period boundaries. Consecutive pairs form half-open periods
        ``[start, stop)``; the last value is therefore an exclusive upper bound.
        Defaults to the periods used throughout this notebook.

    Returns
    -------
    int or None
        The ``start`` of the matching period, or ``None`` when `year` lies
        outside ``[interval[0], interval[-1])``.
    """
    # Compare against the boundaries directly instead of rebuilding a list of
    # range objects on every call (this function is applied once per row).
    for start, stop in zip(interval[:-1], interval[1:]):
        if start <= year < stop:
            return start
    return None

# Label every verdict with the start year of its custom period.
df_m["custom_year_interval"] = (
    df_m["year"].astype(int).apply(assign_interval)
)
# Words per verdict, counted as the pieces obtained by splitting on single spaces.
df_m["antal_ord"] = df_m["verdict_text"].progress_apply(
    lambda text: len(text.split(" "))
)



100%|██████████| 63915/63915 [00:22<00:00, 2804.37it/s] 


In [None]:
# Total words and number of documents per custom period (indexed by period start year).
(
    df_m.groupby("custom_year_interval")["antal_ord"]
    .agg(["sum", "count"])
    .rename(columns={"sum": "count of words", "count": "Count of documents"})
)



Unnamed: 0_level_0,count of words,Count of documents
custom_year_interval,Unnamed: 1_level_1,Unnamed: 2_level_1
1867,10118311,10617
1920,14189946,16106
1950,21265434,14097
1980,10875548,6374
1995,29138319,9846
2010,37171902,6875


## Create table kendelse/ikke-kendelse for law association 

In [None]:
# Rank laws by how many verdicts cite them and keep the 20 most cited.
# Only the id column is counted; counting every column is wasted work.
df_relevante_love_top_x = (
    df_m.explode("relevante_love_cleaned")
    .groupby("relevante_love_cleaned")["id_verdict"]
    .count()
    .sort_values(ascending=False)[:20]
)
love_top_x = list(df_relevante_love_top_x.index)

# Relabel only verdicts that actually have associated laws. The drop is limited
# to the law column: a bare dropna() also discards verdicts with a missing value
# in any other column, which would then show up as having no law association.
df_relevante_love = df_m.dropna(subset=["relevante_love_cleaned"]).copy()
# Every law outside the top 20 is collapsed into the bucket "anden_lov".
df_relevante_love["relevante_love_cleaned"] = df_relevante_love["relevante_love_cleaned"].apply(
    lambda love: [lov if lov in love_top_x else "anden_lov" for lov in love]
)
# Re-attach the relabelled lists to the full table; verdicts without laws get
# a missing value in "relevante_love_cleaned" through the outer join.
df_relevante_love = pd.merge(
    df_m.drop(columns=["relevante_love_cleaned"]),
    df_relevante_love.loc[:, ["id_verdict", "relevante_love_cleaned"]],
    left_on="id_verdict",
    right_on="id_verdict",
    how="outer",
)


In [None]:
# Number of verdicts and mean word count, split by the `not_kendelse` flag,
# restricted to verdicts that have at least one associated law.
has_association = df_relevante_love["relevante_love_cleaned"].notna()
df_relevante_love_total_only_relevant_laws_included = (
    df_relevante_love.loc[has_association]
    .groupby("not_kendelse")
    .agg({"id_verdict": "count", "antal_ord": "mean"})
    .rename(columns={"id_verdict": "count of judgements", "antal_ord": "avg_number_of_words"})
    .T
)
df_relevante_love_total_only_relevant_laws_included

not_kendelse,False,True
count of judgements,8025.0,20275.0
avg_number_of_words,1376.596636,3224.030875


In [None]:
# Same summary as above, but for all verdicts from 1950 onwards
# (with or without associated laws).
from_1950 = df_relevante_love["year"].astype(int) >= 1950
df_relevante_love_total_1950_2021 = (
    df_relevante_love.loc[from_1950]
    .groupby("not_kendelse")
    .agg({"id_verdict": "count", "antal_ord": "mean"})
    .rename(columns={"id_verdict": "count of judgements", "antal_ord": "avg_number_of_words"})
    .T
)
df_relevante_love_total_1950_2021

not_kendelse,False,True
count of judgements,9375.0,27817.0
avg_number_of_words,1321.509333,3093.86537


In [None]:
# Same summary over the complete corpus, without any filtering.
df_total = (
    df_relevante_love.groupby("not_kendelse")
    .agg({"id_verdict": "count", "antal_ord": "mean"})
    .rename(columns={"id_verdict": "count of judgements", "antal_ord": "avg_number_of_words"})
    .T
)
df_total

not_kendelse,False,True
count of judgements,10706.0,53209.0
avg_number_of_words,1230.071735,2059.619839


In [None]:
# One row per law (top-20 laws plus the "anden_lov" bucket), with verdict count
# and mean word count side by side for each value of `not_kendelse`.
# A verdict citing several laws contributes to each of them after explode().
relevante_love_exploded_pivot = (
    df_relevante_love.explode("relevante_love_cleaned")[
        ["relevante_love_cleaned", "antal_ord", "not_kendelse", "id_verdict"]
    ]
    .groupby(["relevante_love_cleaned", "not_kendelse"])
    .agg({"id_verdict": "count", "antal_ord": "mean"})
    .reset_index()
    .rename(columns={"id_verdict": "count of judgements", "antal_ord": "avg_number_of_words"})
    .pivot(index="relevante_love_cleaned", columns="not_kendelse")
)
relevante_love_exploded_pivot

Unnamed: 0_level_0,count of judgements,count of judgements,avg_number_of_words,avg_number_of_words
not_kendelse,False,True,False,True
relevante_love_cleaned,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
aftaleloven,23,445,2995.913043,5569.166292
anden_lov,1801,8771,2070.602998,4703.30236
erstatningsansvarsloven,10,603,2181.3,5850.296849
forsikringsaftaleloven,11,388,2342.545455,3366.427835
funktionærloven,11,676,2465.090909,3424.204142
færdselsloven,172,2185,1418.872093,1404.584439
grundloven,23,276,3513.304348,7084.221014
indkomst og formueskat til staten,12,757,1176.166667,3253.003963
kildeskatteloven,10,204,914.2,3735.980392
konkurrenceloven,5,264,2390.8,4697.905303


In [None]:
# print("Alle domme (antal):"a)
# print(f"{sum(df['nægtet_sig_skyldig'])=}")
# print(f"{sum(df['thi_kendes_for_ret'])=}")
# print("")
# print("Efter 1950 domme (antal):")
# print(f"{sum(df.loc[df['year'].astype(int)>=1950,'nægtet_sig_skyldig'])=}")
# print(f"{sum(df.loc[df['year'].astype(int)>=1950,'thi_kendes_for_ret'])=}")
# print("")
# print("Thi kendes for ret opdelt på domsstol:")
# print(df.groupby(["thi_kendes_for_ret","document_category"])["id_verdict"].count().reset_index().set_index("document_category").pivot(columns={"thi_kendes_for_ret"}))
# print("")
# print("Thi kendes for ret, antal:")
# print(df.groupby("thi_kendes_for_ret_antal").count()["id_verdict"])

In [None]:
# Build the vocabulary. Constructing the Dictionary from the documents already
# records its document and collection frequencies.
dct = gensim.corpora.Dictionary(df["tokenized_text"])
# Convert each document to (token_id, count) pairs. `allow_update` is left at
# its default (False): the dictionary has already seen these documents, and
# updating it again would count every document twice in its statistics.
bow_corpus = [dct.doc2bow(doc) for doc in tqdm(df["tokenized_text"])]
token_id = dict(dct.token2id)
model = gensim.models.tfidfmodel.TfidfModel(bow_corpus)
# Reverse mapping id -> token, used to make the bag-of-words readable.
id_token = {v: k for k, v in token_id.items()}
bow_with_names = [{id_token[x]: y for x, y in doc} for doc in tqdm(bow_corpus)]
# Corpus-wide term frequency: sum each token's per-document counts.
term_frequency_dict = dict.fromkeys(token_id, 0)
for doc in bow_with_names:
    for key, count in doc.items():
        term_frequency_dict[key] += count
# Single column (named 0) of total counts per token, most frequent first.
df_bow = pd.DataFrame.from_dict(term_frequency_dict, orient="index")
df_bow = df_bow.sort_values(by=0, ascending=False)