In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
import plotly.express as px
from tqdm.auto import tqdm
import string

## Load data

In [3]:
# Load the two parquet inputs.
df_thi_kendes_for_ret = pd.read_parquet(
    "../../data/processed/pyarrow/UfR_thi_kendes_for_ret_partition.parquet"
)
df_html_gross = pd.read_parquet(
    "../../data/processed/pyarrow/UfR_text.parquet"
)

# No `on=` is given, so pandas joins on every column name the two frames share
# (inner join by default).
# NOTE(review): presumably the only shared column is "id_verdict" - confirm.
df = pd.merge(
    left=df_thi_kendes_for_ret,
    right=df_html_gross[["id_verdict", "verdict_text", "html_concat"]],
)

## Investigate headers


In [4]:
# Clean headers: lower-case every header and strip ASCII punctuation.
# The translation table does not depend on the header, so it is built once
# instead of once per header.
punctuation_table = str.maketrans('', '', string.punctuation)
df["headers_cleaned"] = df["headers"].apply(
    lambda headers: [header.lower().translate(punctuation_table) for header in headers]
)

# Merge cleaned headers onto which year the document is from
# (one row per header; rows are matched on the shared index).
a = pd.merge(df["headers_cleaned"].explode(), df["year"], left_index=True, right_index=True)

# Count headers (total): the "year" column of `gb` holds the number of
# non-null year values per cleaned header.
gb = a.groupby("headers_cleaned").count()

# Count headers per year. reset_index() exposes the index as an "index"
# column, so after count() that column holds the per-(year, header) count.
headers_over_time = a.reset_index().groupby(["year", "headers_cleaned"]).count().reset_index()

# Remove infrequent (and too frequent) headers: keep only headers whose total
# count lies strictly between the two thresholds.
MIN_HEADER_COUNT = 500
MAX_HEADER_COUNT = 40000
is_frequent = (gb["year"] < MAX_HEADER_COUNT) & (gb["year"] > MIN_HEADER_COUNT)
frequent_headers = list(gb[is_frequent].reset_index()["headers_cleaned"])
# `headers_over_time` already has a fresh integer index, so the mask can be
# built from it directly (no extra reset_index() needed).
headers_over_time_frequent_headers = headers_over_time[
    headers_over_time["headers_cleaned"].isin(frequent_headers)
]

# Summarize header count pr. year
headers_over_time_frequent_headers_count_years = (
    headers_over_time_frequent_headers.loc[:, ["year", "index"]]
    .groupby("year")
    .sum()
    .rename(columns={"index": "count_year"})
)

# Distribution of frequent headers pr. year (to make 100pct. stacked bar chart)
HoT_rel = pd.merge(
    headers_over_time_frequent_headers,
    headers_over_time_frequent_headers_count_years,
    left_on="year",
    right_on="year",
)
HoT_rel["header_shares_pr_year"] = HoT_rel["index"] / HoT_rel["count_year"]

In [5]:
# Create bar chart displaying the evolution of headers.
# A title and readable axis/legend labels are set so the figure can be read
# on its own.
fig = px.bar(
    HoT_rel,
    x="year",
    y="header_shares_pr_year",
    color="headers_cleaned",
    title="Share of frequent headers per year",
    labels={
        "year": "Year",
        "header_shares_pr_year": "Share of headers",
        "headers_cleaned": "Header",
    },
)
# Show the y-axis as whole percentages on a fixed 0-100% scale. This call is
# the last expression of the cell, so its return value is what gets displayed.
fig.update_yaxes(tickformat=".0%", range=[0, 1])

In [6]:
import gensim
import string
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Join each row's text pieces with single spaces; rows holding None stay None.
df["thi_kendes_for_ret_string"] = df["thi_kendes_for_ret_text"].apply(
    lambda parts: None if parts is None else " ".join(parts)
)
def tokenize_string(s):
    """Split a string into a list of tokens with gensim's tokenizer.

    Tokens are requested lower-cased (``lowercase=True``) and with accent
    marks removed (``deacc=True``).

    Parameters
    ----------
    s : str or None
        Text to tokenize.

    Returns
    -------
    list
        The tokens of ``s``. An empty list is returned when ``s`` is None or
        when tokenizing raises ``UnicodeDecodeError``.
    """
    if s is None:
        return []
    try:
        # The tokenizer result is materialised inside the try block so that a
        # decode error raised while consuming it is caught as well.
        tokens = gensim.utils.tokenize(s, lowercase=True, deacc=True)
        return list(tokens)
    except UnicodeDecodeError:
        return []

# Tokenize every document (progress_apply shows a tqdm progress bar).
df["tokenized_text_thi_kendes_for_ret"] = df["thi_kendes_for_ret_string"].progress_apply(tokenize_string)

# Build the vocabulary from all tokenized documents.
dct = gensim.corpora.Dictionary(df["tokenized_text_thi_kendes_for_ret"])

# Convert each document to (token_id, count) pairs. The dictionary was just
# built from these same documents, so every token is already known;
# allow_update=True is therefore not passed, since it would register each
# document in the dictionary's statistics a second time.
bow_corpus = [dct.doc2bow(doc) for doc in tqdm(df["tokenized_text_thi_kendes_for_ret"])]
token_id = dict(dct.token2id)
model = gensim.models.tfidfmodel.TfidfModel(bow_corpus)
id_token = {v: k for k, v in token_id.items()}

# Same bag-of-words corpus, keyed by token string instead of token id.
bow_with_names = [{id_token[x]: y for x, y in doc} for doc in tqdm(bow_corpus)]

# Sum each token's count over all documents, starting from zero for every
# token in the vocabulary.
term_frequency_dict = dict.fromkeys(token_id, 0)
for doc in bow_with_names:
    for key, freq in doc.items():
        term_frequency_dict[key] += freq

# One row per token, highest total count first; after reset_index() the token
# is in column "index" and its total count in column 0.
df_bow = pd.DataFrame.from_dict(term_frequency_dict, orient='index')
df_bow = df_bow.sort_values(by=0, ascending=False)
df_bow = df_bow.reset_index()

100%|██████████| 63915/63915 [00:05<00:00, 11769.27it/s]
100%|██████████| 63915/63915 [00:02<00:00, 22253.09it/s]
100%|██████████| 63915/63915 [00:00<00:00, 218139.23it/s]


In [7]:
# Scratch cell: this tuple of strings is only displayed, not assigned to a name.
# NOTE(review): presumably the section headers of interest - confirm, or
# assign it to a named constant if later cells need it.
'Påstande', 'Sagens oplysninger','Rettens begrundelse og afgørelse', 'Strafudmålingen', 'Thi kendes for ret'

('Påstande',
 'Sagens oplysninger',
 'Rettens begrundelse og afgørelse',
 'Strafudmålingen',
 'Thi kendes for ret')

In [8]:
def remove_header_items(header):
    """Return the items of ``header`` that match the pattern below, as a list.

    NOTE(review): the pattern ``\\r*`` (zero or more carriage returns) matches
    at the start of every string, so no string item is ever removed and the
    result is simply ``list(header)``. Confirm which items were meant to be
    dropped and tighten the pattern accordingly.

    Parameters
    ----------
    header : iterable of str
        Header strings of one document.

    Returns
    -------
    list of str
        The items for which the pattern matches, in their original order.
    """
    carriage_returns = re.compile("\\r*")
    return [item for item in header if carriage_returns.match(item)]

In [9]:
# NOTE(review): this replaces the "headers_cleaned" column that the earlier
# "Clean headers" cell filled with lower-cased, punctuation-free headers.
# Confirm the overwrite is intended, or write to a differently named column.
df["headers_cleaned"] = df["headers"].map(remove_header_items)

In [30]:
# Display one example token list containing the token "kendes": restrict to
# the matching rows, keep the last ten of them, and show the second of those.
(
    df.loc[
        df["tokenized_text_thi_kendes_for_ret"].apply(lambda tokens: "kendes" in tokens),
        "tokenized_text_thi_kendes_for_ret",
    ]
    .iloc[-10:]
    .iloc[1]
)

['de',
 'sagsøgte',
 'b',
 'og',
 'foreningen',
 'anima',
 'frifindes',
 'b',
 'og',
 'foreningen',
 'animas',
 'udtalelser',
 'i',
 'brev',
 'af',
 'februar',
 'om',
 'nu',
 'er',
 'der',
 'kommet',
 'nye',
 'afsløringer',
 'fra',
 'danske',
 'minkfarme',
 'optagelserne',
 'viser',
 'at',
 'selv',
 'toppen',
 'af',
 'de',
 'danske',
 'minkavlere',
 'er',
 'ansvarlige',
 'for',
 'grov',
 'mishandling',
 'pa',
 'medlemmer',
 'i',
 'bestyrelsen',
 'for',
 'kopenhagen',
 'fur',
 'blev',
 'grusom',
 'dyremishandling',
 'afdækket',
 'kendes',
 'ubeføjede',
 'jf',
 'straffelovens',
 'b',
 'og',
 'foreningen',
 'anima',
 'tilpligtes',
 'at',
 'betale',
 'kr',
 'til',
 'dækning',
 'af',
 'omkostningerne',
 'ved',
 'kundgørelse',
 'i',
 'en',
 'eller',
 'flere',
 'offentlige',
 'aviser',
 'af',
 'domsslutningen',
 'og',
 'domsgrundene',
 'b',
 'og',
 'foreningen',
 'anima',
 'tilpligtes',
 'in',
 'solidum',
 'at',
 'betale',
 'x',
 'vildtfarm',
 'v',
 'y',
 'en',
 'godtgørelse',
 'pa',
 'kr',
 