In [38]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()


from nltk.corpus import stopwords


#### Prepare data

In [39]:
# Load the label table and the UfR text table from their parquet files.
df_label = pd.read_parquet("../../data/processed/pyarrow/code_sagsomkostninger.parquet")
danish_stopwords = stopwords.words('danish')

# Keep only the rows that actually carry a LABEL_party_who_pays value.
df_label = df_label.dropna(subset=["LABEL_party_who_pays"])

df_ufr = pd.read_parquet("../../data/processed/pyarrow/UfR_text.parquet")

# Join the three label columns onto the text table. No key is given, so
# pandas joins on whatever columns the two frames have in common.
label_columns = ["id_verdict", "LABEL_party_who_pays", "omkostninger"]
df = df_label[label_columns].merge(df_ufr)

#### Find criminal cases

In [40]:
import re
def straffesag(s):
    """Tell whether a prosecutor string marks the ruling as a criminal case.

    The ruling is flagged when ``s`` contains "rigsadvokat" or
    "anklagemyndighed" anywhere in the text, ignoring letter case.

    Parameters
    ----------
    s : str or missing value
        Prosecutor field of a ruling. Anything that is not a ``str``
        (e.g. ``None`` or ``NaN`` from a missing cell) counts as "no match"
        rather than raising ``TypeError`` inside ``re.search``.

    Returns
    -------
    bool
        ``True`` if one of the two keywords occurs in ``s``, else ``False``.
    """
    # Missing cells reach .apply() as None / float NaN; re.search cannot scan those.
    if not isinstance(s, str):
        return False
    return re.search("rigsadvokat|anklagemyndighed", s, re.IGNORECASE) is not None

# Flag every labelled ruling using its prosecutor field.
df_label["straffesag"] = df_label["prosecutor"].apply(straffesag)

# Bring the party names and the new flag onto the merged frame, matched by verdict id.
party_columns = ["id_verdict", "defendant", "prosecutor", "straffesag"]
df = df.merge(df_label[party_columns], on="id_verdict")


#### Produce descriptive cross-table

In [97]:
# Non-null counts of every column per (straffesag, LABEL_party_who_pays) pair, exported to Excel.
label_counts = df.groupby(["straffesag", "LABEL_party_who_pays"]).count()
label_counts.to_excel("../../data/tables/label_descriptives.xlsx")

In [99]:
# Merge in document_category from the UfR table (joined on the columns both frames share).
court_columns = ["id_verdict", "document_category"]
df = df.merge(df_ufr[court_columns])

# Non-null counts per document_category / straffesag / label combination.
df_gb = df.groupby(["document_category", "straffesag", "LABEL_party_who_pays"]).count()

# Spread the straffesag flag across columns. pivot_table aggregates with its
# default (mean); each cell is fed by exactly one row of df_gb.
court_table = df_gb.pivot_table(
    index=["document_category", "LABEL_party_who_pays"],
    columns="straffesag",
    values="id_verdict",
)
court_table.to_excel("../../data/tables/label_distribution_by_court.xlsx")

In [100]:
# Display the grouped counts (document_category x straffesag x LABEL_party_who_pays).
df_gb

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id_verdict,omkostninger,verdict_text,title,href,year,html_concat,defendant,prosecutor
document_category,straffesag,LABEL_party_who_pays,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Byretterne,False,DEF,19,19,19,19,19,19,19,19,19
Byretterne,False,NO_COST,24,24,24,24,24,24,24,24,24
Byretterne,False,PROC,9,9,9,9,9,9,9,9,9
Højesteret,False,DEF,1407,1407,1407,1407,1407,1407,1407,1407,1407
Højesteret,False,NO_COST,1318,1318,1318,1318,1318,1318,1318,1318,1318
Højesteret,False,PROC,3750,3750,3750,3750,3750,3750,3750,3750,3750
Højesteret,True,DEF,1019,1019,1019,1019,1019,1019,1019,1019,1019
Højesteret,True,NO_COST,1,1,1,1,1,1,1,1,1
Højesteret,True,PROC,840,840,840,840,840,840,840,840,840
Sø- og Handelsretten,False,DEF,320,320,320,320,320,320,320,320,320


#### Produce spreadsheet for manual encoding

In [42]:
# Draw a 5 % sample within every (straffesag, LABEL_party_who_pays) group for
# manual coding. A fixed seed makes the exported spreadsheet reproducible on a
# fresh kernel; without it every run hands the coder a different set of rulings.
SAMPLE_SEED = 42

# LABEL_party_who_pays is left out of the export (presumably so the coder is
# not influenced by the automatic label -- confirm with the coding protocol).
coding_columns = ["id_verdict", "omkostninger", "prosecutor", "defendant", "straffesag", "verdict_text"]

df_gb = (
    df.groupby(["straffesag", "LABEL_party_who_pays"])
    .sample(frac=0.05, random_state=SAMPLE_SEED)
    .loc[:, coding_columns]
    .reset_index(drop=True)  # fresh 0..n-1 row ids, written as the first Excel column
)

# Shuffle the rows so they are not ordered by group in the spreadsheet.
df_gb = df_gb.sample(frac=1, random_state=SAMPLE_SEED)
df_gb.to_excel("manual_coding_of_labels.xlsx")

#### Evaluate manual encoding

In [52]:
# Read the manually coded sheet, then attach the automatic label of each verdict
# (no key is given, so pandas joins on the columns the two frames share).
df_encoded = pd.read_csv("../../data/tables/encoded-manual-labels.csv")
auto_labels = df[["id_verdict", "LABEL_party_who_pays"]]
df_encoded = df_encoded.merge(auto_labels)

In [115]:
# A row is "right" when the "Hvem betaler?" column (presumably the manual
# coder's answer -- confirm) equals the automatic label.
df_encoded["Right_classification"] = df_encoded["Hvem betaler?"].eq(df_encoded["LABEL_party_who_pays"])

n_total = len(df_encoded)
is_wrong = df_encoded["Right_classification"].eq(False)
n_wrong = len(df_encoded.loc[is_wrong])

# Number of agreeing rows, then the agreement rate.
print(n_total - n_wrong)
print(1 - n_wrong / n_total)

686
0.9884726224783862


In [None]:
# Save the manual-coding evaluation table (df_encoded) to CSV.
df_encoded.to_csv("../../data/tables/manual_encoded_labels.csv")

In [114]:
# Drop the rows that are both flagged as straffesag and labelled NO_COST; keep everything else.
is_criminal = df["straffesag"] == True
is_no_cost = df["LABEL_party_who_pays"] == "NO_COST"
df_all_rulings = df.loc[~(is_criminal & is_no_cost)].reset_index(drop=True)

# Export verdict id and label only.
export_columns = ["id_verdict", "LABEL_party_who_pays"]
df_all_rulings[export_columns].to_csv("../../data/tables/classified_rulings.csv")