In [20]:
import pandas as pd
import re 
import string
from tqdm.auto import tqdm 
tqdm.pandas()

import logging
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis.gensim_models
import pyLDAvis

from nltk.corpus import stopwords

import plotly.express as px


In [17]:
# Load the processed verdict texts; df_org stays as the untouched original
# while all later columns are added to the working copy df.
verdicts_path = "../data/processed/pyarrow/UfR_text.parquet"
df_org = pd.read_parquet(verdicts_path)
df = df_org.copy()

In [3]:
# Danish stop-word list from NLTK's stopwords corpus; preprocessing() removes
# these tokens. Assumes the NLTK "stopwords" corpus is already installed
# locally -- TODO confirm for fresh environments.
stop = stopwords.words('danish')

def preprocessing(s, stop_words=None):
  """Turn one raw text into a list of cleaned, lower-cased tokens.

  Steps, in order: newline characters and ASCII digits are replaced by
  spaces, the text is lower-cased, ASCII punctuation (string.punctuation) is
  deleted, the text is split on whitespace, one-character tokens are dropped
  and stop words are removed.

  Punctuation is deleted rather than replaced by a space, so "a,b" becomes
  the single token "ab".

  Args:
    s: The text to tokenise. Anything pd.isna() reports as missing
      (e.g. None or NaN) yields an empty list.
    stop_words: Optional iterable of tokens to discard. When omitted, the
      module-level `stop` list is used.

  Returns:
    list of str: the remaining tokens in their original order.
  """
  if pd.isna(s):
    return []

  if stop_words is None:
    stop_words = stop
  # Hash-based membership test instead of scanning a list once per token.
  stop_lookup = frozenset(stop_words)

  # newline characters -> space
  s = s.replace("\n", " ")
  # ASCII digits -> space
  s = re.sub('[0-9]', ' ', s)
  s = s.lower()
  # delete (not replace) ASCII punctuation
  s = s.translate(str.maketrans('', '', string.punctuation))

  # str.split() without arguments never yields empty strings, so no separate
  # "remove empty" pass is needed.
  return [tok for tok in s.split() if len(tok) > 1 and tok not in stop_lookup]

# Tokenise every verdict text (progress_apply shows a tqdm progress bar) and
# store the resulting token lists in a new "tokens" column.
verdict_tokens = df["verdict_text"].progress_apply(preprocessing)
df["tokens"] = verdict_tokens

100%|██████████| 63915/63915 [04:18<00:00, 247.36it/s]


In [4]:
# One token list per row of df, in row order. preprocessing() returns [] for
# missing texts (never None), so nothing is filtered out here: the corpus built
# from this list must stay positionally aligned with df, because the per-document
# topics are later assigned back to df by position.
all_tokens = df["tokens"].to_list()

In [5]:
# Vocabulary (token <-> integer id) built from all tokenised verdicts.
id2word = Dictionary(all_tokens)
# Prune the vocabulary: no_below=7 and no_above=.2 drop tokens found in fewer
# than 7 documents or in more than 20% of all documents; keep_n=None puts no
# cap on the number of tokens kept afterwards.
id2word.filter_extremes(no_below=7, no_above=.2, keep_n=None)
# Bag-of-words representation of every document, with a progress bar.
corpus = list(map(id2word.doc2bow, tqdm(all_tokens)))


100%|██████████| 63915/63915 [00:38<00:00, 1658.77it/s]


In [46]:
# Send INFO-and-above log records to the file gensim.log, one
# "timestamp:level:message" line per record (the file name suggests this is
# meant to capture gensim's training output).
log_format = "%(asctime)s:%(levelname)s:%(message)s"
logging.basicConfig(filename='gensim.log', format=log_format, level=logging.INFO)

In [47]:
num_topics = 20

# Train the LDA model. The topic count is taken from num_topics (it used to be
# hard-coded here as well), so the model and the file names derived from
# num_topics cannot drift apart.
# random_state fixes the seed of the model's random initialisation so reruns
# start from the same state; multi-process training may still not be
# bit-for-bit identical between runs.
lda_model = LdaMulticore(
    corpus=corpus,
    num_topics=num_topics,
    id2word=id2word,
    passes=6,
    iterations=400,
    eval_every=1,
    random_state=42,
)

pyLDAvis.enable_notebook()
# NOTE(review): pyLDAvis' prepare() re-orders topics by default
# (sort_topics=True), so the topic numbers shown in the visualisation need not
# equal gensim's topic id + 1. Pass sort_topics=False if they are meant to line
# up with the topic labels used in the later bar chart -- confirm intent.
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(LDAvis_prepared, f"../data/plots/ldavis_num_topics_{num_topics}.html")
# Last expression of the cell: renders the interactive visualisation inline.
LDAvis_prepared


  default_term_info = default_term_info.sort_values(


In [48]:
# Topic distribution of every document in corpus order, as returned by
# lda_model.get_document_topics for each bag-of-words; tqdm shows progress.
get_document_topics = list(map(lda_model.get_document_topics, tqdm(corpus)))

100%|██████████| 63915/63915 [02:00<00:00, 531.60it/s] 


In [54]:
# Attach each document's list of (topic_id, weight) pairs to its row
# (assigned by position, so the list must have one entry per row of df).
df["topics"] = get_document_topics

# Long format: one row per (verdict, topic) pair. The column is exploded once
# and both its values and its index are reused.
topics_long = df["topics"].explode()
topic_weights = pd.DataFrame(
    topics_long.tolist(),
    index=topics_long.index,
    columns=["topic_id", "distribution"],
)
df_with_topics = pd.merge(df, topic_weights, left_index=True, right_index=True)

# Keep only the highest-weight topic per verdict: sort descending by verdict
# and weight, then keep the first row of each verdict. The result is therefore
# ordered by descending verdict index.
df_with_topics["verdict_index"] = df_with_topics.index
df_with_topics = (
    df_with_topics
    .sort_values(["verdict_index", "distribution"], ascending=False)
    .drop_duplicates("verdict_index")
    .rename(columns={"topic_id": "most_prevalent_topic", "distribution": "prevalence_of_topic"})
)

In [9]:
# Number of verdicts (non-null id_verdict values) per year and dominant topic.
ct_topic_year = (
    df_with_topics
    .groupby(["year", "most_prevalent_topic"])["id_verdict"]
    .count()
    .reset_index()
)

# Total number of verdicts per year. The sum runs over every remaining column,
# so the summed most_prevalent_topic column in this frame carries no meaning;
# only total_verdicts_in_year is used downstream.
yearly_sums = ct_topic_year.groupby("year").sum()
verdicts_pr_year = (
    yearly_sums
    .rename(columns={"id_verdict": "total_verdicts_in_year"})
    .reset_index()
)

In [27]:
# Add each year's total to the per-topic counts and derive the share of the
# year's verdicts whose dominant topic is the given topic.
yearly_totals = verdicts_pr_year.loc[:, ["year", "total_verdicts_in_year"]]
ct_topic_year_share = ct_topic_year.merge(yearly_totals, left_on="year", right_on="year")
ct_topic_year_share["topic_proportion"] = (
    ct_topic_year_share["id_verdict"] / ct_topic_year_share["total_verdicts_in_year"]
)

ct_topic_year_share["year"] = ct_topic_year_share["year"].astype(int)
# Topic ids are shifted to start at 1 and stored as strings.
topic_labels = ct_topic_year_share["most_prevalent_topic"].astype(int) + 1
ct_topic_year_share["most_prevalent_topic"] = topic_labels.astype(str)

# Stacked bar chart of the yearly share of each dominant topic; the y axis is
# shown as whole percentages from 0% to 100%.
fig = px.bar(
    ct_topic_year_share,
    x="year",
    y="topic_proportion",
    color="most_prevalent_topic",
)
fig.update_yaxes(tickformat=".0%", range=[0, 1])
fig

In [69]:

# Persist the verdicts together with their dominant LDA topic; the number of
# topics is encoded in the file name.
output_path = f"../data/processed/pyarrow/UfR_with_LDA_topics_num_topics_{num_topics}.parquet"
df_with_topics.to_parquet(output_path)