In [1]:
import os
import re
import spacy 
from spacy import displacy
import json
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the JSON file into a dictionary
with open('../data/prediction_data/issues.json') as f:
    issues_data = [json.loads(line) for line in f]
issues_df = pd.DataFrame(issues_data)

# Load the JSON file into a dictionary
with open('../data/rawish_data/facts.json') as f:
    facts_data = [json.loads(line) for line in f]
raw_facts_df = pd.DataFrame(facts_data)

# Merge DataFrames
processed_df = pd.merge(raw_facts_df, issues_df, on='casename', how='outer')

# Display the resulting DataFrame
print(processed_df.head())

           casename                                              facts  \
0   2000_SGCA_1.pdf  The facts\nThe appellant is the widow of one T...   
1  2000_SGCA_10.pdf  facts and surrounding circumstances including ...   
2  2000_SGCA_11.pdf  Background \nThe first appellants, a French co...   
3  2000_SGCA_12.pdf  Background\nMicrosoft, Adobe and Autodesk are ...   
4  2000_SGCA_13.pdf  facts. Mere assertion would not suffice. In ex...   

                                              issues  
0  The claim was dismissed with costs by the\nHig...  
1  the claim and\nagainst that decision this appe...  
2  The appeal \nThe questions which arise in this...  
3  the appeals from the assistant registrar. In h...  
4  the appeal on 24 January 2000 and dismissed it...  


In [4]:
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'\W*\b(?!no)\w{1,2}\b', '', text)
    stop_words = set(stopwords.words('english'))
    legal_stopwords = ('appellant', 'respondent', 'plaintiff', 'defendant', 'mr', 'mrs', 'dr', 'mdm', 'court','version', 'hr', 'would', 'case', 'sghc', 'court', 'sgca', 'slr', 'sgdc', 'also', 'first', 'person', 'statement', 'line', 'para', 'fact', 'one', 'may', 'time', 'could', 'next', 'legal', 'issues', 'issue')
    stop_words.update(legal_stopwords)
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return words

In [17]:
data = []

processed_df['processed_facts'] = processed_df['facts'].apply(preprocess_text)
processed_df.drop(columns=['facts'], inplace=True)
print(processed_df["processed_facts"])
print("Finished preprocessing text")

texts, article = [], []
print("Performing topic modelling")
for fact in processed_df['processed_facts']:
    for word in fact:
        article.append(word)
        
    texts.append(article)
    article = []

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

#initialize parameters
best_coherence = -1
best_lda = None
for num_topics in range(3, 31, 1):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=42)
    
    # Get coherence score for each loop
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    
    print(f"Number of topics: {num_topics}, Coherence Score: {coherence_score}")
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_topic = num_topics
print(f"Best no of topic: {best_topic} and Best Coherence Score: {best_coherence}")

#use the best model (result from above codes: 10 topics)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=best_topic,
                                                random_state=42)

#inspiration from https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
topics_matrix = lda_model[corpus]
topics = []
# Iterate over each document's topic distribution
# Get the topic with the highest probability
for doc in topics_matrix:
    topic = max(doc, key=lambda x: x[1])[0]
    topics.append(topic)

processed_df['facts_topic'] = topics
processed_df = processed_df.reset_index(drop=True)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

0       [fact, widow, tan, geok, tee, deceased, sue, c...
1       [fact, surrounding, circumstance, including, a...
2       [background, appellant, french, company, secon...
3       [background, microsoft, adobe, autodesk, compa...
4       [fact, mere, assertion, suffice, exh, said, st...
                              ...                        
8515    [fact, accused, low, sze, song, low, year, old...
8516    [fact, giving, opinion, representation, amount...
8517    [fact, party, karan, bagga, litigant, proceedi...
8518                                                   []
8519    [fact, party, towa, company, incorporated, jap...
Name: processed_facts, Length: 8520, dtype: object
Finished preprocessing text
Performing topic modelling
Number of topics: 3, Coherence Score: 0.39324704976329333
Number of topics: 4, Coherence Score: 0.42437353670658984
Number of topics: 5, Coherence Score: 0.4170010346025063
Number of topics: 6, Coherence Score: 0.39440271026639556
Number of topics: 7, Cohe

In [5]:
#Same concept as above
data = []
processed_df['processed_issues'] = processed_df['issues'].apply(preprocess_text)
processed_df.drop(columns=['issues'], inplace=True)
print(processed_df["processed_issues"])
print("Finished preprocessing text")

texts, article = [], []
print("Performing topic modelling")
for issue in processed_df['processed_issues']:
    for word in issue:
        article.append(word)
        
    texts.append(article)
    article = []

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

best_coherence = -1
best_lda = None
for num_topics in range(3, 31, 1):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=42)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    
    print(f"Number of topics: {num_topics}, Coherence Score: {coherence_score}")
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_topic = num_topics
print(f"Best no of topic: {best_topic} and Best Coherence Score: {best_coherence}")

#use the best model (result from above codes: 25 topics)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=best_topic,
                                                random_state=42)

#inspiration from https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
topics_matrix = lda_model[corpus]
topics = []
# Iterate over each document's topic distribution
# Get the topic with the highest probability
for doc in topics_matrix:
    topic = max(doc, key=lambda x: x[1])[0]
    topics.append(topic)
    
processed_df['issues_topic'] = topics
processed_df = processed_df.reset_index(drop=True)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

0       [claim, dismissed, cost, high, decision, fook,...
1       [claim, decision, appeal, brought, background,...
2       [appeal, question, arise, appeal, follows, app...
3       [appeal, assistant, registrar, ground, judgmen...
4       [appeal, january, dismissed, give, reason, evi...
                              ...                        
8515    [sub, arise, consideration, whether, low, siva...
8516    [claim, conspiracy, defraud, fault, ken, sally...
8517    [relating, defence, justification, qualified, ...
8518                                                   []
8519    [background, dispute, towa, commenced, suit, a...
Name: processed_issues, Length: 8520, dtype: object
Finished preprocessing text
Performing topic modelling
Number of topics: 3, Coherence Score: 0.34343459736129206
Number of topics: 4, Coherence Score: 0.3511693705252339
Number of topics: 5, Coherence Score: 0.3545368694872686
Number of topics: 6, Coherence Score: 0.3614588493271913
Number of topics: 7, Coher

In [7]:
processed_df.drop(columns=['processed_facts'], inplace=True)
processed_df.drop(columns=['processed_issues'], inplace=True)
processed_df.to_csv("issues_facts_topics.csv")