In [8]:
import os
import re
import spacy 
from spacy import displacy
import json
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd

from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models

In [9]:
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'\W*\b(?!no)\w{1,2}\b', '', text)
    stop_words = set(stopwords.words('english'))
    stop_words.update(('appellant', 'respondent', 'plaintiff', 'defendant', 'mr', 'dr', 'mdm', 'court','version', 'hr', 'would', 'case', 'sghc', 'court', 'sgca', 'also', 'first', 'person', 'statement', 'line', 'para', 'fact', 'one', 'may', 'time', 'could'))
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return words

In [10]:
# filepath = 'C:/Users/gaoh7/Documents/GitHub/case-outcome-predictor/data/rawish_data/facts.json'
filepath = '../data/rawish_data/facts.json'
print("Preprocessing text")
data = []
with open(filepath, 'r') as file:
    for line in file:
        data.append(json.loads(line))
df = pd.DataFrame(data)
df = df[['casename', 'facts']]    
df['processed_facts'] = df['facts'].apply(preprocess_text)
df.drop(columns=['facts'], inplace=True)
print(df["processed_facts"])
print("Finished preprocessing text")

Preprocessing text


0       [fact, widow, tan, geok, tee, deceased, sue, c...
1       [fact, surrounding, circumstance, including, a...
2       [background, appellant, french, company, secon...
3       [background, microsoft, adobe, autodesk, compa...
4       [fact, mere, assertion, suffice, exh, said, st...
                              ...                        
8515    [fact, accused, low, sze, song, low, year, old...
8516    [fact, giving, opinion, representation, amount...
8517    [fact, party, karan, bagga, litigant, proceedi...
8518                                                   []
8519    [fact, party, towa, company, incorporated, jap...
Name: processed_facts, Length: 8520, dtype: object
Finished preprocessing text


In [11]:
texts, article = [], []
print("Performing topic modelling")
for fact in df['processed_facts']:
    for word in fact:
        article.append(word)
    texts.append(article)
    article = []

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
# print(lda_model.show_topics())

best_coherence = -1
best_lda = None
for num_topics in range(5, 26, 5):
    # Train LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=42,
                                                update_every=1,
                                                chunksize=100,
                                                passes=5,
                                                alpha='auto',
                                                per_word_topics=True)
    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    print(f"Number of topics: {num_topics}, Coherence Score: {coherence_score}")
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_topic = num_topics
print(f"Best no of topic: {best_topic} and Best Coherence Score: {best_coherence}")

Performing topic modelling
Number of topics: 5, Coherence Score: 0.4455967100032388
Number of topics: 15, Coherence Score: 0.5367434354666969
Number of topics: 25, Coherence Score: 0.5280405881368955
Best Coherence Score: 0.5367434354666969
Best no of topic: 15


In [12]:

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=best_topic,
                                            random_state=42,
                                            update_every=1,
                                            chunksize=100,
                                            passes=5,
                                            alpha='auto',
                                            per_word_topics=True)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

In [13]:
# filepath = 'C:/Users/gaoh7/Documents/GitHub/case-outcome-predictor/issues.json'
filepath = '../data/prediction_data/issues.json'
print("Preprocessing text")
data = []
with open(filepath, 'r') as file:
    for line in file:
        data.append(json.loads(line))
df = pd.DataFrame(data)
df = df[['casename', 'issues']]    
df['processed_issues'] = df['issues'].apply(preprocess_text)
df.drop(columns=['issues'], inplace=True)
print(df["processed_issues"])
print("Finished preprocessing text")

Preprocessing text
0       [claim, dismissed, cost, high, decision, fook,...
1       [claim, decision, appeal, brought, background,...
2       [appeal, question, arise, appeal, follows, app...
3       [appeal, assistant, registrar, ground, judgmen...
4       [appeal, january, dismissed, give, reason, evi...
                              ...                        
8515    [issue, sub, issue, arise, consideration, whet...
8516    [claim, conspiracy, defraud, fault, ken, sally...
8517    [issue, relating, defence, justification, qual...
8518                                                   []
8519    [background, dispute, towa, commenced, suit, a...
Name: processed_issues, Length: 8520, dtype: object
Finished preprocessing text


In [14]:
texts, article = [], []
print("Performing topic modelling")
for fact in df['processed_issues']:
    for word in fact:
        article.append(word)
        
    texts.append(article)
    article = []

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
# print(lda_model.show_topics())

best_coherence = -1
best_lda = None

for num_topics in range(5, 26, 5):
    # Train LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=42,
                                                update_every=1,
                                                chunksize=100,
                                                passes=5,
                                                alpha='auto',
                                                per_word_topics=True)
    
    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    
    print(f"Number of topics: {num_topics}, Coherence Score: {coherence_score}")
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_topic = num_topics
print(f"Best no of topic: {best_topic} and Best Coherence Score: {best_coherence}")

Performing topic modelling
Number of topics: 5, Coherence Score: 0.42097859259663306
Number of topics: 15, Coherence Score: 0.4989923642950815
Number of topics: 25, Coherence Score: 0.5152077879327654
Best Coherence Score: 0.5152077879327654
Best no of topic: 25


In [15]:

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=best_topic,
                                            random_state=42,
                                            update_every=1,
                                            chunksize=100,
                                            passes=5,
                                            alpha='auto',
                                            per_word_topics=True)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

KeyboardInterrupt: 