In [28]:
import os
import re
import spacy 
from spacy import displacy
import json
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd

import pyLDAvis
import pyLDAvis.gensim_models

In [36]:
# Lazily-filled cache so the NLTK stop-word corpus and the lemmatizer are
# created once per kernel session instead of once per document.
_NLTK_DEFAULTS = {}


def preprocess_text(text, stop_words=None, lemmatizer=None):
    """Turn raw text into a list of lemmatised tokens with stop words removed.

    Steps: lower-case the text, replace every non-word character with a
    space, delete ASCII digits, split on whitespace, drop stop words and
    lemmatise what is left.

    Parameters
    ----------
    text : object
        The raw text. Non-string values are converted with ``str``; ``None``
        and float NaN are treated as "no text".
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a shared ``WordNetLemmatizer`` instance.

    Returns
    -------
    list of str
        The processed tokens, in their original order. Empty for empty or
        missing input.
    """
    # Missing values would otherwise be stringified to "none" / "nan" and
    # end up in the vocabulary as if they were real words.
    if text is None or (isinstance(text, float) and text != text):
        return []

    if stop_words is None:
        if 'stop_words' not in _NLTK_DEFAULTS:
            _NLTK_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
        stop_words = _NLTK_DEFAULTS['stop_words']
    if lemmatizer is None:
        if 'lemmatizer' not in _NLTK_DEFAULTS:
            _NLTK_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
        lemmatizer = _NLTK_DEFAULTS['lemmatizer']

    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)
    # Digits are removed in place, so "a1b" becomes "ab" and a purely numeric
    # token disappears once the text is split.
    text = re.sub(r'[0-9]', '', text)

    # split() already collapses runs of whitespace, so no separate
    # whitespace-normalisation pass is needed.
    return [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]



In [38]:
# --- Configuration -----------------------------------------------------------
# The original location stays the default; set FACTS_JSON_PATH to run the
# notebook on another machine without editing this cell.
filepath = os.environ.get(
    'FACTS_JSON_PATH',
    'C:/Users/gaoh7/Documents/GitHub/case-outcome-predictor/data/rawish_data/facts.json',
)
NUM_TOPICS = 10
RANDOM_SEED = 42  # LDA is stochastic; fixing the seed makes the topics reproducible

# --- Load and preprocess -----------------------------------------------------
print("Preprocessing text")
data = []
# The file is read as JSON Lines (one JSON object per line). The encoding is
# given explicitly because the platform default on Windows is not UTF-8.
with open(filepath, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank / trailing empty lines
        data.append(json.loads(line))

# Build the final frame in one chain instead of assigning to and dropping
# columns in place on a column slice (which can raise SettingWithCopyWarning).
df = pd.DataFrame(data)
df = (
    df[['casename', 'facts']]
    .assign(processed_facts=lambda frame: frame['facts'].apply(preprocess_text))
    .drop(columns=['facts'])
)
print(df["processed_facts"])
print("Finished preprocessing text")

# --- Topic modelling ---------------------------------------------------------
print("Performing topic modelling")
# One token list per document. Documents that end up empty are kept so that
# `texts` / `corpus` stay row-aligned with `df`.
texts = [list(tokens) for tokens in df['processed_facts']]

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda_model = LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=RANDOM_SEED,
)
print(lda_model.show_topics())
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

Preprocessing text
0       [fact, appellant, widow, one, tan, geok, tee, ...
1       [fact, surrounding, circumstance, including, a...
2       [background, first, appellant, french, company...
3       [background, microsoft, adobe, autodesk, compa...
4       [fact, mere, assertion, would, suffice, exh, p...
                              ...                        
8515    [fact, first, accused, low, sze, song, low, ye...
8516    [fact, giving, opinion, representation, would,...
8517    [fact, party, plaintiff, mr, karan, bagga, lit...
8518                                                   []
8519    [fact, party, plaintiff, towa, company, incorp...
Name: processed_facts, Length: 8520, dtype: object
Finished preprocessing text
Performing topic modelling
[(0, '0.021*"accused" + 0.011*"v" + 0.011*"p" + 0.008*"pp" + 0.008*"victim" + 0.008*"statement" + 0.007*"para" + 0.007*"version" + 0.007*"charge" + 0.007*"appellant"'), (1, '0.008*"plaintiff" + 0.007*"defendant" + 0.006*"would" + 0.006*"