### Tips from prof

- Narrow scope of work (e.g. court level)

- Could try both binary/multi-class model outcomes and compare the performance 

- Change user from layperson to legal professional (and mention that this project is a stepping stone towards having layperson use the model)

- Link features to predicted outcome (if time permits can try using XGBoost with LIME for model interpretability)

- Can also try to see accuracy of models with different areas of law, lowest accuracy may be hardest area of law to predict


### Data setup

In [97]:
import os
import re
import spacy 
from spacy import displacy
import json
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
import nltk
import pyLDAvis
import pyLDAvis.gensim_models

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ng_ho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ng_ho\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [98]:
# Load CSV files into DataFrames
areas_of_law_df = pd.read_csv("data/prediction_data/areas_of_law.csv")
coram_df = pd.read_csv("data/prediction_data/coram.csv")
fact_themes_df = pd.read_csv("data/prediction_data/fact_themes.csv")
sg_legal_cases_df = pd.read_csv("data/prediction_data/sg_legal_cases_dataset.csv")
target_rulings_df = pd.read_csv("data/prediction_data/target_rulings.csv")
# Load the JSON file into a dictionary
with open('data/prediction_data/issues.json') as f:
    issues_data = [json.loads(line) for line in f]
issues_df = pd.DataFrame(issues_data)

# Load the JSON file into a dictionary
with open('data/rawish_data/facts.json') as f:
    facts_data = [json.loads(line) for line in f]
raw_facts_df = pd.DataFrame(facts_data)

# Merge DataFrames
merged_df = pd.merge(areas_of_law_df, coram_df, on='casename', how='outer')
merged_df = pd.merge(merged_df, fact_themes_df, on='casename', how='outer')
merged_df = pd.merge(merged_df, sg_legal_cases_df, left_on='casename', right_on='filename', how='outer')
# merged_df = pd.merge(merged_df, issues_df, on='casename', how='outer')
merged_df = pd.merge(merged_df, target_rulings_df, on='casename', how='outer')

merged_df.drop(columns=['Unnamed: 0'], inplace=True)
merged_df.drop(columns=['filename'], inplace=True)

# Display the resulting DataFrame
print(merged_df.head())

           casename                                        area_of_law  \
0   2000_SGCA_1.pdf  {'civil procedure': ['pleadings'], 'res judica...   
1  2000_SGCA_10.pdf  {'contract': ['formation'], 'equity': ['defenc...   
2  2000_SGCA_11.pdf  {'contract': ['discharge'], 'damages': ['asses...   
3  2000_SGCA_12.pdf  {'courts and jurisdiction': ['court of appeal'...   
4  2000_SGCA_13.pdf                     {'criminal law': ['offences']}   

                                           Coram  themes court_level  \
0  ['Chao Hick Tin , L P Thean , Yong Pung How']     5.0        SGCA   
1   ['Chao Hick Tin , Tan Lee Meng , L P Thean']     5.0        SGCA   
2   ['Chao Hick Tin , Tan Lee Meng , L P Thean']     5.0        SGCA   
3   ['Chao Hick Tin , Tan Lee Meng , L P Thean']     5.0        SGCA   
4   ['Chao Hick Tin , Lai Kew Chai , L P Thean']    12.0        SGCA   

         target  
0    Favourable  
1    Favourable  
2    No outcome  
3  Unfavourable  
4  Unfavourable  


### Data Preprocessing

In [99]:
nan_counts = merged_df.isna().sum()
print(nan_counts)

#nas are probably those reassigned cases, coram has 7, i just drop them for now
na_target_rows = merged_df[merged_df['target'].isna()]
print(na_target_rows)

merged_df.dropna(axis=0, inplace=True)
print(merged_df.isna().sum())

#remove empty lists
merged_df = merged_df.query("area_of_law != '[]'")

#target is unbalanced
target_counts = merged_df['target'].value_counts()
print(target_counts)

casename        0
area_of_law     0
Coram           7
themes         47
court_level     0
target         47
dtype: int64
              casename area_of_law  \
241  2000_SGHC_257.pdf          []   
274  2000_SGHC_290.pdf          []   
412   2001_SGCA_66.pdf          []   
432  2001_SGHC_101.pdf          []   
438  2001_SGHC_108.pdf          []   
442  2001_SGHC_111.pdf          []   
448  2001_SGHC_118.pdf          []   
457  2001_SGHC_128.pdf          []   
460  2001_SGHC_130.pdf          []   
462  2001_SGHC_132.pdf          []   
475  2001_SGHC_148.pdf          []   
478  2001_SGHC_150.pdf          []   
479  2001_SGHC_151.pdf          []   
489  2001_SGHC_163.pdf          []   
498  2001_SGHC_174.pdf          []   
536  2001_SGHC_214.pdf          []   
537  2001_SGHC_215.pdf          []   
544  2001_SGHC_222.pdf          []   
546  2001_SGHC_224.pdf          []   
550  2001_SGHC_228.pdf          []   
551  2001_SGHC_229.pdf          []   
555  2001_SGHC_232.pdf          []   
564  

In [100]:
merged_df['area_of_law'] = merged_df['area_of_law'].apply(ast.literal_eval)
merged_df['Coram'] = merged_df['Coram'].apply(ast.literal_eval)
merged_df.head(3)

Unnamed: 0,casename,area_of_law,Coram,themes,court_level,target
0,2000_SGCA_1.pdf,"{'civil procedure': ['pleadings'], 'res judica...","[Chao Hick Tin , L P Thean , Yong Pung How]",5.0,SGCA,Favourable
1,2000_SGCA_10.pdf,"{'contract': ['formation'], 'equity': ['defenc...","[Chao Hick Tin , Tan Lee Meng , L P Thean]",5.0,SGCA,Favourable
2,2000_SGCA_11.pdf,"{'contract': ['discharge'], 'damages': ['asses...","[Chao Hick Tin , Tan Lee Meng , L P Thean]",5.0,SGCA,No outcome


### Flatten areas_of_law

In [101]:
all_areas = []

for index, row in merged_df.iterrows():

    areas = row['area_of_law']
    flat_areas = []
    for main_area, sub_areas in areas.items():
        flat_areas.append(main_area)
        flat_areas.extend(sub_areas)
    all_areas.append(flat_areas)

### One hot encoding

In [102]:
# one-hot encode aol
mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(all_areas)

binary_aol_df = pd.DataFrame(binary_features, columns=mlb.classes_)

processed_df = pd.concat([merged_df.drop('area_of_law', axis=1), binary_aol_df], axis=1)

print(processed_df.head(3))

           casename                                        Coram  themes  \
0   2000_SGCA_1.pdf  [Chao Hick Tin , L P Thean , Yong Pung How]     5.0   
1  2000_SGCA_10.pdf   [Chao Hick Tin , Tan Lee Meng , L P Thean]     5.0   
2  2000_SGCA_11.pdf   [Chao Hick Tin , Tan Lee Meng , L P Thean]     5.0   

  court_level      target  "a larger sum being repaid"  "abet"  \
0        SGCA  Favourable                          0.0     0.0   
1        SGCA  Favourable                          0.0     0.0   
2        SGCA  No outcome                          0.0     0.0   

   "an interest in any matter"  "any action proposed or contemplated"  \
0                          0.0                                    0.0   
1                          0.0                                    0.0   
2                          0.0                                    0.0   

   "any person"  ...  writ of  seizure and sale  writ of summons  \
0           0.0  ...                        0.0              0.0   
1

In [103]:
processed_df

Unnamed: 0,casename,Coram,themes,court_level,target,"""a larger sum being repaid""","""abet""","""an interest in any matter""","""any action proposed or contemplated""","""any person""",...,writ of seizure and sale,writ of summons,wrongful detention of property belonging to another,wrongful dismissal,young offenders],“any claim hereunder”,“any fire accidentally begin”,“any person interested in the charity”,“charity proceedings”,“rash” and “negligent”
0,2000_SGCA_1.pdf,"[Chao Hick Tin , L P Thean , Yong Pung How]",5.0,SGCA,Favourable,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2000_SGCA_10.pdf,"[Chao Hick Tin , Tan Lee Meng , L P Thean]",5.0,SGCA,Favourable,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2000_SGCA_11.pdf,"[Chao Hick Tin , Tan Lee Meng , L P Thean]",5.0,SGCA,No outcome,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2000_SGCA_12.pdf,"[Chao Hick Tin , Tan Lee Meng , L P Thean]",5.0,SGCA,Unfavourable,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2000_SGCA_13.pdf,"[Chao Hick Tin , Lai Kew Chai , L P Thean]",12.0,SGCA,Unfavourable,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6498,,,,,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6499,,,,,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6502,,,,,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6504,,,,,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [104]:
processed_df = processed_df[processed_df['Coram'].apply(lambda x: isinstance(x, list))]

In [105]:
# one-hot encode coram
mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(processed_df['Coram'])

binary_coram_df = pd.DataFrame(binary_features, columns=mlb.classes_)

processed_df = pd.concat([processed_df.drop('Coram', axis=1), binary_coram_df], axis=1)

print(processed_df.head())

           casename  themes court_level        target  \
0   2000_SGCA_1.pdf     5.0        SGCA    Favourable   
1  2000_SGCA_10.pdf     5.0        SGCA    Favourable   
2  2000_SGCA_11.pdf     5.0        SGCA    No outcome   
3  2000_SGCA_12.pdf     5.0        SGCA  Unfavourable   
4  2000_SGCA_13.pdf    12.0        SGCA  Unfavourable   

   "a larger sum being repaid"  "abet"  "an interest in any matter"  \
0                          0.0     0.0                          0.0   
1                          0.0     0.0                          0.0   
2                          0.0     0.0                          0.0   
3                          0.0     0.0                          0.0   
4                          0.0     0.0                          0.0   

   "any action proposed or contemplated"  "any person"  "appeal"  ...  \
0                                    0.0           0.0       0.0  ...   
1                                    0.0           0.0       0.0  ...   
2          

### Topic Modelling

#### Helper functions for Topic Modelling 

In [106]:
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'\W*\b(?!no)\w{1,2}\b', '', text)
    stop_words = set(stopwords.words('english'))
    legal_stopwords = ('appellant', 'respondent', 'plaintiff', 'defendant', 'mr', 'mrs', 'dr', 'mdm', 'court','version', 'hr', 'would', 'case', 'sghc', 'court', 'sgca', 'slr', 'sgdc', 'also', 'first', 'person', 'statement', 'line', 'para', 'fact', 'one', 'may', 'time', 'could', 'next')
    stop_words.update(legal_stopwords)
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return words

#### Topic Modelling For Facts

In [107]:
data = []

raw_facts_df = raw_facts_df[['casename', 'facts']]    
raw_facts_df['processed_facts'] = raw_facts_df['facts'].apply(preprocess_text)
raw_facts_df.drop(columns=['facts'], inplace=True)
print(raw_facts_df["processed_facts"])
print("Finished preprocessing text")

texts, article = [], []
print("Performing topic modelling")
for fact in raw_facts_df['processed_facts']:
    for word in fact:
        article.append(word)
        
    texts.append(article)
    article = []

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# print(lda_model.show_topics())

# best_coherence = -1
# best_lda = None
# for num_topics in range(5, 31, 5):
#     # Train LDA model
#     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                                 id2word=dictionary,
#                                                 num_topics=num_topics,
#                                                 random_state=42)
    
#     # Compute coherence score
#     coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
#     coherence_score = coherence_model_lda.get_coherence()
    
#     print(f"Number of topics: {num_topics}, Coherence Score: {coherence_score}")
#     if coherence_score > best_coherence:
#         best_coherence = coherence_score
#         best_topic = num_topics
# print(f"Best no of topic: {best_topic} and Best Coherence Score: {best_coherence}")

#use the best model (result from above codes: 10 topics)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=10,
                                                random_state=42)

#inspiration from https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
topics_matrix = lda_model[corpus]
topics = []
# Iterate over each document's topic distribution
# Get the topic with the highest probability
for doc in topics_matrix:
    topic = max(doc, key=lambda x: x[1])[0]
    topics.append(topic)

raw_facts_df['facts_topic'] = topics
processed_df = pd.merge(processed_df, issues_df, on='casename', how='outer')

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

0       [fact, widow, tan, geok, tee, deceased, sue, c...
1       [fact, surrounding, circumstance, including, a...
2       [background, appellant, french, company, secon...
3       [background, microsoft, adobe, autodesk, compa...
4       [fact, mere, assertion, suffice, exh, said, st...
                              ...                        
8515    [fact, accused, low, sze, song, low, year, old...
8516    [fact, giving, opinion, representation, amount...
8517    [fact, party, karan, bagga, litigant, proceedi...
8518                                                   []
8519    [fact, party, towa, company, incorporated, jap...
Name: processed_facts, Length: 8520, dtype: object
Finished preprocessing text
Performing topic modelling


#### Topic Modelling For Issues

In [108]:
data = []

issues_df = issues_df[['casename', 'issues']]    
issues_df['processed_issues'] = issues_df['issues'].apply(preprocess_text)
issues_df.drop(columns=['issues'], inplace=True)
print(issues_df["processed_issues"])
print("Finished preprocessing text")

texts, article = [], []
print("Performing topic modelling")
for issue in issues_df['processed_issues']:
    for word in issue:
        article.append(word)
        
    texts.append(article)
    article = []

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# print(lda_model.show_topics())

# best_coherence = -1
# best_lda = None
# for num_topics in range(5, 31, 5):
#     # Train LDA model
#     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                                 id2word=dictionary,
#                                                 num_topics=num_topics,
#                                                 random_state=42)
#     # Compute coherence score
#     coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
#     coherence_score = coherence_model_lda.get_coherence()
    
#     print(f"Number of topics: {num_topics}, Coherence Score: {coherence_score}")
#     if coherence_score > best_coherence:
#         best_coherence = coherence_score
#         best_topic = num_topics
# print(f"Best no of topic: {best_topic} and Best Coherence Score: {best_coherence}")

#use the best model (result from above codes: 25 topics)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=25,
                                                random_state=42)

#inspiration from https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
topics_matrix = lda_model[corpus]
topics = []
# Iterate over each document's topic distribution
# Get the topic with the highest probability
for doc in topics_matrix:
    topic = max(doc, key=lambda x: x[1])[0]
    topics.append(topic)
    
issues_df['issues_topic'] = topics
processed_df = pd.merge(processed_df, issues_df, on='casename', how='outer')

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

0       [claim, dismissed, cost, high, decision, fook,...
1       [claim, decision, appeal, brought, background,...
2       [appeal, question, arise, appeal, follows, app...
3       [appeal, assistant, registrar, ground, judgmen...
4       [appeal, january, dismissed, give, reason, evi...
                              ...                        
8515    [issue, sub, issue, arise, consideration, whet...
8516    [claim, conspiracy, defraud, fault, ken, sally...
8517    [issue, relating, defence, justification, qual...
8518                                                   []
8519    [background, dispute, towa, commenced, suit, a...
Name: processed_issues, Length: 8520, dtype: object
Finished preprocessing text
Performing topic modelling


In [109]:
processed_df.drop(columns=['themes'], inplace=True)
processed_df

Unnamed: 0,casename,court_level,target,"""a larger sum being repaid""","""abet""","""an interest in any matter""","""any action proposed or contemplated""","""any person""","""appeal""","""arising out of""",...,"Wong Li Kok, Alex",Woo Bih Li,Yeong Zee Kin SAR,Yong Pung How,"Yong Pung How,","Yong Pung How, Chief Justice",Zhuo Wenzhao AR,issues,processed_issues,issues_topic
0,2000_SGCA_1.pdf,SGCA,Favourable,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,The claim was dismissed with costs by the\nHig...,"[claim, dismissed, cost, high, decision, fook,...",6.0
1,2000_SGCA_10.pdf,SGCA,Favourable,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,the claim and\nagainst that decision this appe...,"[claim, decision, appeal, brought, background,...",18.0
2,2000_SGCA_11.pdf,SGCA,No outcome,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,The appeal \nThe questions which arise in this...,"[appeal, question, arise, appeal, follows, app...",6.0
3,2000_SGCA_12.pdf,SGCA,Unfavourable,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,the appeals from the assistant registrar. In h...,"[appeal, assistant, registrar, ground, judgmen...",15.0
4,2000_SGCA_13.pdf,SGCA,Unfavourable,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,the appeal on 24 January 2000 and dismissed it...,"[appeal, january, dismissed, give, reason, evi...",1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10275,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
10276,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
10277,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
10278,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


### splitting

In [110]:
print(processed_df)

               casename court_level        target  \
0       2000_SGCA_1.pdf        SGCA    Favourable   
1      2000_SGCA_10.pdf        SGCA    Favourable   
2      2000_SGCA_11.pdf        SGCA    No outcome   
3      2000_SGCA_12.pdf        SGCA  Unfavourable   
4      2000_SGCA_13.pdf        SGCA  Unfavourable   
...                 ...         ...           ...   
10275               NaN         NaN           NaN   
10276               NaN         NaN           NaN   
10277               NaN         NaN           NaN   
10278               NaN         NaN           NaN   
10279               NaN         NaN           NaN   

       "a larger sum being repaid"  "abet"  "an interest in any matter"  \
0                              0.0     0.0                          0.0   
1                              0.0     0.0                          0.0   
2                              0.0     0.0                          0.0   
3                              0.0     0.0                     

In [111]:
X = processed_df.drop(columns=['target'])
y = processed_df['target']

stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
for train_index, remaining_index in stratified_split.split(X, y):
    X_train, X_test_val = X.iloc[train_index], X.iloc[remaining_index]
    y_train, y_test_val = y.iloc[train_index], y.iloc[remaining_index]

#balanced dataset (target variable was imbalanced Favourable 5006 Unfavourable 2523 No outcome 984)
#randomly found one online, can be changed -> need to check am i doing this right 
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

#split further from X_test_val into X_val and X_test
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=42, stratify=y_test_val)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


ValueError: Input contains NaN

### Feature Engineering

In [None]:
## One hot encoding
## vector embedding
## pipeline

### Modeling

In [None]:
# Perform modelling

### Evaluation

In [None]:
## Perform Evaluation