### Tips from prof

- Narrow scope of work (e.g. court level)

- Could try both binary/multi-class model outcomes and compare the performance 

- Change the target user from layperson to legal professional (and mention that this project is a stepping stone towards having laypersons use the model)

- Link features to predicted outcome (if time permits can try using XGBoost with LIME for model interpretability)

- Can also compare model accuracy across different areas of law; the area with the lowest accuracy may be the hardest area of law to predict


### Data setup

In [47]:
import json

import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

In [48]:
# Read every per-case CSV table into its own DataFrame.
areas_of_law_df = pd.read_csv("data/prediction_data/areas_of_law.csv")
coram_df = pd.read_csv("data/prediction_data/coram.csv")
fact_themes_df = pd.read_csv("data/prediction_data/fact_themes.csv")
sg_legal_cases_df = pd.read_csv("data/prediction_data/sg_legal_cases_dataset.csv")
target_rulings_df = pd.read_csv("data/prediction_data/target_rulings.csv")

# issues.json is read line by line: each line is parsed as its own JSON
# document, and the parsed objects become the rows of issues_df.
with open('data/prediction_data/issues.json') as f:
    issues_data = list(map(json.loads, f))
issues_df = pd.DataFrame(issues_data)

# Outer-join all tables on the case file name so that a case missing from any
# single table is still kept (its absent fields become NaN).
# sg_legal_cases_df stores that key under `filename` instead of `casename`,
# which leaves a redundant `filename` column behind after the join.
# 'Unnamed: 0' is presumably a stray index column saved into one of the CSVs
# -- TODO confirm which file it comes from.
merged_df = (
    areas_of_law_df
    .merge(coram_df, on='casename', how='outer')
    .merge(fact_themes_df, on='casename', how='outer')
    .merge(sg_legal_cases_df, left_on='casename', right_on='filename', how='outer')
    .merge(issues_df, on='casename', how='outer')
    .merge(target_rulings_df, on='casename', how='outer')
    .drop(columns=['Unnamed: 0', 'filename'])
)

# Show the combined table
print(merged_df)

              casename                                        area_of_law  \
0      2000_SGCA_1.pdf  {'civil procedure': ['pleadings'], 'res judica...   
1     2000_SGCA_10.pdf  {'contract': ['formation'], 'equity': ['defenc...   
2     2000_SGCA_11.pdf  {'contract': ['discharge'], 'damages': ['asses...   
3     2000_SGCA_12.pdf  {'courts and jurisdiction': ['court of appeal'...   
4     2000_SGCA_13.pdf                     {'criminal law': ['offences']}   
...                ...                                                ...   
8562  2023_SGHC_95.pdf           {'criminal law': ['statutory offences']}   
8563  2023_SGHC_96.pdf      {'tort': ['conspiracy', 'misrepresentation']}   
8564  2023_SGHC_97.pdf                 {'civil procedure': ['witnesses']}   
8565  2023_SGHC_98.pdf     {'insolvency law': ['schemes of arrangement']}   
8566  2023_SGHC_99.pdf            {'intellectual property': ['remedies']}   

                                              Coram  themes court_level  \


### Data Preprocessing

In [49]:
# How many values are missing in each column?
nan_counts = merged_df.isnull().sum()
print(nan_counts)

# Look at the rows that have no target label. The missing values are probably
# the reassigned cases (unverified); Coram is missing for 7 rows as well.
na_target_rows = merged_df.loc[merged_df['target'].isna()]
print(na_target_rows)

# For now, simply discard every row that has a missing value in any column,
# then confirm that no NaNs are left.
merged_df.dropna(axis='index', how='any', inplace=True)
print(merged_df.isnull().sum())

# Class distribution of the label -- it is imbalanced.
target_counts = merged_df.loc[:, 'target'].value_counts()
print(target_counts)

casename        0
area_of_law     0
Coram           7
themes         47
court_level     0
target         47
issues         47
dtype: int64
              casename area_of_law  \
241  2000_SGHC_257.pdf          []   
274  2000_SGHC_290.pdf          []   
412   2001_SGCA_66.pdf          []   
432  2001_SGHC_101.pdf          []   
438  2001_SGHC_108.pdf          []   
442  2001_SGHC_111.pdf          []   
448  2001_SGHC_118.pdf          []   
457  2001_SGHC_128.pdf          []   
460  2001_SGHC_130.pdf          []   
462  2001_SGHC_132.pdf          []   
475  2001_SGHC_148.pdf          []   
478  2001_SGHC_150.pdf          []   
479  2001_SGHC_151.pdf          []   
489  2001_SGHC_163.pdf          []   
498  2001_SGHC_174.pdf          []   
536  2001_SGHC_214.pdf          []   
537  2001_SGHC_215.pdf          []   
544  2001_SGHC_222.pdf          []   
546  2001_SGHC_224.pdf          []   
550  2001_SGHC_228.pdf          []   
551  2001_SGHC_229.pdf          []   
555  2001_SGHC_232.pdf   

In [51]:
# TODO: no encoding has been decided yet for area_of_law and Coram, so both
# columns are removed from the table for the time being.
merged_df.drop(['area_of_law', 'Coram'], axis='columns', inplace=True)

#### splitting

In [42]:
# Separate the feature columns (X) from the label column (y).
X = merged_df.drop(columns=['target'])
y = merged_df['target']

# Stage 1: stratified 70/30 split into a training part and a held-out part.
# The splitter is configured for exactly one split, so take that single
# (train, held-out) pair of positional indices directly instead of looping.
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, remaining_index = next(stratified_split.split(X, y))
X_train, X_test_val = X.iloc[train_index], X.iloc[remaining_index]
y_train, y_test_val = y.iloc[train_index], y.iloc[remaining_index]

# Balance the classes by random oversampling (class counts noted when this was
# written: Favourable 5006, Unfavourable 2523, No outcome 984).
# The sampler was picked provisionally and can be swapped for another one.
# Only the training part is resampled: the held-out rows are never passed to
# the sampler, so validation and test keep the original class distribution
# and contain no duplicated training rows.
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Stage 2: split the held-out 30% in half, again stratified, giving a
# validation set and a test set of roughly 15% of the data each.
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val, y_test_val, test_size=0.5, random_state=42, stratify=y_test_val
)

# Sanity checks: every row lands in exactly one partition, and after
# oversampling all classes in the training labels have the same count.
assert len(X_train) + len(X_val) + len(X_test) == len(X), "split lost or duplicated rows"
assert pd.Series(y_train_resampled).value_counts().nunique() == 1, "training classes are not balanced"

### Feature Engineering

In [None]:
# TODO: one-hot encoding
# TODO: vector embedding
# TODO: combine the steps into a pipeline

### Modeling

In [None]:
# Perform modelling

### Evaluation

In [None]:
## Perform Evaluation