### Tips from prof

- Narrow scope of work (e.g. court level)

- Could try both binary/multi-class model outcomes and compare the performance 

- Change user from layperson to legal professional (and mention that this project is a stepping stone towards having layperson use the model)

- Link features to predicted outcome (if time permits can try using XGBoost with LIME for model interpretability)

- Can also try to see accuracy of models with different areas of law, lowest accuracy may be hardest area of law to predict


### Data setup

In [None]:
import re
import json
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import warnings
# Ignore the DeprecationWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import ast

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\benhz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\benhz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load CSV files into DataFrames
areas_of_law_df = pd.read_csv("data/prediction_data/areas_of_law.csv")
coram_df = pd.read_csv("data/prediction_data/coram.csv")
sg_legal_cases_df = pd.read_csv("data/prediction_data/sg_legal_cases_dataset.csv")
target_rulings_df = pd.read_csv("data/prediction_data/target_rulings.csv")
issues_facts_df = pd.read_csv("data/prediction_data/issues_facts_topic.csv")
# Load the JSON file into a dictionary
with open('data/prediction_data/issues.json') as f:
    issues_data = [json.loads(line) for line in f]
issues_df = pd.DataFrame(issues_data)

# Load the JSON file into a dictionary
with open('data/prediction_data/updated_facts.json') as f:
    facts_data = [json.loads(line) for line in f]
raw_facts_df = pd.DataFrame(facts_data)
raw_facts_df["casename"] = raw_facts_df["casename"].apply(lambda case: case + ".pdf" if case[-4:] != ".pdf" else case)
raw_facts_df["facts"] = raw_facts_df["facts"].fillna("") + raw_facts_df["fact"].fillna("")
raw_facts_df = raw_facts_df.drop(columns=["fact"])

# Merge DataFrames
merged_df = pd.merge(areas_of_law_df, sg_legal_cases_df, on='casename', how='inner')
merged_df = pd.merge(merged_df, issues_df, on='casename', how='inner')
merged_df = pd.merge(merged_df, raw_facts_df, on='casename', how='inner')
merged_df = pd.merge(merged_df, issues_facts_df, on='casename', how='inner')
merged_df = pd.merge(merged_df, target_rulings_df, on='casename', how='inner')

try:
    merged_df.drop(columns=['Unnamed: 0'], inplace=True)
except:
    pass
# Display the resulting DataFrame
print(merged_df.head())

           casename                                        area_of_law  \
0   2000_SGCA_1.pdf  {'civil procedure': ['pleadings'], 'res judica...   
1  2000_SGCA_10.pdf  {'contract': ['formation'], 'equity': ['defenc...   
2  2000_SGCA_11.pdf  {'contract': ['discharge'], 'damages': ['asses...   
3  2000_SGCA_12.pdf  {'courts and jurisdiction': ['court of appeal'...   
4  2000_SGCA_13.pdf                     {'criminal law': ['offences']}   

  court_level                                             issues  \
0        SGCA  The claim was dismissed with costs by the\nHig...   
1        SGCA  the claim and\nagainst that decision this appe...   
2        SGCA  The appeal \nThe questions which arise in this...   
3        SGCA  the appeals from the assistant registrar. In h...   
4        SGCA  the appeal on 24 January 2000 and dismissed it...   

                                               facts  issues_topic  \
0  The facts\nThe appellant is the widow of one T...            12   
1  fac

In [None]:
merged_df = merged_df.dropna()
merged_df.isna().sum()

casename        0
area_of_law     0
court_level     0
issues          0
facts           0
issues_topic    0
facts_topic     0
target          0
dtype: int64

### Data Preprocessing and Feature Engineering 

Remove duplicate coram names and roles

In [None]:
def clean_coram_names(coram_list):
    all_names = set()
    for item in coram_list:
        split_names = re.split(r';\s(?![a-zA-Z]+\s)', item)
        for name in split_names:
            if ';' in name and not re.search(r';\s[a-zA-Z]+$', name):
                sub_names = name.split(';')
                all_names.update([n.strip() for n in sub_names if n.strip()])
            else:
                all_names.add(name.strip())
    return list(all_names)

def remove_coram_roles(coram_list):
    roles = [' CJ', ' AG', ' J', ' DCJ', ' JA', ' AR', ' JC', 'SAR']
    for role in roles:
        coram_list = [re.sub(rf'{role}$', '', name) for name in coram_list]
    return coram_list

In [None]:
coram_df = coram_df.dropna()
for i, coram_str in enumerate(coram_df['Coram']):
    coram = ast.literal_eval(coram_str)
    
    coram_modified = clean_coram_names(coram)
    coram_modified = remove_coram_roles(coram_modified)
    coram_df.at[i, 'Coram'] = str(coram_modified)
merged_df = pd.merge(merged_df, coram_df, on='casename', how='outer')

try:
    merged_df.drop(columns=['Unnamed: 0'], inplace=True)
except:
    pass

In [None]:
nan_counts = merged_df.isna().sum()
print(nan_counts)

#nas are probably those reassigned cases, coram has 7, i just drop them for now
na_target_rows = merged_df[merged_df['target'].isna()]
print(na_target_rows)

merged_df.dropna(axis=0, inplace=True)
print(merged_df.isna().sum())

#remove empty lists
merged_df = merged_df.query("area_of_law != '[]'")

#target is unbalanced
target_counts = merged_df['target'].value_counts()
print(target_counts)

merged_df = merged_df.reset_index(drop=True) # prevent nan values from appearing after one-hot

casename         7
area_of_law     54
court_level     54
issues          54
facts           54
issues_topic    54
facts_topic     54
target          54
Coram            7
dtype: int64


               casename area_of_law court_level issues facts  issues_topic  \
241   2000_SGHC_257.pdf         NaN         NaN    NaN   NaN           NaN   
274   2000_SGHC_290.pdf         NaN         NaN    NaN   NaN           NaN   
412    2001_SGCA_66.pdf         NaN         NaN    NaN   NaN           NaN   
432   2001_SGHC_101.pdf         NaN         NaN    NaN   NaN           NaN   
438   2001_SGHC_108.pdf         NaN         NaN    NaN   NaN           NaN   
442   2001_SGHC_111.pdf         NaN         NaN    NaN   NaN           NaN   
448   2001_SGHC_118.pdf         NaN         NaN    NaN   NaN           NaN   
457   2001_SGHC_128.pdf         NaN         NaN    NaN   NaN           NaN   
460   2001_SGHC_130.pdf         NaN         NaN    NaN   NaN           NaN   
462   2001_SGHC_132.pdf         NaN         NaN    NaN   NaN           NaN   
475   2001_SGHC_148.pdf         NaN         NaN    NaN   NaN           NaN   
478   2001_SGHC_150.pdf         NaN         NaN    NaN   NaN   

In [None]:
merged_df['area_of_law'] = merged_df['area_of_law'].apply(ast.literal_eval)
merged_df['Coram'] = merged_df['Coram'].apply(ast.literal_eval)
merged_df.head(3)

Unnamed: 0,casename,area_of_law,court_level,issues,facts,issues_topic,facts_topic,target,Coram
0,2000_SGCA_1.pdf,"{'civil procedure': ['pleadings'], 'res judica...",SGCA,The claim was dismissed with costs by the\nHig...,The facts\nThe appellant is the widow of one T...,12.0,7.0,Favourable,"[Chan Sek Keong, Andrew Phang Boon Leong, V K ..."
1,2000_SGCA_10.pdf,"{'contract': ['formation'], 'equity': ['defenc...",SGCA,the claim and\nagainst that decision this appe...,facts and surrounding circumstances including ...,8.0,3.0,Favourable,"[Chao Hick Tin, Andrew Phang Boon Leong, V K R..."
2,2000_SGCA_11.pdf,"{'contract': ['discharge'], 'damages': ['asses...",SGCA,The appeal \nThe questions which arise in this...,"Background \nThe first appellants, a French co...",0.0,12.0,No outcome,"[Chan Sek Keong, Andrew Phang Boon Leong, Tan ..."


Flatten areas_of_law

In [None]:
all_areas = []

for index, row in merged_df.iterrows():

    areas = row['area_of_law']
    flat_areas = []
    for main_area, sub_areas in areas.items():
        flat_areas.append(main_area)
        for sarea in sub_areas.copy():
            if len(sarea) > 33:
                sub_areas.remove(sarea)
        flat_areas.extend(sub_areas)
    all_areas.append(flat_areas)

In [None]:
for area in all_areas:
    if area == []:
        print(area)

One-hot Encoding

In [None]:
# one-hot encode aol
mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(all_areas)

binary_aol_df = pd.DataFrame(binary_features, columns=mlb.classes_)
binary_aol_df = binary_aol_df.reset_index(drop=True)
processed_df = pd.concat([merged_df.drop('area_of_law', axis=1), binary_aol_df], axis=1)
processed_df = processed_df[processed_df['Coram'].apply(lambda x: isinstance(x, list))]
print(processed_df.head(3))

           casename court_level  \
0   2000_SGCA_1.pdf        SGCA   
1  2000_SGCA_10.pdf        SGCA   
2  2000_SGCA_11.pdf        SGCA   

                                              issues  \
0  The claim was dismissed with costs by the\nHig...   
1  the claim and\nagainst that decision this appe...   
2  The appeal \nThe questions which arise in this...   

                                               facts  issues_topic  \
0  The facts\nThe appellant is the widow of one T...          12.0   
1  facts and surrounding circumstances including ...           8.0   
2  Background \nThe first appellants, a French co...           0.0   

   facts_topic      target                                              Coram  \
0          7.0  Favourable  [Chan Sek Keong, Andrew Phang Boon Leong, V K ...   
1          3.0  Favourable  [Chao Hick Tin, Andrew Phang Boon Leong, V K R...   
2         12.0  No outcome  [Chan Sek Keong, Andrew Phang Boon Leong, Tan ...   

   "a larger sum being repai

In [None]:
# one-hot encode coram
mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(processed_df['Coram'])

binary_coram_df = pd.DataFrame(binary_features, columns=mlb.classes_)
binary_coram_df = binary_coram_df.reset_index(drop=True)
processed_df = pd.concat([processed_df.drop('Coram', axis=1), binary_coram_df], axis=1)

print(processed_df.head())

           casename court_level  \
0   2000_SGCA_1.pdf        SGCA   
1  2000_SGCA_10.pdf        SGCA   
2  2000_SGCA_11.pdf        SGCA   
3  2000_SGCA_12.pdf        SGCA   
4  2000_SGCA_13.pdf        SGCA   

                                              issues  \
0  The claim was dismissed with costs by the\nHig...   
1  the claim and\nagainst that decision this appe...   
2  The appeal \nThe questions which arise in this...   
3  the appeals from the assistant registrar. In h...   
4  the appeal on 24 January 2000 and dismissed it...   

                                               facts  issues_topic  \
0  The facts\nThe appellant is the widow of one T...          12.0   
1  facts and surrounding circumstances including ...           8.0   
2  Background \nThe first appellants, a French co...           0.0   
3  Background\nMicrosoft, Adobe and Autodesk are ...          27.0   
4  facts. Mere assertion would not suffice. In ex...          28.0   

   facts_topic        target  "

In [None]:
processed_df['SGCA'] = processed_df['court_level'].apply(lambda x: 1 if x == 'SGCA' else 0)
processed_df['SGHC'] = processed_df['court_level'].apply(lambda x: 1 if x == 'SGHC' else 0)
processed_df = processed_df.drop('court_level', axis=1)

In [None]:
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'\W*\b(?!no)\w{1,2}\b', '', text)
    stop_words = set(stopwords.words('english'))
    legal_stopwords = ('appellant', 'respondent', 'plaintiff', 'defendant', 'mr', 'mrs', 'dr', 'mdm', 'court','version', 'hr', 'would', 'case', 'sghc', 'court', 'sgca', 'slr', 'sgdc', 'also', 'first', 'person', 'statement', 'line', 'para', 'fact', 'one', 'may', 'time', 'could', 'next', 'legal', 'issues', 'issue')
    stop_words.update(legal_stopwords)
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return words

processed_df['processed_facts'] = processed_df['facts'].apply(preprocess_text)
processed_df.drop(columns=['facts'], inplace=True)

processed_df['processed_issues'] = processed_df['issues'].apply(preprocess_text)
processed_df.drop(columns=['issues'], inplace=True)

### Modelling

#### Modeling (CNN)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

Train & Test set for CNN

In [None]:
X = processed_df.drop(columns=['target','casename'])
y = processed_df['target']

stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
for train_index, remaining_index in stratified_split.split(X, y):
    X_train, X_test_val = X.iloc[train_index], X.iloc[remaining_index]
    y_train, y_test_val = y.iloc[train_index], y.iloc[remaining_index]

#balanced dataset (target variable was imbalanced Favourable 5006 Unfavourable 2523 No outcome 984)
#randomly found one online, can be changed -> need to check am i doing this right 
from imblearn.over_sampling import RandomOverSampler
# from imblearn.over_sampling import SMOTE
# smt = SMOTE(random_state=42)
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
# X_train_resampled, y_train_resampled = smt.fit_resample(X_train, y_train)

#split further from X_test_val into X_val and X_test
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=42, stratify=y_test_val)

In [None]:
# Vectorizing textual features using TF-IDF for X_train_resampled
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_text = tfidf_vectorizer.fit_transform(X_train_resampled['processed_facts'].astype('U') + ' ' + X_train_resampled['processed_issues'].astype('U'))
X_train_text = pd.DataFrame(X_train_text.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Drop original text columns and concatenate TF-IDF features
X_train_resampled = X_train_resampled.drop(['processed_facts', 'processed_issues'], axis=1)
X_train_resampled = pd.concat([X_train_resampled.reset_index(drop=True), X_train_text], axis=1)

# Vectorizing textual features using TF-IDF for X_val
X_val_text = tfidf_vectorizer.transform(X_val['processed_facts'].astype('U') + ' ' + X_val['processed_issues'].astype('U'))
X_val_text = pd.DataFrame(X_val_text.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Drop original text columns and concatenate TF-IDF features
X_val = X_val.drop(['processed_facts', 'processed_issues'], axis=1)
X_val = pd.concat([X_val.reset_index(drop=True), X_val_text], axis=1)

# Vectorizing textual features using TF-IDF for X_test
X_test_text = tfidf_vectorizer.transform(X_test['processed_facts'].astype('U') + ' ' + X_test['processed_issues'].astype('U'))
X_test_text = pd.DataFrame(X_test_text.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Drop original text columns and concatenate TF-IDF features
X_test = X_test.drop(['processed_facts', 'processed_issues'], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), X_test_text], axis=1)

Convert target variable to continuous

In [None]:
# X_train_resampled = X_train_resampled.drop(columns=['processed_facts', 'processed_issues'])
# X_test = X_test.drop(columns=['processed_facts', 'processed_issues'])
# X_val = X_val.drop(columns=['processed_facts', 'processed_issues'])

mapping = {'Favourable': 1, 'Unfavourable': 0, 'No outcome':0.5}

y_train_resampled, y_test, y_val = y_train_resampled.copy().map(mapping), y_test.copy().map(mapping), y_val.copy().map(mapping)

Hyper params

In [None]:
class Args:
  epochs = 20
  lr = 0.001
  use_cuda=False
  gamma = 0.7
  log_interval = 10
  seed = 1

args = Args()

device = torch.device("cuda" if args.use_cuda else "cpu")

In [None]:
X_train_resampled = X_train_resampled.iloc[:, :].copy()
X_train_resampled = torch.tensor(X_train_resampled.values, dtype=torch.float32).to(device)
print(f'Shape of X_train_resampled: {X_train_resampled.shape}')

X_test = X_test.iloc[:, :].copy()
X_test = torch.tensor(X_test.values, dtype=torch.float32).to(device)
print(f'Shape of X_test: {X_test.shape}')

X_val = X_val.iloc[:,:].copy()
X_val = torch.tensor(X_val.values, dtype=torch.float32).to(device)
print(f'Shape of X_val: {X_val.shape}')

Shape of X_train_resampled: torch.Size([8274, 2484])
Shape of X_test: torch.Size([1019, 2484])
Shape of X_val: torch.Size([1019, 2484])


In [None]:
y_train_resampled, y_test, y_val = torch.tensor(y_train_resampled.values).to(device), torch.tensor(y_test.values).to(device), torch.tensor(y_val.values).to(device)

X_train_resampled = X_train_resampled.reshape(X_train_resampled.shape[0],1,X_train_resampled.shape[1])
X_test = X_test.reshape(X_test.shape[0],1,X_test.shape[1])
X_val = X_val.reshape(X_val.shape[0],1,X_val.shape[1])
print(X_train_resampled.shape)

torch.Size([8274, 1, 2484])


In [None]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv1d(1, 64, 3, 1,1, bias=True)
        # Define the first 1D convolution layer. Takes 1 input channel, outputs 32 channels, kernel size is 3, stride is 1, padding is 1.
        self.Bn1 = nn.BatchNorm1d(64)
        # Apply Batch Normalization to the output of the first convolutional layer.
        self.dropout = nn.Dropout(0.3)
        self.pool1 = nn.AvgPool1d(kernel_size=2, stride=2)
        # Apply 1D Average Pooling after the first Batch Normalization. The kernel size and stride are 2.

        self.conv2 = nn.Conv1d(64, 64, 3, 1,1, bias=True)
        self.Bn2 = nn.BatchNorm1d(64)
        self.pool2 = nn.AvgPool1d(kernel_size=2, stride=2)

        self.fc1 = nn.Linear(39744, 100, bias=True)
        # Define the first fully connected layer. It takes 25472 inputs and outputs 100 nodes.

        self.fc2 = nn.Linear(100, 30, bias=True)
        # Define the second fully connected layer. It takes 100 inputs and outputs 50 nodes.

        self.fc3 = nn.Linear(30, 3, bias=True)
        # Define the third fully connected layer (output layer). It takes 50 inputs and outputs 3 nodes.


    def forward(self, x):
        x = F.tanh(self.Bn1(self.conv1(x)))
        # Pass the input through the first convolutional layer, then Batch Normalization, and then apply ReLU activation.
        x = self.dropout(x)
        x = self.pool1(x)
        # Apply Average Pooling to the output of the previous step.
        x = F.tanh(self.Bn2(self.conv2(x)))
        x = self.dropout(x)
        x = self.pool2(x)
        x = torch.flatten(x, 1)
        # Flatten the output from the previous step. This is necessary because fully connected layers expect a 1D input.
        x = self.fc1(x)
        # Pass the output through the first fully connected layer.
        x = F.tanh(self.fc2(x))
        # Pass the output through the second fully connected layer with tanh activation.
        x = self.fc3(x)
        # Pass the output through the third fully connected layer. This is the output of the network.
        return x

def train(args, model, device, train_loader, optimizer, epoch):
    model.train()  # Set the model to training mode

    for batch_idx, (data, target) in enumerate(train_loader):  # Loop over each batch from the training set
        data, target = data.to(device), target.to(device)  # Move the data to the device that is used

        target = target.long()  # Make sure that target data is long type (necessary for loss function)

        optimizer.zero_grad()  # Clear gradients from the previous training step
        output = model(data)  # Run forward pass (model predictions)
        #print(output.shape)
        loss = F.cross_entropy(output, target)  # Calculate the loss between the output and target
        loss.backward()  # Perform backpropagation (calculate gradients of loss w.r.t. parameters)
        optimizer.step()  # Update the model parameters

        if batch_idx % args.log_interval == 0:  # Print log info for specified interval
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_idx * len(data), len(train_loader.dataset),100. * batch_idx / len(train_loader), loss.item()))



def test(model, device, test_loader):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0
    correct = 0

    with torch.no_grad():  # Deactivates autograd, reduces memory usage and speeds up computations
        for data, target in test_loader:  # Loop over each batch from the testing set
            
            data, target = data.to(device), target.to(device)  # Move the data to the device that is used

            target = target.long()  # Convert target to long after adjusting value
            output = model(data)  # Run forward pass (model predictions)
            test_loss += F.cross_entropy(output, target, reduction='sum').item()  # Sum up the batch loss
            pred = output.argmax(dim=1, keepdim=True)  # Get the index of the max log-probability as the predicted output
            correct += pred.eq(target.view_as(pred)).sum().item()  # Count correct predictions

    test_loss /= len(test_loader.dataset)  # Calculate the average loss

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(test_loss, correct, len(test_loader.dataset),100. * correct / len(test_loader.dataset)))
    return correct  # Return the number of correctly classified samples


In [None]:
torch.manual_seed(args.seed)

model = Net().to(device)

for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor].size())

#Form training and testing dataset
optimizer = optim.Adam(model.parameters(), lr=args.lr)

train_dataset = torch.utils.data.TensorDataset(X_train_resampled, y_train_resampled)
test_dataset = torch.utils.data.TensorDataset(X_val, y_val)
val_dataset = torch.utils.data.TensorDataset(X_test, y_test)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)
scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

conv1.weight 	 torch.Size([64, 1, 3])
conv1.bias 	 torch.Size([64])
Bn1.weight 	 torch.Size([64])
Bn1.bias 	 torch.Size([64])
Bn1.running_mean 	 torch.Size([64])
Bn1.running_var 	 torch.Size([64])
Bn1.num_batches_tracked 	 torch.Size([])
conv2.weight 	 torch.Size([64, 64, 3])
conv2.bias 	 torch.Size([64])
Bn2.weight 	 torch.Size([64])
Bn2.bias 	 torch.Size([64])
Bn2.running_mean 	 torch.Size([64])
Bn2.running_var 	 torch.Size([64])
Bn2.num_batches_tracked 	 torch.Size([])
fc1.weight 	 torch.Size([100, 39744])
fc1.bias 	 torch.Size([100])
fc2.weight 	 torch.Size([30, 100])
fc2.bias 	 torch.Size([30])
fc3.weight 	 torch.Size([3, 30])
fc3.bias 	 torch.Size([3])


In [None]:
#Model training
CNN_acc = 0
for epoch in range(1, args.epochs + 1):
    train(args, model, device, train_loader, optimizer, epoch)
    ACC_ = test(model, device, test_loader)
    if ACC_>CNN_acc or ACC_ == CNN_acc:
        CNN_acc = ACC_
        torch.save(model.state_dict(), "Baseline_CNN.pt")

    scheduler.step()

print(CNN_acc)




Test set: Average loss: 0.9051, Accuracy: 428/1019 (42%)


Test set: Average loss: 0.7460, Accuracy: 544/1019 (53%)


Test set: Average loss: 1.0278, Accuracy: 477/1019 (47%)


Test set: Average loss: 0.8147, Accuracy: 520/1019 (51%)


Test set: Average loss: 0.7640, Accuracy: 535/1019 (53%)


Test set: Average loss: 0.7964, Accuracy: 517/1019 (51%)


Test set: Average loss: 0.7679, Accuracy: 537/1019 (53%)


Test set: Average loss: 0.7852, Accuracy: 538/1019 (53%)


Test set: Average loss: 0.7739, Accuracy: 543/1019 (53%)


Test set: Average loss: 0.7716, Accuracy: 543/1019 (53%)


Test set: Average loss: 0.7666, Accuracy: 549/1019 (54%)


Test set: Average loss: 0.7798, Accuracy: 539/1019 (53%)


Test set: Average loss: 0.7726, Accuracy: 543/1019 (53%)


Test set: Average loss: 0.7790, Accuracy: 540/1019 (53%)


Test set: Average loss: 0.7760, Accuracy: 543/1019 (53%)


Test set: Average loss: 0.7776, Accuracy: 540/1019 (53%)


Test set: Average loss: 0.7768, Accuracy: 542/1019 (53%

In [None]:
# Testing
model.eval()
correct_val = 0
total_val = 0
val_loss = 0
CNN_test_accuracy = 0
with torch.no_grad():
    for data, target in val_loader:
        data, target = data.to(device), target.to(device).long()
        
        output_test = model(data)
        #pred = torch.argmax(output_test, 1)
        pred = output_test.argmax(dim=1, keepdim=True)  # Get the index of the max log-probability as the predicted output

        
        val_loss += F.cross_entropy(output_test, target) 
            
        #correct_val += (pred == target).sum().item()
        correct_val += pred.eq(target.view_as(pred)).sum().item()  # Count correct predictions

        
        total_val += target.size(0)
    
    CNN_test_accuracy = (correct_val / total_val) * 100
            
    val_loss /= len(val_loader.dataset)

    print(f"Testing Accuracy = {CNN_test_accuracy}")

Testing Accuracy = 54.85770363101079


#### Modelling (DNN)

In [None]:
class DNN(nn.Module):
    def __init__(self):
        super(DNN, self).__init__()
        # self.fc1 = nn.Linear(2484, 128, bias=True)
        # self.Bn1 = nn.BatchNorm1d(128)
        # self.fc2 = nn.Linear(128, 128, bias=True)
        # self.Bn2 = nn.BatchNorm1d(128)
        # self.fc3 = nn.Linear(128, 5, bias=True)

        # self.fc1 = nn.Linear(2484, 1024, bias=True)
        # self.Bn1 = nn.BatchNorm1d(1024)
        # self.fc2 = nn.Linear(1024, 512, bias=True)
        # self.Bn2 = nn.BatchNorm1d(512)
        # self.fc3 = nn.Linear(512, 256, bias=True)
        # self.Bn3 = nn.BatchNorm1d(256)
        # self.fc4 = nn.Linear(256, 128, bias=True)
        # self.Bn4 = nn.BatchNorm1d(128)
        # self.fc5 = nn.Linear(128, 3, bias=True)

        self.fc1 = nn.Linear(2484, 512, bias=True)
        self.Bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(512, 128, bias=True)
        self.fc3 = nn.Linear(128, 3, bias=True)

        self.dropout = nn.Dropout2d(0.3)


    def forward(self, x):
        # x = torch.flatten(x, 1)
        # x = F.leaky_relu(self.Bn1(self.fc1(x)))
        # x = F.tanh(self.Bn2(self.fc2(x)))
        # x = self.fc3(x)

        # x = torch.flatten(x, 1)
        # x = F.leaky_relu(self.fc1(x))
        # x = F.leaky_relu(self.fc2(x))
        # x = F.leaky_relu(self.fc3(x))
        # x = torch.tanh(self.fc4(x))
        # x = self.fc5(x)

        x = torch.flatten(x, 1)
        x = F.tanh(self.fc1(x)) # [leaky_relu, tanh, relu,]
        x = F.tanh(self.fc2(x))
        x = self.fc3(x)
        return x

In [None]:
class DNNArgs:
  epochs = 33 # Tuning the epoch to 33 using validation set 
  lr = 0.001
  use_cuda=False
  gamma = 0.7
  log_interval = 10
  seed = 1

DNNargs = DNNArgs()

torch.manual_seed(DNNargs.seed)

dnn_model = DNN().to(device)

for param_tensor in dnn_model.state_dict():
        print(param_tensor, "\t", dnn_model.state_dict()[param_tensor].size())

#Form training and testing dataset
dnn_optimizer = optim.Adam(dnn_model.parameters(), lr=DNNargs.lr)

#Model training
ACC = 0
for epoch in range(1, DNNargs.epochs + 1):
    train(DNNargs, dnn_model, device, train_loader, dnn_optimizer, epoch)
    ACC_ = test(dnn_model, device, test_loader)
    if ACC_>ACC or ACC_ == ACC:
        ACC = ACC_
        torch.save(dnn_model.state_dict(), "Baseline_DNN.pt")

    scheduler.step()

print(ACC)


fc1.weight 	 torch.Size([512, 2484])
fc1.bias 	 torch.Size([512])
Bn1.weight 	 torch.Size([256])
Bn1.bias 	 torch.Size([256])
Bn1.running_mean 	 torch.Size([256])
Bn1.running_var 	 torch.Size([256])
Bn1.num_batches_tracked 	 torch.Size([])
fc2.weight 	 torch.Size([128, 512])
fc2.bias 	 torch.Size([128])
fc3.weight 	 torch.Size([3, 128])
fc3.bias 	 torch.Size([3])

Test set: Average loss: 0.9043, Accuracy: 466/1019 (46%)


Test set: Average loss: 0.7409, Accuracy: 537/1019 (53%)


Test set: Average loss: 0.8130, Accuracy: 525/1019 (52%)


Test set: Average loss: 0.9427, Accuracy: 536/1019 (53%)


Test set: Average loss: 0.9253, Accuracy: 545/1019 (53%)


Test set: Average loss: 1.0666, Accuracy: 554/1019 (54%)


Test set: Average loss: 1.1016, Accuracy: 550/1019 (54%)


Test set: Average loss: 1.2469, Accuracy: 573/1019 (56%)


Test set: Average loss: 1.4497, Accuracy: 555/1019 (54%)


Test set: Average loss: 1.3761, Accuracy: 563/1019 (55%)


Test set: Average loss: 1.6501, Accuracy: 5

In [None]:
# Testing
dnn_model.eval()
dnn_correct_val = 0
dnn_total_val = 0
dnn_val_loss = 0
DNN_test_accuracy = 0
with torch.no_grad():
    for data, target in val_loader:
        data, target = data.to(device), target.to(device).long()
        
        output_test = dnn_model(data)
        pred = torch.argmax(output_test, 1)
        
        dnn_val_loss += F.cross_entropy(output_test, target) 
            
        dnn_correct_val += (pred == target).sum().item()
        
        dnn_total_val += target.size(0)
    
    DNN_test_accuracy = (dnn_correct_val / dnn_total_val) * 100
            
    dnn_val_loss /= len(val_loader.dataset)

    print(f"Testing Accuracy = {DNN_test_accuracy}")

Testing Accuracy = 59.37193326790972


#### Modelling (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.feature_selection import RFE
from joblib import dump, load

Train & Test set for RF

In [None]:
X = processed_df.drop(['target', 'casename'], axis=1)
y = processed_df['target'] 

# Splitting the dataset into training and test sets
for train_index, remaining_index in stratified_split.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[remaining_index]
    y_train, y_test = y.iloc[train_index], y.iloc[remaining_index]

# Handle imbalanced classes
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)


In [None]:
# Vectorizing textual features using TF-IDF for X_train
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_text = tfidf_vectorizer.fit_transform(X_train_resampled['processed_facts'].astype('U') + ' ' + X_train_resampled['processed_issues'].astype('U'))
X_train_text = pd.DataFrame(X_train_text.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Drop original text columns and concatenate TF-IDF features
X_train_resampled = X_train_resampled.drop(['processed_facts', 'processed_issues'], axis=1)
X_train_resampled = pd.concat([X_train_resampled.reset_index(drop=True), X_train_text], axis=1)

# Vectorizing textual features using TF-IDF for X_test
X_test_text = tfidf_vectorizer.transform(X_test['processed_facts'].astype('U') + ' ' + X_test['processed_issues'].astype('U'))
X_test_text = pd.DataFrame(X_test_text.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Drop original text columns and concatenate TF-IDF features
X_test = X_test.drop(['processed_facts', 'processed_issues'], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), X_test_text], axis=1)


Hyper Param Tuning (Grid search)

In [None]:
# rf_classifier = RandomForestClassifier(random_state=42)
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Setup the grid search
# grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# # Fit grid search
# grid_search.fit(X_train, y_train)

# # Best parameters and best score
# print("Best parameters:", grid_search.best_params_)
# print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# best_rf = grid_search.best_estimator_

Load Model

In [None]:
# dump(best_rf, 'model/rf_model.joblib')
best_rf = load('model/rf_model.joblib')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Feature Importance

In [None]:
feature_names = X_train_resampled.columns
importances = best_rf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df = feature_importance_df[(feature_importance_df['Importance']) > 0.00005]

In [None]:
important_features = feature_importance_df['Feature'].tolist()

X_train_filtered = X_train_resampled[important_features]
X_test_filtered = X_test[important_features]
X_train_filtered

Unnamed: 0,division,anything,july,criminal law,argued,majority,allegedly,failed,ordinary,iii,...,res judicata,family violence,admiralty and shipping,advice,advice.1,adverse possession,international taxation,hdb flat,courts and jurisdiction,misrepresentation act
0,0.009229,0.000000,0.000000,0,0.005410,0.000000,0.000000,0.005148,0.000000,0.029518,...,0,0,0,0,0.0,0,0,0,0,0
1,0.001328,0.001974,0.000802,0,0.003113,0.002480,0.000000,0.002962,0.012157,0.005308,...,0,0,0,0,0.0,0,0,0,0,0
2,0.000000,0.017659,0.038272,0,0.000000,0.000000,0.006375,0.022086,0.006592,0.000000,...,0,0,0,0,0.0,0,0,0,0,0
3,0.000000,0.001575,0.174046,0,0.000000,0.000000,0.017053,0.009453,0.001763,0.000000,...,0,0,0,0,0.0,0,0,0,0,0
4,0.000000,0.000000,0.000000,0,0.008033,0.000000,0.000000,0.007644,0.000000,0.010957,...,0,0,0,0,0.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8269,0.000000,0.001823,0.017783,0,0.030196,0.002291,0.001975,0.019155,0.004084,0.005884,...,0,0,0,0,0.0,0,0,0,0,0
8270,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.0,0,0,0,0,0
8271,0.000000,0.000000,0.005170,0,0.020067,0.000000,0.006889,0.038189,0.007124,0.000000,...,0,0,0,0,0.0,0,0,0,0,0
8272,0.000000,0.000000,0.005254,0,0.010197,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0.0,0,0,0,0,0


In [None]:
# selector = RFE(best_rf, n_features_to_select=1000, step=1)
# selector = selector.fit(X_train_filtered, y_train_resampled)
# dump(selector, 'model/rfe_selector.joblib')
selector = load('model/rfe_selector.joblib')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
X_train_reduced = selector.transform(X_train_filtered)
X_test_reduced = selector.transform(X_test_filtered)

Model Training

In [None]:
best_rf.fit(X_train_reduced, y_train_resampled)

In [None]:
y_pred = best_rf.predict(X_test_reduced)
RF_accuracy = accuracy_score(y_test, y_pred)
# Evaluating the Model
print("Accuracy:", RF_accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.5912659470068695
Classification Report:
              precision    recall  f1-score   support

  Favourable       0.60      0.95      0.73      1183
  No outcome       0.40      0.03      0.06       238
Unfavourable       0.54      0.11      0.19       617

    accuracy                           0.59      2038
   macro avg       0.51      0.37      0.33      2038
weighted avg       0.56      0.59      0.49      2038



#### Modelling (Multiclass Logistic Regression) in process

In [47]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from statistics import mean

In [48]:
X = processed_df.drop(columns=['target','casename','processed_facts', 'processed_issues'])
y = processed_df['target']

# Handle imbalanced classes
smt = SMOTE(random_state=42)

results = []
for p in [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0]:
		print(f'Processing for {p}')
		# create name for model
		key = '%.4f' % p
		if p == 0.0:
			lm = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty=None, max_iter=1000)
		else:
			lm = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2', C=p, max_iter=1000)

		steps = [('over', smt), ('model', lm)]	
		pipeline = Pipeline(steps=steps)
		# evaluate pipeline
  
		#StratifiedKFold is the improved version of KFold
		#KFold is a cross-validator that divides the dataset into k folds. 
  		#Stratified is to ensure that each fold of dataset has the same proportion of observations with a given label.
		cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
		scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
		if p == 0.0:
			results.append(['multi lbfgs','0',key,mean(scores)])
		else:
			results.append(['multi lbfgs','l2',key,mean(scores)])


print("\nHere are the results")
for result in results:
	print('%s %s %s %.3f' % (result[0], result[1], result[2], result[3]))

Processing for 0.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Processing for 0.0001
Processing for 0.001
Processing for 0.01
Processing for 0.1
Processing for 1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Here are the results
multi lbfgs 0 0.0000 0.485
multi lbfgs l2 0.0001 0.275
multi lbfgs l2 0.0010 0.396
multi lbfgs l2 0.0100 0.462
multi lbfgs l2 0.1000 0.493
multi lbfgs l2 1.0000 0.495


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
for p in [0.0001, 0.001, 0.01, 0.1, 1.0]:
		print(f'Processing for {p}')
		# create name for model
		key = '%.4f' % p  
		if p == 0.0:
			lm = LogisticRegression(multi_class='ovr', solver='lbfgs', penalty=None, max_iter=1000)
		else:
			lm = LogisticRegression(multi_class='ovr', solver='lbfgs', penalty='l2', C=p, max_iter=1000)

		steps = [('over', SMOTE()), ('model', lm)]	
		pipeline = Pipeline(steps=steps)
		# evaluate pipeline
		cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
		scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
		if p == 0.0:
			results.append(['ovr lbfgs','0',key,mean(scores)])
		else:
			results.append(['ovr lbfgs','l2',key,mean(scores)])

print("\nHere are the results")
for result in results:
	print('%s %s %s %.3f' % (result[0], result[1], result[2], result[3]))

Processing for 0.0001
Processing for 0.001
Processing for 0.01
Processing for 0.1
Processing for 1.0

Here are the results
multi lbfgs 0 0.0000 0.485
multi lbfgs l2 0.0001 0.275
multi lbfgs l2 0.0010 0.396
multi lbfgs l2 0.0100 0.462
multi lbfgs l2 0.1000 0.493
multi lbfgs l2 1.0000 0.495
ovr lbfgs l2 0.0001 0.259
ovr lbfgs l2 0.0010 0.383
ovr lbfgs l2 0.0100 0.452
ovr lbfgs l2 0.1000 0.499
ovr lbfgs l2 1.0000 0.505


In [50]:
for p in [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0]:
		print(f'Processing for {p}')
		# create name for model
		key = '%.4f' % p
		if p == 0.0:
			lm = LogisticRegression(multi_class='multinomial', solver='saga', penalty=None, max_iter=1000)
		else:
			lm = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l1', C=p, max_iter=1000)

		steps = [('over', smt), ('model', lm)]	
		pipeline = Pipeline(steps=steps)
		# evaluate pipeline
		cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
		scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
		if p == 0.0:
			results.append(['multi saga','0',key,mean(scores)])
		else:
			results.append(['multi saga','l1',key,mean(scores)])


print("\nHere are the results")
for result in results:
	print('%s %s %s %.3f' % (result[0], result[1], result[2], result[3]))

Processing for 0.0




Processing for 0.0001
Processing for 0.001
Processing for 0.01
Processing for 0.1


KeyboardInterrupt: 

In [None]:
for p in [0.0001, 0.001, 0.01, 0.1, 1.0]:
		print(f'Processing for {p}')
		# create name for model
		key = '%.4f'  p == 0.0:
			lm = LogisticRegression(multi_class='ovr', solver='liblinear', penalty=None, max_iter=1000)
		else:
			lm = Logistic% p  
		ifRegression(multi_class='ovr', solver='liblinear', penalty='l1', C=p, max_iter=1000)

		steps = [('over', SMOTE()), ('model', lm)]	
		pipeline = Pipeline(steps=steps)
		# evaluate pipeline
		cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
		scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
		if p == 0.0:
			results.append(['ovr liblinear','0',key,mean(scores)])
		else:
			results.append(['ovr liblinear','l1',key,mean(scores)])

print("\nHere are the results")
for result in results:
	print('%s %s %s %.3f' % (result[0], result[1], result[2], result[3]))
		


Here are the results
0 0.0000 0.489
l2 0.0000 0.489
l2 0.0001 0.275
l2 0.0010 0.397
l2 0.0100 0.462
l2 0.1000 0.489
l2 1.0000 0.490
l1 0.0001 0.285
l1 0.0010 0.277
l1 0.0100 0.331
l1 0.1000 0.479
l1 1.0000 0.505


In [None]:
best_lr_accuracy = max(result, key=lambda x: x[3])
print("Maximum acc:", best_lr_accuracy)

#### Overall Evaluation

In [None]:
accuracies = {
    'CNN': CNN_test_accuracy,
    'DNN': DNN_test_accuracy,
    'Random Forest': RF_accuracy,
    'Best Logistic Regression': best_lr_accuracy
}

# Print each model's accuracy for comparison
for model, accuracy in accuracies.items():
    print(f"Accuracy of {model}: {accuracy:.2f}%")

# Identify the best model
best_model = max(accuracies, key=accuracies.get)
best_accuracy = accuracies[best_model]
print(f"\nThe best model is {best_model} with an accuracy of {best_accuracy:.2f}%")
