### Tips from prof

- Narrow scope of work (e.g. court level)

- Could try both binary/multi-class model outcomes and compare the performance 

- Change user from layperson to legal professional (and mention that this project is a stepping stone towards having layperson use the model)

- Link features to predicted outcome (if time permits can try using XGBoost with LIME for model interpretability)

- Can also try to see accuracy of models with different areas of law, lowest accuracy may be hardest area of law to predict


### Data setup

In [44]:
import os
import re
import spacy 
from spacy import displacy
import json
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
import nltk
import pyLDAvis
import pyLDAvis.gensim_models

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joelleng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joelleng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [45]:
# Load CSV files into DataFrames
areas_of_law_df = pd.read_csv("data/prediction_data/areas_of_law.csv")
coram_df = pd.read_csv("data/prediction_data/coram.csv")
fact_themes_df = pd.read_csv("data/prediction_data/fact_themes.csv")
sg_legal_cases_df = pd.read_csv("data/prediction_data/sg_legal_cases_dataset.csv")
target_rulings_df = pd.read_csv("data/prediction_data/target_rulings.csv")
# Load the JSON file into a dictionary
with open('data/prediction_data/issues.json') as f:
    issues_data = [json.loads(line) for line in f]
processed_df = pd.DataFrame(issues_data)

# Load the JSON file into a dictionary
with open('data/rawish_data/facts.json') as f:
    facts_data = [json.loads(line) for line in f]
raw_facts_df = pd.DataFrame(facts_data)

# Merge DataFrames
merged_df = pd.merge(areas_of_law_df, coram_df, on='casename', how='outer')
merged_df = pd.merge(merged_df, fact_themes_df, on='casename', how='outer')
merged_df = pd.merge(merged_df, sg_legal_cases_df, left_on='casename', right_on='filename', how='outer')
merged_df = pd.merge(merged_df, processed_df, on='casename', how='outer')
merged_df = pd.merge(merged_df, raw_facts_df, on='casename', how='outer')
merged_df = pd.merge(merged_df, target_rulings_df, on='casename', how='outer')

merged_df.drop(columns=['Unnamed: 0'], inplace=True)
merged_df.drop(columns=['filename'], inplace=True)

# Display the resulting DataFrame
print(merged_df.head())

           casename                                        area_of_law  \
0   2000_SGCA_1.pdf  {'civil procedure': ['pleadings'], 'res judica...   
1  2000_SGCA_10.pdf  {'contract': ['formation'], 'equity': ['defenc...   
2  2000_SGCA_11.pdf  {'contract': ['discharge'], 'damages': ['asses...   
3  2000_SGCA_12.pdf  {'courts and jurisdiction': ['court of appeal'...   
4  2000_SGCA_13.pdf                     {'criminal law': ['offences']}   

                                           Coram  themes court_level  \
0  ['Chao Hick Tin , L P Thean , Yong Pung How']     5.0        SGCA   
1   ['Chao Hick Tin , Tan Lee Meng , L P Thean']     5.0        SGCA   
2   ['Chao Hick Tin , Tan Lee Meng , L P Thean']     5.0        SGCA   
3   ['Chao Hick Tin , Tan Lee Meng , L P Thean']     5.0        SGCA   
4   ['Chao Hick Tin , Lai Kew Chai , L P Thean']    12.0        SGCA   

                                              issues  \
0  The claim was dismissed with costs by the\nHig...   
1  the cla

In [46]:
merged_df = merged_df.dropna()
merged_df.isna().sum()

casename       0
area_of_law    0
Coram          0
themes         0
court_level    0
issues         0
facts          0
target         0
dtype: int64

### Data Preprocessing

In [47]:
nan_counts = merged_df.isna().sum()
print(nan_counts)

#nas are probably those reassigned cases, coram has 7, i just drop them for now
na_target_rows = merged_df[merged_df['target'].isna()]
print(na_target_rows)

merged_df.dropna(axis=0, inplace=True)
print(merged_df.isna().sum())

#remove empty lists
merged_df = merged_df.query("area_of_law != '[]'")

#target is unbalanced
target_counts = merged_df['target'].value_counts()
print(target_counts)

merged_df = merged_df.reset_index(drop=True) # prevent nan values from appearing after one-hot

casename       0
area_of_law    0
Coram          0
themes         0
court_level    0
issues         0
facts          0
target         0
dtype: int64
Empty DataFrame
Columns: [casename, area_of_law, Coram, themes, court_level, issues, facts, target]
Index: []
casename       0
area_of_law    0
Coram          0
themes         0
court_level    0
issues         0
facts          0
target         0
dtype: int64
target
Favourable      3942
Unfavourable    2056
No outcome       795
Name: count, dtype: int64


In [48]:
merged_df['area_of_law'] = merged_df['area_of_law'].apply(ast.literal_eval)
merged_df['Coram'] = merged_df['Coram'].apply(ast.literal_eval)
merged_df.head(3)

Unnamed: 0,casename,area_of_law,Coram,themes,court_level,issues,facts,target
0,2000_SGCA_1.pdf,"{'civil procedure': ['pleadings'], 'res judica...","[Chao Hick Tin , L P Thean , Yong Pung How]",5.0,SGCA,The claim was dismissed with costs by the\nHig...,The facts\nThe appellant is the widow of one T...,Favourable
1,2000_SGCA_10.pdf,"{'contract': ['formation'], 'equity': ['defenc...","[Chao Hick Tin , Tan Lee Meng , L P Thean]",5.0,SGCA,the claim and\nagainst that decision this appe...,facts and surrounding circumstances including ...,Favourable
2,2000_SGCA_11.pdf,"{'contract': ['discharge'], 'damages': ['asses...","[Chao Hick Tin , Tan Lee Meng , L P Thean]",5.0,SGCA,The appeal \nThe questions which arise in this...,"Background \nThe first appellants, a French co...",No outcome


### Flatten areas_of_law

In [49]:
all_areas = []

for index, row in merged_df.iterrows():

    areas = row['area_of_law']
    flat_areas = []
    for main_area, sub_areas in areas.items():
        flat_areas.append(main_area)
        for sarea in sub_areas.copy():
            if len(sarea) > 33:
                sub_areas.remove(sarea)
        flat_areas.extend(sub_areas)
    all_areas.append(flat_areas)

In [50]:
for area in all_areas:
    if area == []:
        print(area)

### One hot encoding

In [51]:
# one-hot encode aol
mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(all_areas)

binary_aol_df = pd.DataFrame(binary_features, columns=mlb.classes_)
binary_aol_df = binary_aol_df.reset_index(drop=True)
processed_df = pd.concat([merged_df.drop('area_of_law', axis=1), binary_aol_df], axis=1)

print(processed_df.head(3))

           casename                                        Coram  themes  \
0   2000_SGCA_1.pdf  [Chao Hick Tin , L P Thean , Yong Pung How]     5.0   
1  2000_SGCA_10.pdf   [Chao Hick Tin , Tan Lee Meng , L P Thean]     5.0   
2  2000_SGCA_11.pdf   [Chao Hick Tin , Tan Lee Meng , L P Thean]     5.0   

  court_level                                             issues  \
0        SGCA  The claim was dismissed with costs by the\nHig...   
1        SGCA  the claim and\nagainst that decision this appe...   
2        SGCA  The appeal \nThe questions which arise in this...   

                                               facts      target  \
0  The facts\nThe appellant is the widow of one T...  Favourable   
1  facts and surrounding circumstances including ...  Favourable   
2  Background \nThe first appellants, a French co...  No outcome   

   "a larger sum being repaid"  "abet"  "an interest in any matter"  ...  \
0                            0       0                            0  ... 

In [52]:
print(processed_df.isna().sum())
# processed_df = processed_df.dropna()

casename                         0
Coram                            0
themes                           0
court_level                      0
issues                           0
                                ..
young offenders]                 0
“any claim  hereunder”           0
“any fire accidentally begin”    0
“charity proceedings”            0
“rash” and “negligent”           0
Length: 1378, dtype: int64


In [53]:
processed_df = processed_df[processed_df['Coram'].apply(lambda x: isinstance(x, list))]

In [54]:
# one-hot encode coram
mlb = MultiLabelBinarizer()
binary_features = mlb.fit_transform(processed_df['Coram'])

binary_coram_df = pd.DataFrame(binary_features, columns=mlb.classes_)
binary_coram_df = binary_coram_df.reset_index(drop=True)
processed_df = pd.concat([processed_df.drop('Coram', axis=1), binary_coram_df], axis=1)

print(processed_df.head())

           casename  themes court_level  \
0   2000_SGCA_1.pdf     5.0        SGCA   
1  2000_SGCA_10.pdf     5.0        SGCA   
2  2000_SGCA_11.pdf     5.0        SGCA   
3  2000_SGCA_12.pdf     5.0        SGCA   
4  2000_SGCA_13.pdf    12.0        SGCA   

                                              issues  \
0  The claim was dismissed with costs by the\nHig...   
1  the claim and\nagainst that decision this appe...   
2  The appeal \nThe questions which arise in this...   
3  the appeals from the assistant registrar. In h...   
4  the appeal on 24 January 2000 and dismissed it...   

                                               facts        target  \
0  The facts\nThe appellant is the widow of one T...    Favourable   
1  facts and surrounding circumstances including ...    Favourable   
2  Background \nThe first appellants, a French co...    No outcome   
3  Background\nMicrosoft, Adobe and Autodesk are ...  Unfavourable   
4  facts. Mere assertion would not suffice. In ex...  

In [55]:
print(processed_df.isna().sum())

casename             0
themes               0
court_level          0
issues               0
facts                0
                    ..
Wong Li Kok, Alex    0
Woo Bih Li           0
Yeong Zee Kin SAR    0
Yong Pung How        0
Zhuo Wenzhao AR      0
Length: 1597, dtype: int64


### Topic Modelling

#### Helper functions for Topic Modelling 

In [56]:
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'\W*\b(?!no)\w{1,2}\b', '', text)
    stop_words = set(stopwords.words('english'))
    legal_stopwords = ('appellant', 'respondent', 'plaintiff', 'defendant', 'mr', 'mrs', 'dr', 'mdm', 'court','version', 'hr', 'would', 'case', 'sghc', 'court', 'sgca', 'slr', 'sgdc', 'also', 'first', 'person', 'statement', 'line', 'para', 'fact', 'one', 'may', 'time', 'could', 'next', 'legal', 'issues', 'issue')
    stop_words.update(legal_stopwords)
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return words

#### Topic Modelling For Facts

In [57]:
data = []

processed_df['processed_facts'] = processed_df['facts'].apply(preprocess_text)
processed_df.drop(columns=['facts'], inplace=True)
print(processed_df["processed_facts"])
print("Finished preprocessing text")

texts, article = [], []
print("Performing topic modelling")
for fact in processed_df['processed_facts']:
    for word in fact:
        article.append(word)
        
    texts.append(article)
    article = []

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# print(lda_model.show_topics())

# best_coherence = -1
# best_lda = None
# for num_topics in range(5, 31, 5):
#     # Train LDA model
#     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                                 id2word=dictionary,
#                                                 num_topics=num_topics,
#                                                 random_state=42)
    
#     # Compute coherence score
#     coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
#     coherence_score = coherence_model_lda.get_coherence()
    
#     print(f"Number of topics: {num_topics}, Coherence Score: {coherence_score}")
#     if coherence_score > best_coherence:
#         best_coherence = coherence_score
#         best_topic = num_topics
# print(f"Best no of topic: {best_topic} and Best Coherence Score: {best_coherence}")

#use the best model (result from above codes: 10 topics)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=10,
                                                random_state=42)

#inspiration from https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
topics_matrix = lda_model[corpus]
topics = []
# Iterate over each document's topic distribution
# Get the topic with the highest probability
for doc in topics_matrix:
    topic = max(doc, key=lambda x: x[1])[0]
    topics.append(topic)

processed_df['facts_topic'] = topics
processed_df = processed_df.reset_index(drop=True)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

0       [fact, widow, tan, geok, tee, deceased, sue, c...
1       [fact, surrounding, circumstance, including, a...
2       [background, appellant, french, company, secon...
3       [background, microsoft, adobe, autodesk, compa...
4       [fact, mere, assertion, suffice, exh, said, st...
                              ...                        
6788    [fact, accused, low, sze, song, low, year, old...
6789    [fact, giving, opinion, representation, amount...
6790    [fact, party, karan, bagga, litigant, proceedi...
6791                                                   []
6792    [fact, party, towa, company, incorporated, jap...
Name: processed_facts, Length: 6793, dtype: object
Finished preprocessing text
Performing topic modelling


In [58]:
print(processed_df.isna().sum())

casename             0
themes               0
court_level          0
issues               0
target               0
                    ..
Yeong Zee Kin SAR    0
Yong Pung How        0
Zhuo Wenzhao AR      0
processed_facts      0
facts_topic          0
Length: 1598, dtype: int64


#### Topic Modelling For Issues

In [59]:
data = []

processed_df['processed_issues'] = processed_df['issues'].apply(preprocess_text)
processed_df.drop(columns=['issues'], inplace=True)
print(processed_df["processed_issues"])
print("Finished preprocessing text")

texts, article = [], []
print("Performing topic modelling")
for issue in processed_df['processed_issues']:
    for word in issue:
        article.append(word)
        
    texts.append(article)
    article = []

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# print(lda_model.show_topics())

# best_coherence = -1
# best_lda = None
# for num_topics in range(5, 31, 5):
#     # Train LDA model
#     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                                 id2word=dictionary,
#                                                 num_topics=num_topics,
#                                                 random_state=42)
#     # Compute coherence score
#     coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
#     coherence_score = coherence_model_lda.get_coherence()
    
#     print(f"Number of topics: {num_topics}, Coherence Score: {coherence_score}")
#     if coherence_score > best_coherence:
#         best_coherence = coherence_score
#         best_topic = num_topics
# print(f"Best no of topic: {best_topic} and Best Coherence Score: {best_coherence}")

#use the best model (result from above codes: 25 topics)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=25,
                                                random_state=42)

#inspiration from https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
topics_matrix = lda_model[corpus]
topics = []
# Iterate over each document's topic distribution
# Get the topic with the highest probability
for doc in topics_matrix:
    topic = max(doc, key=lambda x: x[1])[0]
    topics.append(topic)
    
processed_df['issues_topic'] = topics
processed_df = processed_df.reset_index(drop=True)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

0       [claim, dismissed, cost, high, decision, fook,...
1       [claim, decision, appeal, brought, background,...
2       [appeal, question, arise, appeal, follows, app...
3       [appeal, assistant, registrar, ground, judgmen...
4       [appeal, january, dismissed, give, reason, evi...
                              ...                        
6788    [sub, arise, consideration, whether, low, siva...
6789    [claim, conspiracy, defraud, fault, ken, sally...
6790    [relating, defence, justification, qualified, ...
6791                                                   []
6792    [background, dispute, towa, commenced, suit, a...
Name: processed_issues, Length: 6793, dtype: object
Finished preprocessing text
Performing topic modelling


In [60]:
processed_df.drop(columns=['themes'], inplace=True)
processed_df

Unnamed: 0,casename,court_level,target,"""a larger sum being repaid""","""abet""","""an interest in any matter""","""any person""","""appeal""","""arising out of""","""available market""",...,Vinodh Coomaraswamy (as he then was),"Wong Li Kok, Alex",Woo Bih Li,Yeong Zee Kin SAR,Yong Pung How,Zhuo Wenzhao AR,processed_facts,facts_topic,processed_issues,issues_topic
0,2000_SGCA_1.pdf,SGCA,Favourable,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"[fact, widow, tan, geok, tee, deceased, sue, c...",8,"[claim, dismissed, cost, high, decision, fook,...",19
1,2000_SGCA_10.pdf,SGCA,Favourable,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"[fact, surrounding, circumstance, including, a...",2,"[claim, decision, appeal, brought, background,...",7
2,2000_SGCA_11.pdf,SGCA,No outcome,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"[background, appellant, french, company, secon...",0,"[appeal, question, arise, appeal, follows, app...",0
3,2000_SGCA_12.pdf,SGCA,Unfavourable,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"[background, microsoft, adobe, autodesk, compa...",3,"[appeal, assistant, registrar, ground, judgmen...",12
4,2000_SGCA_13.pdf,SGCA,Unfavourable,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"[fact, mere, assertion, suffice, exh, said, st...",1,"[appeal, january, dismissed, give, reason, evi...",23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6788,2023_SGHC_95.pdf,SGHC,No outcome,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"[fact, accused, low, sze, song, low, year, old...",6,"[sub, arise, consideration, whether, low, siva...",2
6789,2023_SGHC_96.pdf,SGHC,Unfavourable,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"[fact, giving, opinion, representation, amount...",7,"[claim, conspiracy, defraud, fault, ken, sally...",9
6790,2023_SGHC_97.pdf,SGHC,Unfavourable,0,0,0,0,0,0,0,...,0,0,0,0,0,0,"[fact, party, karan, bagga, litigant, proceedi...",3,"[relating, defence, justification, qualified, ...",12
6791,2023_SGHC_98.pdf,SGHC,Favourable,0,0,0,0,0,0,0,...,0,0,0,0,0,0,[],0,[],0


### splitting

In [61]:
print(processed_df)

              casename court_level        target  "a larger sum being repaid"  \
0      2000_SGCA_1.pdf        SGCA    Favourable                            0   
1     2000_SGCA_10.pdf        SGCA    Favourable                            0   
2     2000_SGCA_11.pdf        SGCA    No outcome                            0   
3     2000_SGCA_12.pdf        SGCA  Unfavourable                            0   
4     2000_SGCA_13.pdf        SGCA  Unfavourable                            0   
...                ...         ...           ...                          ...   
6788  2023_SGHC_95.pdf        SGHC    No outcome                            0   
6789  2023_SGHC_96.pdf        SGHC  Unfavourable                            0   
6790  2023_SGHC_97.pdf        SGHC  Unfavourable                            0   
6791  2023_SGHC_98.pdf        SGHC    Favourable                            0   
6792  2023_SGHC_99.pdf        SGHC    Favourable                            0   

      "abet"  "an interest 

In [62]:
print(processed_df.isna().sum())

casename                       0
court_level                    0
target                         0
"a larger sum being repaid"    0
"abet"                         0
                              ..
Zhuo Wenzhao AR                0
processed_facts                0
facts_topic                    0
processed_issues               0
issues_topic                   0
Length: 1598, dtype: int64


In [372]:
X = processed_df.drop(columns=['target'])
y = processed_df['target']

stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
for train_index, remaining_index in stratified_split.split(X, y):
    X_train, X_test_val = X.iloc[train_index], X.iloc[remaining_index]
    y_train, y_test_val = y.iloc[train_index], y.iloc[remaining_index]

#balanced dataset (target variable was imbalanced Favourable 5006 Unfavourable 2523 No outcome 984)
#randomly found one online, can be changed -> need to check am i doing this right 
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

#split further from X_test_val into X_val and X_test
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=42, stratify=y_test_val)

### Feature Engineering

In [373]:
## One hot encoding
## vector embedding
## pipeline
X_train_resampled['SGCA'] = X_train_resampled['court_level'].apply(lambda x: 1 if x == 'SGCA' else 0)
X_train_resampled['SGHC'] = X_train_resampled['court_level'].apply(lambda x: 1 if x == 'SGHC' else 0)

X_test['SGCA'] = X_test['court_level'].apply(lambda x: 1 if x == 'SGCA' else 0)
X_test['SGHC'] = X_test['court_level'].apply(lambda x: 1 if x == 'SGHC' else 0)

X_val['SGCA'] = X_val['court_level'].apply(lambda x: 1 if x == 'SGCA' else 0)
X_val['SGHC'] = X_val['court_level'].apply(lambda x: 1 if x == 'SGHC' else 0)

X_train_resampled = X_train_resampled.drop(columns=['court_level', 'processed_facts', 'processed_issues'])
X_test = X_test.drop(columns=['court_level', 'processed_facts', 'processed_issues'])
X_val = X_val.drop(columns=['court_level', 'processed_facts', 'processed_issues'])


  X_train_resampled['SGCA'] = X_train_resampled['court_level'].apply(lambda x: 1 if x == 'SGCA' else 0)
  X_train_resampled['SGHC'] = X_train_resampled['court_level'].apply(lambda x: 1 if x == 'SGHC' else 0)


In [374]:
# Perform modelling
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import tensorflow as tf

class Args:
  epochs = 20
  lr = 0.001
  use_cuda=False
  gamma = 0.7
  log_interval = 10
  seed = 1

args = Args()

device = torch.device("cuda" if args.use_cuda else "cpu")

### Modeling

In [385]:
class Net(nn.Module):  # Defines a new neural network architecture as a class that inherits from the PyTorch base class nn.Module.
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv1d(1, 64, 3, 1,1, bias=True)
        # Define the first 1D convolution layer. Takes 1 input channel, outputs 32 channels, kernel size is 3, stride is 1, padding is 1.
        self.Bn1 = nn.BatchNorm1d(64)
        # Apply Batch Normalization to the output of the first convolutional layer.
        self.dropout = nn.Dropout(0.3)
        self.pool1 = nn.AvgPool1d(kernel_size=2, stride=2)
        # Apply 1D Average Pooling after the first Batch Normalization. The kernel size and stride are 2.

        self.conv2 = nn.Conv1d(64, 64, 3, 1,1, bias=True)
        self.Bn2 = nn.BatchNorm1d(64)
        self.pool2 = nn.AvgPool1d(kernel_size=2, stride=2)

        self.conv3 = nn.Conv1d(64, 64, 3, 1,1, bias=True)
        self.Bn3 = nn.BatchNorm1d(64)
        self.pool3 = nn.AvgPool1d(kernel_size=2, stride=2)

        self.fc1 = nn.Linear(12736, 3, bias=True)


    def forward(self, x):
        x = F.relu(self.Bn1(self.conv1(x)))
        # Pass the input through the first convolutional layer, then Batch Normalization, and then apply ReLU activation.
        x = self.dropout(x)
        x = self.pool1(x)
        # Apply Average Pooling to the output of the previous step.
        x = F.relu(self.Bn2(self.conv2(x)))
        x = self.dropout(x)
        x = self.pool2(x)

        x = F.relu(self.Bn3(self.conv3(x)))
        x = self.dropout(x)
        x = self.pool3(x)
        
        x = torch.flatten(x, 1)
        # Flatten the output from the previous step. This is necessary because fully connected layers expect a 1D input.
        x = self.fc1(x)
        # Pass the flattened output through the fully connected layer. This is the output of the network.
        return x
    
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()  # Set the model to training mode

    for batch_idx, (data, target) in enumerate(train_loader):  # Loop over each batch from the training set
        data, target = data.to(device), target.to(device)  # Move the data to the device that is used

        target = target.long()  # Make sure that target data is long type (necessary for loss function)

        optimizer.zero_grad()  # Clear gradients from the previous training step
        output = model(data)  # Run forward pass (model predictions)

        loss = F.cross_entropy(output, target)  # Calculate the loss between the output and target
        loss.backward()  # Perform backpropagation (calculate gradients of loss w.r.t. parameters)
        optimizer.step()  # Update the model parameters

        if batch_idx % args.log_interval == 0:  # Print log info for specified interval
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_idx * len(data), len(train_loader.dataset),100. * batch_idx / len(train_loader), loss.item()))



def test(model, device, test_loader):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0
    correct = 0

    with torch.no_grad():  # Deactivates autograd, reduces memory usage and speeds up computations
        for data, target in test_loader:  # Loop over each batch from the testing set
            
            data, target = data.to(device), target.to(device)  # Move the data to the device that is used

            target = target.long()  # Convert target to long after adjusting value
            output = model(data)  # Run forward pass (model predictions)
            test_loss += F.cross_entropy(output, target, reduction='sum').item()  # Sum up the batch loss
            pred = output.argmax(dim=1, keepdim=True)  # Get the index of the max log-probability as the predicted output
            print(f"Pred: {pred}")
            correct += pred.eq(target.view_as(pred)).sum().item()  # Count correct predictions

    test_loss /= len(test_loader.dataset)  # Calculate the average loss

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(test_loss, correct, len(test_loader.dataset),100. * correct / len(test_loader.dataset)))
    return correct  # Return the number of correctly classified samples


In [376]:
X_train_resampled = X_train_resampled.iloc[:, 1:1596].copy()
X_train_resampled = torch.tensor(X_train_resampled.values, dtype=torch.float32).to(device)

In [377]:
print(f'Shape of X_train_resampled: {X_train_resampled.shape}')

Shape of X_train_resampled: torch.Size([8277, 1595])


In [378]:
X_test = X_test.iloc[:, 1:1596].copy()
X_test = torch.tensor(X_test.values, dtype=torch.float32).to(device)
print(f'Shape of X_test: {X_test.shape}')

Shape of X_test: torch.Size([1019, 1595])


In [379]:
X_val = X_val.iloc[:, 1:1596].copy()
X_val = torch.tensor(X_val.values, dtype=torch.float32).to(device)
print(f'Shape of X_val: {X_val.shape}')

Shape of X_val: torch.Size([1019, 1595])


In [380]:
mapping = {'Favourable': 2, 'Unfavourable': 0, 'No outcome':1}

y_train_resampled, y_test, y_val = y_train_resampled.copy().map(mapping), y_test.copy().map(mapping), y_val.copy().map(mapping)

y_train_resampled, y_test, y_val = torch.tensor(y_train_resampled.values).to(device), torch.tensor(y_test.values).to(device), torch.tensor(y_val.values).to(device)

In [381]:
X_train_resampled = X_train_resampled.reshape(X_train_resampled.shape[0],1,X_train_resampled.shape[1])
X_test = X_test.reshape(X_test.shape[0],1,X_test.shape[1])
X_val = X_val.reshape(X_val.shape[0],1,X_val.shape[1])



In [386]:
print(X_train_resampled.shape)
torch.manual_seed(args.seed)

model = Net().to(device)

for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor].size())

#Form training and testing dataset
optimizer = optim.Adam(model.parameters(), lr=args.lr)

train_dataset = torch.utils.data.TensorDataset(X_train_resampled, y_train_resampled)
test_dataset = torch.utils.data.TensorDataset(X_val, y_val)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)
scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

#Model training
ACC = 0
for epoch in range(1, args.epochs + 1):
    train(args, model, device, train_loader, optimizer, epoch)
    ACC_ = test(model, device, test_loader)
    if ACC_>ACC or ACC_ == ACC:
        ACC = ACC_
        torch.save(model.state_dict(), "Baseline_CNN.pt")

    scheduler.step()

print(ACC)


torch.Size([8277, 1, 1595])
conv1.weight 	 torch.Size([64, 1, 3])
conv1.bias 	 torch.Size([64])
Bn1.weight 	 torch.Size([64])
Bn1.bias 	 torch.Size([64])
Bn1.running_mean 	 torch.Size([64])
Bn1.running_var 	 torch.Size([64])
Bn1.num_batches_tracked 	 torch.Size([])
conv2.weight 	 torch.Size([64, 64, 3])
conv2.bias 	 torch.Size([64])
Bn2.weight 	 torch.Size([64])
Bn2.bias 	 torch.Size([64])
Bn2.running_mean 	 torch.Size([64])
Bn2.running_var 	 torch.Size([64])
Bn2.num_batches_tracked 	 torch.Size([])
conv3.weight 	 torch.Size([64, 64, 3])
conv3.bias 	 torch.Size([64])
Bn3.weight 	 torch.Size([64])
Bn3.bias 	 torch.Size([64])
Bn3.running_mean 	 torch.Size([64])
Bn3.running_var 	 torch.Size([64])
Bn3.num_batches_tracked 	 torch.Size([])
fc1.weight 	 torch.Size([3, 12736])
fc1.bias 	 torch.Size([3])

Test set: Average loss: 1.0010, Accuracy: 443/1019 (43%)


Test set: Average loss: 1.0585, Accuracy: 434/1019 (43%)


Test set: Average loss: 1.0710, Accuracy: 431/1019 (42%)


Test set: Avera

### Evaluation

In [306]:
## Perform Evaluation