In [3]:
#%pip install -U sentence-transformers
#!pip install -U matplotlib

# Import relevant libraries
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot  as plt
import pandas as pd
import os
from tqdm.auto import tqdm
import pickle
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = SentenceTransformer("all-MiniLM-L6-v2")

# Example sentences
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
    "This is a CS3244 Machine Learning Project.",
    "This is a SBERT Model, trying to do a document embedding"
]

# Example text
text = "This framework generates embeddings for each input sentence. Sentences are passed as a list of string. The quick brown fox jumps over the lazy dog. This is a CS3244 Machine Learning Project. This is a SBERT Model, trying to do a document embedding"


def generate_document_embedding(text):
    try:
        # Split paragraph into sentences
        sentences = text.split('.')

        # Remove empty strings
        sentences = [s.strip() for s in sentences if s.strip()]

        # Sentences are encoded by calling model.encode()
        embeddings = model.encode(sentences)

        # for sentence, embedding in zip(sentences, embeddings):
        #     print("Sentence:", sentence)
        #     print("Embedding:", embedding.size)

        # Aggregate sentence embeddings (simple averaging)
        paragraph_embedding = np.mean(embeddings, axis=0)

        # Optional: Normalize the final embedding
        paragraph_embedding /= np.linalg.norm(paragraph_embedding)

        # Use the paragraph_embedding for further tasks
        #print(paragraph_embedding)
        return paragraph_embedding
    except Exception as e:
        print("Error processing text:", e)
        return None




In [9]:
sbert_df = pd.read_csv(r'C:\Users\joel-\Downloads\Telegram Desktop\fact_with_outcome.csv')
sbert_df.head()

Unnamed: 0,year_court_caseid,text,Outcome
0,2000_SGHC_1,"Before going into the facts proper, a summary ...",Appeal allowed
1,2000_SGHC_2,The plaintiffs were originally known as Lian H...,Appeal dismissed
2,2000_SGHC_4,The appellant was a member of the club and a r...,Appeal dismissed
3,2000_SGHC_5,1. The plaintiffs are a ship-owning\r\n\r\ncom...,Outcome not explicitly mentioned
4,2000_SGHC_9,Although several other procedural points were ...,Appeal dismissed


In [14]:
sbert_df = sbert_df.dropna(subset=['text'])
sbert_df.text.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=[" "," "], regex=True, inplace=True)
sbert_df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sbert_df.text.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=[" "," "], regex=True, inplace=True)


Unnamed: 0,year_court_caseid,text,Outcome
0,2000_SGHC_1,"Before going into the facts proper, a summary ...",Appeal allowed
1,2000_SGHC_2,The plaintiffs were originally known as Lian H...,Appeal dismissed
2,2000_SGHC_4,The appellant was a member of the club and a r...,Appeal dismissed
3,2000_SGHC_5,1. The plaintiffs are a ship-owningcompany inc...,Outcome not explicitly mentioned
4,2000_SGHC_9,Although several other procedural points were ...,Appeal dismissed


In [15]:
sbert_df['SBERT'] = sbert_df['text'].progress_apply(lambda x: generate_document_embedding(x))
sbert_df.to_csv("sbert_embeddings_checkpoint.csv", index=False)
sbert_df.head()

100%|██████████| 2323/2323 [23:40:22<00:00, 36.69s/it]       


Unnamed: 0,year_court_caseid,text,Outcome,SBERT
0,2000_SGHC_1,"Before going into the facts proper, a summary ...",Appeal allowed,"[-0.13043316, 0.09703684, 0.038971096, 0.03582..."
1,2000_SGHC_2,The plaintiffs were originally known as Lian H...,Appeal dismissed,"[-0.055974834, 0.10519265, 0.03073486, -0.0026..."
2,2000_SGHC_4,The appellant was a member of the club and a r...,Appeal dismissed,"[-0.05205216, 0.1028741, -0.021516616, -0.0419..."
3,2000_SGHC_5,1. The plaintiffs are a ship-owningcompany inc...,Outcome not explicitly mentioned,"[-0.062574126, 0.060851052, -0.051644053, -0.0..."
4,2000_SGHC_9,Although several other procedural points were ...,Appeal dismissed,"[-0.02451861, 0.112310335, 0.0063367113, -0.00..."


In [16]:
df = pd.read_csv("sbert_embeddings_checkpoint.csv")
df.head()

Unnamed: 0,year_court_caseid,text,Outcome,SBERT
0,2000_SGHC_1,"Before going into the facts proper, a summary ...",Appeal allowed,[-1.30433157e-01 9.70368385e-02 3.89710963e-...
1,2000_SGHC_2,The plaintiffs were originally known as Lian H...,Appeal dismissed,[-5.59748337e-02 1.05192646e-01 3.07348594e-...
2,2000_SGHC_4,The appellant was a member of the club and a r...,Appeal dismissed,[-5.20521589e-02 1.02874100e-01 -2.15166155e-...
3,2000_SGHC_5,1. The plaintiffs are a ship-owningcompany inc...,Outcome not explicitly mentioned,[-6.25741258e-02 6.08510524e-02 -5.16440533e-...
4,2000_SGHC_9,Although several other procedural points were ...,Appeal dismissed,[-2.45186109e-02 1.12310335e-01 6.33671135e-...


In [17]:
df = df.dropna(subset=['text'])

In [18]:
df = df[df['Outcome'] != 'Outcome not explicitly mentioned']
df.head()

Unnamed: 0,year_court_caseid,text,Outcome,SBERT
0,2000_SGHC_1,"Before going into the facts proper, a summary ...",Appeal allowed,[-1.30433157e-01 9.70368385e-02 3.89710963e-...
1,2000_SGHC_2,The plaintiffs were originally known as Lian H...,Appeal dismissed,[-5.59748337e-02 1.05192646e-01 3.07348594e-...
2,2000_SGHC_4,The appellant was a member of the club and a r...,Appeal dismissed,[-5.20521589e-02 1.02874100e-01 -2.15166155e-...
4,2000_SGHC_9,Although several other procedural points were ...,Appeal dismissed,[-2.45186109e-02 1.12310335e-01 6.33671135e-...
5,2000_SGHC_12,1. The Plaintiffs are property developers. The...,Order accordingly,[-5.84279895e-02 8.58546048e-02 -2.78009158e-...


In [19]:
df.loc[df['Outcome'] == 'Order accordingly', 'Outcome'] = 'Appeal allowed'
df.head()

Unnamed: 0,year_court_caseid,text,Outcome,SBERT
0,2000_SGHC_1,"Before going into the facts proper, a summary ...",Appeal allowed,[-1.30433157e-01 9.70368385e-02 3.89710963e-...
1,2000_SGHC_2,The plaintiffs were originally known as Lian H...,Appeal dismissed,[-5.59748337e-02 1.05192646e-01 3.07348594e-...
2,2000_SGHC_4,The appellant was a member of the club and a r...,Appeal dismissed,[-5.20521589e-02 1.02874100e-01 -2.15166155e-...
4,2000_SGHC_9,Although several other procedural points were ...,Appeal dismissed,[-2.45186109e-02 1.12310335e-01 6.33671135e-...
5,2000_SGHC_12,1. The Plaintiffs are property developers. The...,Appeal allowed,[-5.84279895e-02 8.58546048e-02 -2.78009158e-...


In [20]:
df['Outcome'].value_counts()

Outcome
Appeal dismissed    951
Appeal allowed      869
Name: count, dtype: int64

In [21]:
df.head()

Unnamed: 0,year_court_caseid,text,Outcome,SBERT
0,2000_SGHC_1,"Before going into the facts proper, a summary ...",Appeal allowed,[-1.30433157e-01 9.70368385e-02 3.89710963e-...
1,2000_SGHC_2,The plaintiffs were originally known as Lian H...,Appeal dismissed,[-5.59748337e-02 1.05192646e-01 3.07348594e-...
2,2000_SGHC_4,The appellant was a member of the club and a r...,Appeal dismissed,[-5.20521589e-02 1.02874100e-01 -2.15166155e-...
4,2000_SGHC_9,Although several other procedural points were ...,Appeal dismissed,[-2.45186109e-02 1.12310335e-01 6.33671135e-...
5,2000_SGHC_12,1. The Plaintiffs are property developers. The...,Appeal allowed,[-5.84279895e-02 8.58546048e-02 -2.78009158e-...


In [25]:
df2 = df[['year_court_caseid', 'SBERT']]
df2.head()
df2

Unnamed: 0,year_court_caseid,SBERT
0,2000_SGHC_1,[-1.30433157e-01 9.70368385e-02 3.89710963e-...
1,2000_SGHC_2,[-5.59748337e-02 1.05192646e-01 3.07348594e-...
2,2000_SGHC_4,[-5.20521589e-02 1.02874100e-01 -2.15166155e-...
4,2000_SGHC_9,[-2.45186109e-02 1.12310335e-01 6.33671135e-...
5,2000_SGHC_12,[-5.84279895e-02 8.58546048e-02 -2.78009158e-...
...,...,...
2317,2015_SGCA_63,[-4.68444899e-02 1.36574641e-01 -9.34117008e-...
2319,2015_SGCA_68,[-5.99841028e-02 1.08868137e-01 1.55377686e-...
2320,2015_SGCA_69,[-1.97583530e-02 7.80741423e-02 3.90850380e-...
2321,2015_SGCA_70,[-2.92920154e-02 1.34911403e-01 -3.41241322e-...


In [26]:
df2.to_csv('SBERT_embeddings.csv', index=False)

In [7]:
df3 = pd.read_csv('extracted_data_w2v.csv')
df3.head()

Unnamed: 0,year_court_caseid,text,Outcome,outcome,Word2Vec
0,2000_SGHC_1,"Before going into the facts proper, a summary ...",Appeal allowed,Appeal allowed,"[-0.08911123585223103, 0.13744355276408474, 0...."
1,2000_SGHC_2,The plaintiffs were originally known as Lian H...,Appeal dismissed,Appeal dismissed,"[-0.11913180440870731, 0.10448532442864904, 0...."
2,2000_SGHC_4,The appellant was a member of the club and a r...,Appeal dismissed,Appeal dismissed,"[-0.02975214201303471, 0.11394181690479083, 0...."
3,2000_SGHC_5,1. The plaintiffs are a ship-owning\r\n\r\ncom...,Outcome not explicitly mentioned,Outcome not explicitly mentioned,"[-0.126107727909874, 0.09291261608194397, 0.04..."
4,2000_SGHC_9,Although several other procedural points were ...,Appeal dismissed,Appeal dismissed,"[-0.058357461182548934, 0.07636239769481189, -..."


In [8]:
df3 = df3[['year_court_caseid', 'text', 'Outcome', 'Word2Vec']]
df3.head()

Unnamed: 0,year_court_caseid,text,Outcome,Word2Vec
0,2000_SGHC_1,"Before going into the facts proper, a summary ...",Appeal allowed,"[-0.08911123585223103, 0.13744355276408474, 0...."
1,2000_SGHC_2,The plaintiffs were originally known as Lian H...,Appeal dismissed,"[-0.11913180440870731, 0.10448532442864904, 0...."
2,2000_SGHC_4,The appellant was a member of the club and a r...,Appeal dismissed,"[-0.02975214201303471, 0.11394181690479083, 0...."
3,2000_SGHC_5,1. The plaintiffs are a ship-owning\r\n\r\ncom...,Outcome not explicitly mentioned,"[-0.126107727909874, 0.09291261608194397, 0.04..."
4,2000_SGHC_9,Although several other procedural points were ...,Appeal dismissed,"[-0.058357461182548934, 0.07636239769481189, -..."


In [9]:
df3 = df3.dropna(subset=['text'])

In [10]:
df3 = df3[df3['Outcome'] != 'Outcome not explicitly mentioned']
df3.head()

Unnamed: 0,year_court_caseid,text,Outcome,Word2Vec
0,2000_SGHC_1,"Before going into the facts proper, a summary ...",Appeal allowed,"[-0.08911123585223103, 0.13744355276408474, 0...."
1,2000_SGHC_2,The plaintiffs were originally known as Lian H...,Appeal dismissed,"[-0.11913180440870731, 0.10448532442864904, 0...."
2,2000_SGHC_4,The appellant was a member of the club and a r...,Appeal dismissed,"[-0.02975214201303471, 0.11394181690479083, 0...."
4,2000_SGHC_9,Although several other procedural points were ...,Appeal dismissed,"[-0.058357461182548934, 0.07636239769481189, -..."
5,2000_SGHC_12,1. The Plaintiffs are property developers. The...,Order accordingly,"[0.0007421757429649399, 0.0034368738129528908,..."


In [11]:
df3.loc[df3['Outcome'] == 'Order accordingly', 'Outcome'] = 'Appeal allowed'
df3.head()

Unnamed: 0,year_court_caseid,text,Outcome,Word2Vec
0,2000_SGHC_1,"Before going into the facts proper, a summary ...",Appeal allowed,"[-0.08911123585223103, 0.13744355276408474, 0...."
1,2000_SGHC_2,The plaintiffs were originally known as Lian H...,Appeal dismissed,"[-0.11913180440870731, 0.10448532442864904, 0...."
2,2000_SGHC_4,The appellant was a member of the club and a r...,Appeal dismissed,"[-0.02975214201303471, 0.11394181690479083, 0...."
4,2000_SGHC_9,Although several other procedural points were ...,Appeal dismissed,"[-0.058357461182548934, 0.07636239769481189, -..."
5,2000_SGHC_12,1. The Plaintiffs are property developers. The...,Appeal allowed,"[0.0007421757429649399, 0.0034368738129528908,..."


In [13]:
df3 = df3[['year_court_caseid', 'Word2Vec']]

In [15]:
df3.to_csv('Word2Vec_embeddings.csv', index=False)

In [16]:
df4 = pd.read_csv('tf_idf.csv')
df4

Unnamed: 0,year_court_caseid,TF_IDF Vector
0,2000_SGHC_1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2000_SGHC_2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2000_SGHC_4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,2000_SGHC_9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2000_SGHC_12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
1815,2015_SGCA_62,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1816,2015_SGCA_63,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1817,2015_SGCA_68,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00211553..."
1818,2015_SGCA_69,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
