In [12]:
import pandas as pd
df = pd.read_csv('SupremeCourt_cases710.csv')
df.head()

Unnamed: 0,Case Title,Link,Case Content
0,The State Of Tamil Nadu vs The Governor Of Tam...,https://indiankanoon.org/docfragment/82729634/...,Take notes as you read a judgment using ourVir...
1,Independent Sugar Corporation Limited vs Giris...,https://indiankanoon.org/docfragment/117249167...,Take notes as you read a judgment using ourVir...
2,Piramal Capital And Housing Finance ... vs 63 ...,https://indiankanoon.org/docfragment/190999006...,Take notes as you read a judgment using ourVir...
3,In Re Recruitment Of Visually Impaired ... vs ...,https://indiankanoon.org/docfragment/158218833...,Take notes as you read a judgment using ourVir...
4,Union Of India vs Future Gaming Solutions P.Lt...,https://indiankanoon.org/docfragment/117744026...,Take notes as you read a judgment using ourVir...


In [14]:
df['Link'][8]

'https://indiankanoon.org/docfragment/118192392/?formInput=doctypes%3A%20supremecourt%20year%3A%202025'

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

legal_terms = {'herein', 'thereof', 'whereof', 'hereto', 'therein'}
stop_words = set(stopwords.words('english')) - legal_terms
lemmatizer = WordNetLemmatizer()

cleaned = df[['Case Title', 'Link']].copy()

def clean_text(text):

    text = str(text)

    text = text.lower()

    citations = re.findall(r'air\s+\d{4}\s+\w+\s+\d+', text)

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    text = re.sub(r'\s+', ' ', text).strip()

    sentences = nltk.sent_tokenize(text)

    cleaned_sentences = []
    for sent in sentences:
        tokens = nltk.word_tokenize(sent)
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        cleaned_sentences.append(' '.join(tokens))
    cleaned_text = ' '.join(cleaned_sentences)

    if citations:
        cleaned_text += ' ' + ' '.join(citations)
    return cleaned_text

df["Case Content"] = df["Case Content"].apply(clean_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vanju\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vanju\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vanju\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vanju\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df.head(3)

Unnamed: 0,Case Title,Link,Case Content
0,The State Of Tamil Nadu vs The Governor Of Tam...,https://indiankanoon.org/docfragment/82729634/...,take note read judgment using ourvirtual legal...
1,Independent Sugar Corporation Limited vs Giris...,https://indiankanoon.org/docfragment/117249167...,take note read judgment using ourvirtual legal...
2,Piramal Capital And Housing Finance ... vs 63 ...,https://indiankanoon.org/docfragment/190999006...,take note read judgment using ourvirtual legal...


In [6]:
category_keywords = {
    # LABOUR MATTERS
    'Dismissal': ['termination', 'misconduct', 'disciplinary action', 'show cause notice', 'natural justice violation', 'departmental inquiry', 'charge sheet', 'termination benefits'],
    'Retrenchment': ['retrenchment compensation', 'section 25N', 'layoff', 'closure', 'retrenchment notice', 'industrial disputes act 1947', 'compensation calculation', 'last come first go', 'government permission', 'retrenchment approval'],
    'Contract Labour': ['contract labour regulation', 'principal employer', 'licensing contractor', 'abolition notification', 'sham contract', 'tripartite agreement', 'labour commissioner', 'contract worker benefits', 'section 10', 'direct employment'],
    'Matters relating to wages': ['minimum wages', 'wage revision', 'dearness allowance', 'pay parity', 'arrears calculation', 'wage board', 'equal remuneration', 'basic pay', 'allowances dispute', 'wage regularization'],
    'Workmen Compensation Act': ['employment injury schedule iii','section 3 employer liability','occupational disease notification','disablement percentage assessment','compensation commissioner appeal','dependency benefits calculation','employer liability insurance','accident arising employment','mesothelioma compensation','silicosis diagnosis report'],
    'ESI': ['employees state insurance', 'esi contribution', 'medical benefit', 'disability benefit', 'dependent benefit', 'esi corporation', 'section 46', 'insurable employment', 'registration certificate', 'benefit period'],
    'Factory Act': ['factory license', 'occupier liability', 'hazardous process', 'section 7A', 'annual leave', 'health measures', 'safety officer', 'canteen facilities', 'creche facility', 'dangerous machinery'],
    'Industrial Employment (Standing Order)': ['standing order certification', 'service rules', 'shift working', 'attendance', 'leave rules', 'classification of workers', 'suspension pending inquiry', 'holiday list', 'termination procedure', 'grievance redressal'],
    'Payment of Gratuity Act': ['gratuity calculation', 'section 4', 'forfeiture of gratuity', 'gratuity eligibility', 'employer default', 'gratuity recovery', 'superannuation'],
    'Trade Unions Act': ['trade union registration', 'section 8', 'unfair labor practice', 'recognition dispute', 'union elections', 'membership verification', 'collective bargaining', 'union funds', 'protected workmen', 'union rivalry'],

    # RENT ACT MATTERS
    'Eviction matters of personal necessity': ['bona fide need', 'self-occupation', 'family expansion', 'alternative accommodation', 'comparative hardship', 'landlord requirement', 'residential purpose', 'dependent family', 'business expansion', 'medical necessity'],
    'Eviction matters for re-building': ['structural alteration', 'demolition notice', 'municipal notice', 'reconstruction plan', 'architect certificate', 'building stability', 'redevelopment agreement', 'tenant relocation', 'construction permission', 'completion bond'],
    'Eviction matters of sub-letting': ['unauthorized occupant', 'sub-tenant', 'parting possession', 'license agreement', 'tenancy rights', 'rent receipt', 'tenant verification', 'lock-out period', 'lease violation', 'third party possession'],
    'Arrears of rent': ['rent default', 'monthly rent', 'standard rent', 'rent enhancement', 'deposit scheme', 'tenant default', 'arrears calculation', 'interest on arrears', 'money decree', 'rent receipt'],
    'Enhancement of rent': ['fair rent', 'market rate', 'rent control act', 'cost index', 'capital value', 'improvement cost', 'section 6', 'prevailing rent', 'landlord application', 'tenant objection'],

    # DIRECT TAXES MATTER
    'Income Tax Reference': ['substantial question of law', 'section 256', 'high court reference', 'tax tribunal order', 'case stated', 'appellate reference', 'tax evasion', 'assessment year', 'revision petition', 'tax deduction'],
    'Wealth Tax Act': ['net wealth', 'asset valuation', 'urban land','unproductive assets', 'valuation date', 'wealth statement', 'exempt assets', 'jewellery valuation', 'section 17 wealth tax','wealth tax return'],
    'Capital Gains': ['capital asset', 'transfer u/s 2(47)', 'indexation benefit', 'long term gain', 'cost inflation index', 'section 54', 'exemption claim', 'sale consideration', 'capital gains bond', 'stamp duty value'],
    'Re-assessment': ['reopening assessment', 'section 147', 'income escapement', 'reasons recorded', 'change of opinion', 'fresh material', 'limitation period', 'notice validity', 'concealment income', 'undisclosed investment'],
    'Settlement Commission': ['settlement application', 'full disclosure', 'immunity petition', 'settlement order', 'undisclosed income', 'section 245C', 'commission hearing', 'settlement terms', 'case pending', 'finality of order'],

    # CRIMINAL MATTERS
    'Capital punishment': ['rarest of rare', 'death sentence confirmation', 'section 302 IPC', 'aggravating factors', 'mitigating circumstances', 'deterrent punishment', 'execution warrant', 'mercy petition', 'death row convict', 'commutation plea'],
    'Dowry death': ['section 304B', 'dowry demand', 'soon before death', 'cruelty evidence', 'stridhan recovery', 'domestic violence', 'marriage harassment', 'dowry prohibition', 'bride burning', 'death within 7 years'],
    'Prevention of Corruption Act': ['illegal gratification', 'trap case', 'disproportionate assets', 'section 13', 'public servant', 'vigilance inquiry', 'sanction prosecution', 'bribe money', 'demand proof', 'recovery certificate'],
    'NDPS Act': ['commercial quantity', 'psychotropic substance', 'section 37', 'conscious possession', 'sampling procedure', 'independent witness', 'mandatory minimum', 'contraband seizure', 'drug analysis', 'cartel involvement'],
    'Sexual harassment': ['section 354', 'modesty outrage', 'voyeurism', 'stalking', 'workplace harassment', 'POSH Act', 'zero FIR', 'acid attack', 'compensation award', 'victim protection'],

    # SERVICE MATTERS
    'Promotion': ['seniority cum merit', 'departmental promotion', 'reservation roster', 'sealed cover procedure', 'dpc meeting', 'eligible list', 'promotion criteria', 'benchmarking', 'notional promotion', 'promotion policy'],
    'Pension': ['pension revision', 'commutation amount', 'family pension', 'gratuity payment', 'pension sanction', 'qualifying service', 'invalid pension', 'delay in pension', 'pension cut', 'pensionary benefits'],
    'Disciplinary proceedings': ['charge memo', 'departmental inquiry', 'evidence act violation', 'punishment order', 'minor penalty', 'major penalty', 'cvc guidelines', 'inquiry officer', 'defense assistant', 'ex parte inquiry'],
    'Reservation in service': ['creamy layer reservation','reservation policy service','mandal commission', 'roster system', 'carry forward rule', 'promotion reservation', 'reservation ceiling', 'social backwardness', 'income criteria', 'caste certificate'],
    'Voluntary Retirement': ['vr scheme', 'acceptance condition', 'resignation difference', 'pension eligibility', 'notice period', 'vr benefits', 'forced retirement', 'section 18', 'voluntary separation', 'golden handshake'],

    # INDIRECT TAXES MATTERS
    'Interpretation of the Customs Act': ['customs valuation', 'bill of entry', 'section 14', 'import manifest', 'baggage rules', 'prohibited goods', 'customs duty exemption', 'project imports', 'redeployment certificate', 'customs appeal'],
    'Central Excise Act': ['excisable goods', 'cenvat credit', 'manufacturing process', 'rule 6', 'clearance certificate', 'duty demand notice', 'modvat', 'central excise tariff', 'factory gate', 'removal of goods'],
    'Service Tax': ['taxable service', 'reverse charge', 'mega exemption', 'place of provision', 'point of taxation', 'export of service', 'input service distributor', 'service tax audit', 'abatement claim', 'works contract'],
    'Anti Dumping Duty': ['margin of dumping', 'injury determination', 'designated authority', 'like article', 'normal value', 'landed value', 'provisional duty', 'sunset review', 'domestic industry', 'reference price'],

    # LAND ACQUISITION & REQUISITION
    'Compensation challenges': ['market value', 'section 23', 'solatium', 'additional compensation', 'potential value', 'expert valuation', 'land acquisition award', 'belting system', 'severance compensation', 'injurious affection'],
    'Defence acquisition': ['urgency clause', 'section 17 land acquisition','defence production', 'strategic purpose', 'emergency acquisition', 'national security', 'encroachment removal', 'defence installation', 'buffer zone', 'restricted area'],
    
    # ACADEMIC MATTERS
    'Examination matters': ['revaluation', 'grace marks', 'malpractice', 'answer sheet inspection', 'moderation policy', 'supplementary exam', 're-test order', 'paper leakage', 'unfair means', 'marking scheme'],
    'Educational management': ['deemed university', 'minority institution', 'mandatory disclosure', 'inspection report''fee regulation', 'affiliation withdrawal', 'teacher qualification', 'student union', 'anti-ragging', 'reservation policy education',],
    
    # LETTER PETITION & PIL MATTERS
    'Environmental PIL': ['carbon emissions', 'coastal regulation', 'forest clearance', 'wildlife protection', 'environment clearance', 'biosphere reserve', 'polluter pays', 'sustainable development', 'remediation plan', 'environmental audit'],
    'Human rights PIL': ['custodial death', 'prison reform', 'manual scavenging', 'child rights', 'rehabilitation plan', 'bonded labour', 'compensation scheme', 'victim protection', 'legal aid', 'juvenile justice'],

    # ELECTION MATTERS
    'Election petitions': ['corrupt practice', 'booth capturing', 'false affidavit', 'election expenses', 'nomination rejection', 'vote recount', 'election symbol', 'result declaration', 'electoral bonds', 'model code violation'],
    'MP/MLA disqualification': ['office of profit', 'defection law', 'anti-defection', 'resignation validity', 'membership cessation', 'disqualification petition', 'floor test', 'constitutional post', 'legislative privileges', 'breach of oath'],

    # COMPANY LAW, MRTP, SEBI
    'SEBI matters': ['insider trading', 'substantial acquisition', 'takeover code', 'disclosure norms', 'fraudulent trade', 'FII regulations', 'creamy layer investor','delisting shares', 'buyback offer', 'open offer'],
    'Competition Commission': ['anti-competitive', 'abuse of dominance', 'cartelization', 'combination regulation', 'leniency application', 'turnover penalty', 'market share', 'predatory pricing', 'vertical agreement', 'dominant position'],

    # ARBITRATION MATTERS
    'Arbitration challenges': ['arbitral award', 'section 34', 'patent illegality', 'public policy', 'arbitrator appointment', 'jurisdictional error', 'unilateral appointment', 'emergency arbitrator', 'arbitration clause', 'seat vs venue'],

    # COMPENSATION MATTERS
    'Railway accidents': ['untoward incident', 'running train', 'track negligence', 'railway liability', 'passenger ticket', 'level crossing', 'compensation tariff', 'railway tribunal', 'Fatal Accident Act', 'dependency claim'],
    'Telecom disputes': ['call drop', 'spectrum charges', 'interconnection fee', 'quality of service', 'tariff fixation', 'licence fee', 'port charges', 'access deficit', 'subscriber compensation', 'network failure'],

    # HABEAS CORPUS
    'Habeas Corpus': ['illegal detention', 'custody certificate', 'production order', 'missing person', 'wrongful confinement', 'custody jurisdiction', 'habeas corpus writ', 'preventive detention', 'custody transfer', 'detention validity'],

    # APPEAL AGAINST STATUTORY BODIES
    'Tribunal appeals': ['NCLT order', 'SAT decision', 'TDSAT ruling', 'appellate tribunal', 'technical member', 'jurisdictional error', 'perverse finding', 'substantial question', 'limitation period', 'pre-deposit'],

    # FAMILY LAW
    'Child custody': ['best interest', 'guardianship', 'Hague convention', 'parental alienation', 'access rights', 'child abduction', 'welfare principle', 'shared custody', 'visitation rights', 'custody jurisdiction'],
    'Muslim marriage': ['mehr', 'triple talaq', 'iddat period', 'muta marriage', 'nikahnama', 'dower debt', 'maintenance cap', 'khula', 'faskh', 'mahr dispute'],

    # CONTEMPT OF COURT
    'Civil contempt': ['order violation', 'undertaking breach', 'willful disobedience', 'compliance report', 'contempt notice', 'purge contempt', 'apology tendered', 'contempt jurisdiction', 'stay violation', 'court order'],

    # ORDINARY CIVIL MATTERS
    'Specific performance': ['readiness willingness', 'contract enforcement', 'alternative relief', 'section 20', 'discretionary relief', 'mutual consent', 'time essence', 'part performance', 'contract validity', 'title clearance'],
    'Electricity disputes': ['tariff order', 'cross subsidy', 'open access', 'wheeling charges', 'electricity theft', 'connection denial', 'meter tampering', 'regulatory asset', 'power purchase', 'renewable obligation'],

    # BENCH STRENGTH
    'Constitution Bench': ['basic structure doctrine', 'article 145(3) reference','constitutional amendment validity','federalism dispute resolution','judicial independence challenge','land acquisition constitutional validity','reservation constitutional limit','presidential reference article 143','kesavananda bharati ratio','constitutional morality violation'],
    # APPOINTMENTS
    'Judicial appointments': ['collegium system', 'memorandum procedure', 'seniority norm', 'elevation criteria', 'judicial independence', 'appointment delay', 'zone consideration', 'merit vs seniority', 'parent high court', 'additional judge'],

    # PERSONAL LAW
    'Inheritance disputes': ['coparcenary rights', 'ancestral property', 'testamentary succession', 'hindu succession', 'muslim inheritance', 'christian succession', 'parsi succession', 'legal heir', 'succession certificate', 'joint family'],

    # RELIGIOUS ENDOWMENTS
    'Temple management': ['mathadhipati', 'shebait rights', 'idol juristic', 'religious endowment', 'trustee removal', 'dharmakarta', 'archaka appointment', 'temple funds', 'secular management', 'religious practice'],

    # MERCANTILE LAWS
    'Banking disputes': ['sarfaesi', 'npa classification', 'debt recovery', 'wilful defaulter', 'guarantor liability', 'account fraud', 'cheque bounce', 'bank guarantee', 'loan recall', 'priority sector'],

    # JUDICIARY MATTERS
    'Judicial service': ['all india service', 'judicial academy', 'court infrastructure', 'judicial accountability', 'case management', 'judicial independence', 'court automation', 'judicial ethics', 'work allocation', 'judicial discipline'],

    # MEDICAL EDUCATION
    'NEET disputes': ['neet ug', 'all india quota', 'state reservation', 'marks normalization', 'omr challenge', 'percentile system', 'eligibility criteria', 'exam postponement', 'counselling process', 'nri quota'],

    # GOVERNMENT CONTRACTS
    'Tender disputes': ['bid eligibility', 'technical score', 'arbitrary rejection', 'pre-qualification', 'tender condition', 'commercial bid', 'L1 rejection', 'blacklisting order', 'bid security', 'tender cancellation'],

    # MINES & MINERALS
    'Mining leases': ['renewal rejection', 'mining plan', 'environment clearance', 'royalty payment', 'stamp duty', 'mining auction', 'captive mining', 'mineral rights', 'quarry lease', 'district mineral'],

    # CONSUMER PROTECTION
    'Deficiency of service': ['deficiency defined', 'medical negligence', 'housing delay', 'banking service', 'insurance claim', 'telecom service', 'professional service', 'deficiency compensation', 'unfair contract', 'product liability'],

    # ARMED FORCES
    'Military law': ['court martial', 'summary trial', 'command responsibility', 'military pension', 'service record', 'promotion policy', 'resettlement benefits', 'disability pension', 'martial law', 'army act'],

    # CONSTITUTIONAL MATTERS
    'Federal disputes': ['inter-state water', 'border dispute', 'language rights', 'state autonomy', 'governor powers', 'president rule', 'legislative competence', 'concurrent list', 'residuary powers', 'central ordinance']
}

def classify_legal_case(text):
    category_scores = {}

    for category, keywords in category_keywords.items():
        score = sum(text.count(kw) for kw in keywords)
        category_scores[category] = score

    max_score = max(category_scores.values())
    if max_score == 0:
        return 'Others'

    top_categories = [cat for cat, score in category_scores.items() if score == max_score]

    return top_categories[0] if len(top_categories) == 1 else ', '.join(top_categories)

df['Category_predicted'] = df['Case Content'].apply(classify_legal_case)

In [8]:
def word_count(text):
    if not isinstance(text, str) or text.strip() == '':
        return 0
    return len(text.split())

df['Word_Length'] = df['Case Content'].apply(word_count)

df.head(10)

Unnamed: 0,Case Title,Link,Case Content,Category_predicted,Word_Length
0,The State Of Tamil Nadu vs The Governor Of Tam...,https://indiankanoon.org/docfragment/82729634/...,take note read judgment using ourvirtual legal...,Federal disputes,37046
1,Independent Sugar Corporation Limited vs Giris...,https://indiankanoon.org/docfragment/117249167...,take note read judgment using ourvirtual legal...,Payment of Gratuity Act,14272
2,Piramal Capital And Housing Finance ... vs 63 ...,https://indiankanoon.org/docfragment/190999006...,take note read judgment using ourvirtual legal...,Enhancement of rent,11595
3,In Re Recruitment Of Visually Impaired ... vs ...,https://indiankanoon.org/docfragment/158218833...,take note read judgment using ourvirtual legal...,Arbitration challenges,9138
4,Union Of India vs Future Gaming Solutions P.Lt...,https://indiankanoon.org/docfragment/117744026...,take note read judgment using ourvirtual legal...,Enhancement of rent,8706
5,M/S Shri Sendhuragro And Oil Industries vs Kot...,https://indiankanoon.org/docfragment/93179288/...,take note read judgment using ourvirtual legal...,Payment of Gratuity Act,7006
6,Radhika Agarwal vs Union Of India on 27 Februa...,https://indiankanoon.org/docfragment/144838233...,take note read judgment using ourvirtual legal...,Contract Labour,7860
7,Yerikala Sunkalamma vs The State Of Andhra Pra...,https://indiankanoon.org/docfragment/184866874...,take note read judgment using ourvirtual legal...,Trade Unions Act,9069
8,Imran Pratapgadhi vs State Of Gujarat on 28 Ma...,https://indiankanoon.org/docfragment/118192392...,take note read judgment using ourvirtual legal...,Payment of Gratuity Act,4103
9,Vinubhai Mohanlal Dobaria vs Chief Commissione...,https://indiankanoon.org/docfragment/128219437...,take note read judgment using ourvirtual legal...,Prevention of Corruption Act,4707


In [11]:
df['Case Title'][8]

'Imran Pratapgadhi vs State Of Gujarat on 28 March, 2025'

In [10]:
df['Category_predicted'].value_counts()

Category_predicted
Payment of Gratuity Act                         124
Dismissal                                        70
Prevention of Corruption Act                     61
Arbitration challenges                           40
Enhancement of rent                              33
                                               ... 
Retrenchment, Income Tax Reference                1
Dismissal, NDPS Act                               1
Arbitration challenges, Civil contempt            1
Income Tax Reference, Arbitration challenges      1
Deficiency of service                             1
Name: count, Length: 125, dtype: int64

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = df['Case Content'].astype(str).tolist()

vectorizer = TfidfVectorizer(max_features=10000)
tfidf_matrix = vectorizer.fit_transform(corpus)

print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (710, 10000)


In [13]:
tfidf_matrix[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]], shape=(1, 10000))

In [14]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import pickle
from typing import List, Dict, Any

model = SentenceTransformer('all-MiniLM-L6-v2')
# model = SentenceTransformer('nlpaueb/legal-bert-base-uncased')  # Alternative for legal texts

def chunk_text(text: str, max_words: int = 400, overlap: int = 50) -> List[str]:
    
    if not text or text.strip() == "":
        return [""]
    
    words = text.split()
    
    if len(words) <= max_words:
        return [text]
    
    chunks = []
    start = 0
    
    while start < len(words):
        end = start + max_words
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start += (max_words - overlap)
    
    return chunks


def generate_chunk_embeddings(text: str, model) -> np.ndarray:
    
    chunks = chunk_text(text)
    
    if len(chunks) == 1:
        return model.encode([text])[0]
    
    else:
        chunk_embeddings = model.encode(chunks)
        return np.mean(chunk_embeddings, axis=0)

In [16]:
embeddings_list = []
failed_indices = []

for i, case_text in enumerate(df['Case Content']):
    try:
        if i % 100 == 0:
            print(f"Processing case {i}/{len(df)}")
        
        embedding = generate_chunk_embeddings(str(case_text), model)
        embeddings_list.append(embedding)

    except Exception as e:
        print(f"Error processing case {i}: {e}")
        embeddings_list.append(np.zeros(model.get_sentence_embedding_dimension()))
        failed_indices.append(i)

embeddings = np.array(embeddings_list).astype('float32')

print(f"Generated embeddings for {len(embeddings)} cases")
print(f"Failed to process {len(failed_indices)} cases")

Processing case 0/710
Processing case 100/710
Processing case 200/710
Processing case 300/710
Processing case 400/710
Processing case 500/710
Processing case 600/710
Processing case 700/710
Generated embeddings for 710 cases
Failed to process 0 cases


In [19]:
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

faiss.write_index(index, 'legal_cases_index.faiss')

metadata = []
for i, row in df.iterrows():
    case_text = str(row['Case Content'])
    word_count = len(case_text.split())
    chunks = chunk_text(case_text)
    
    metadata.append({
        'Case Title': row['Case Title'],
        'Link': row['Link'],
        'Category_predicted': row['Category_predicted'],
        'Word_Count': word_count,
        'Chunk_Count': len(chunks),
        'Was_Chunked': len(chunks) > 1
    })

with open('case_metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

np.save('legal_case_embeddings.npy', embeddings)

print("Embeddings generated and FAISS index created successfully!")
print(f"Index contains {index.ntotal} vectors")
print(f"Embedding dimension: {dimension}")

Embeddings generated and FAISS index created successfully!
Index contains 710 vectors
Embedding dimension: 384


In [None]:
def search_similar_cases(query, model, index, metadata, k=10, use_chunking=True):

    if use_chunking and len(query.split()) > 400:
        query_chunks = chunk_text(query)
        query_embeddings = model.encode(query_chunks)
        query_embedding = np.mean(query_embeddings, axis=0)
    else:
        query_embedding = model.encode([query])[0]
    
    query_embedding = np.array([query_embedding]).astype('float32')
    
    distances, indices = index.search(query_embedding, k)
    
    results = []
    for i, idx in enumerate(indices[0]):
        similarity = 1 / (1 + distances[0][i])
        
        results.append({
            'rank': i + 1,
            'case_title': metadata[idx]['Case Title'],
            'link': metadata[idx]['Link'],
            'category': metadata[idx]['Category_predicted'],
            'word_count': metadata[idx]['Word_Count'],
            'was_chunked': metadata[idx]['Was_Chunked'],
            'similarity_score': float(similarity)
        })
    
    return results


# Example usage
similar_cases = search_similar_cases(
    "dismissal from service misconduct grounds for termination employment law", 
    model, 
    index, 
    metadata
)

In [21]:
similar_cases

[{'rank': 1,
  'case_title': 'Deepali Gundu Surwase vs Kranti Junior Adhyapak & Ors on 12 August, 2013',
  'link': 'https://indiankanoon.org/docfragment/81481647/?formInput=doctypes%3A%20supremecourt%20year%3A%202013',
  'category': 'Dismissal',
  'word_count': 3368,
  'was_chunked': True,
  'similarity_score': 0.5239004492759705},
 {'rank': 2,
  'case_title': 'Raghubir Singh vs Gen.Manager,Haryana Roadways,Hissar on 3 September, 2014',
  'link': 'https://indiankanoon.org/docfragment/96728565/?formInput=doctypes%3A%20supremecourt%20year%3A%202014',
  'category': 'Dismissal',
  'word_count': 2980,
  'was_chunked': True,
  'similarity_score': 0.5134267807006836},
 {'rank': 3,
  'case_title': 'Avtar Singh vs Union Of India & Ors on 21 July, 2016',
  'link': 'https://indiankanoon.org/docfragment/175903641/?formInput=doctypes%3A%20supremecourt%20year%3A%202016',
  'category': 'Dismissal',
  'word_count': 3661,
  'was_chunked': True,
  'similarity_score': 0.5068796873092651},
 {'rank': 4,
  