In [None]:
from IPython.display import HTML, display

def set_css():
  """Emit a <style> element that sets ``white-space: pre-wrap`` on <pre> tags."""
  wrap_style = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_style)


# Hook the styling into IPython's 'pre_run_cell' event.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
import pandas as pd
import re
from google.colab import drive

# Mount Google Drive at /content/drive; every dataset path used below lives
# under "/content/drive/My Drive/".
drive.mount('/content/drive')

In [None]:
# Only these columns are loaded from the case-level CSV.
columns_to_read = [
    'DocID',
    'Case_Name',
    'Judgment_Date',
    'Author',
    'Bench',
    'CaseID/CitationID',
    'Verdict',
]

data = pd.read_csv(
    "/content/drive/My Drive/legal_data.csv",
    encoding='latin-1',
    usecols=columns_to_read,
)

# Show the first rows as the cell output.
data.head()

In [None]:
# Code to get Statutes
import math

# Regular Expressions
# Every citation pattern has the same shape:
#   <prefix> <comma-separated section numbers> <optional filler words> <statute name>
# so the patterns are assembled from shared pieces instead of being copy-pasted.
_SECTION_LIST = r'(\d+[A-Za-z]*(?:\s*,\s*\d+[A-Za-z]*)*)'
_FILLER_WORDS = r'(?:\s+\w+)*\s+'
_STATUTE_NAMES = r'(IPC|Indian Penal Code|CRPC|Criminal Procedure Code|Code of Criminal Procedure|CPC|Code of Civil Procedure|CrPC)'


def _statute_pattern(prefix):
    """Return the citation regex (as a string) for one prefix, e.g. ``[Ss]ection ``."""
    return r'\b' + prefix + _SECTION_LIST + _FILLER_WORDS + _STATUTE_NAMES


def _split_section_numbers(number_list):
    """Split a matched list such as ``'302, 34'`` into ``['302', '34']``."""
    return [part.strip() for part in number_list.split(',') if part.strip()]


pattern_s = _statute_pattern(r'[Ss]\. ')
pattern_ss = _statute_pattern(r'[Ss]s\. ')
pattern_section = _statute_pattern(r'[Ss]ection ')
# The original pattern only matched lower-case "sections", silently missing
# "Sections 302, 34 IPC"; it now accepts both cases like the other patterns.
pattern_sections = _statute_pattern(r'[Ss]ections ')
pattern_subsection = _statute_pattern(r'[Ss]ubsection ')
pattern_r = _statute_pattern(r'[Rr]\. ')
pattern_rule = _statute_pattern(r'[Rr]ule ')

# Compiled once, in the same order the original cell applied them.
statute_patterns = [
    re.compile(pattern)
    for pattern in (
        pattern_s,
        pattern_ss,
        pattern_section,
        pattern_sections,
        pattern_subsection,
        pattern_r,
        pattern_rule,
    )
]

# Lower-cased statute name (as captured by the regex) -> DataFrame column.
STATUTE_COLUMN = {
    'ipc': 'IPC',
    'indian penal code': 'IPC',
    'crpc': 'CRPC',
    'criminal procedure code': 'CRPC',
    'code of criminal procedure': 'CRPC',
    'cpc': 'CPC',
    'code of civil procedure': 'CPC',
}

# Extract section numbers and types for each row in the DataFrame
for index, row in data.iterrows():
    if math.isnan(row['DocID']):
        continue
    caseid = int(row['DocID'])
    sections_by_column = {'IPC': set(), 'CRPC': set(), 'CPC': set()}

    try:
        with open(f"/content/drive/My Drive/text_files/{caseid}.txt", 'r') as file:
            content = file.read()
    except FileNotFoundError:
        # A missing judgment text still gets (empty) statute columns below.
        print("File not found")
    else:
        extracted_sections = []
        for pattern in statute_patterns:
            for number_list, statute_name in pattern.findall(content):
                section_type = statute_name.strip().lower()
                # Store each cited section on its own so that "302, 34" can be
                # compared element-wise against another case's sections.
                for section_number in _split_section_numbers(number_list):
                    sections_by_column[STATUTE_COLUMN[section_type]].add(section_number)
                    extracted_sections.append((section_type, section_number))
        print(f"Extracted Sections for Case {caseid}: {extracted_sections}")

    # Sorted so the stored string is deterministic across runs.
    for column, sections in sections_by_column.items():
        data.at[index, column] = str(sorted(sections))


In [None]:
# Code to get Acts
import math
# Read the list of act names, one per line.
with open('/content/drive/My Drive/current_acts.txt', 'r') as file:
    # Blank lines are dropped: re.escape('') compiles to an empty pattern that
    # matches at every position and would add '' to every case's acts.
    txt_contents = [line.strip() for line in file.read().splitlines() if line.strip()]

# Compile regex patterns for efficient matching (literal, case-insensitive).
search_patterns = [re.compile(re.escape(line), re.IGNORECASE) for line in txt_contents]

for index, row in data.iterrows():
  if math.isnan(row['DocID']):
    continue
  caseid = int(row['DocID'])

  try:
    columns_read = ['Doc_ID', 'Sentence', 'Category']
    df = pd.read_csv(f"/content/drive/My Drive/Label_2RR/{caseid}.csv", encoding='latin-1', usecols=columns_read)
  except FileNotFoundError:
    # No sentence file for this case: leave its 'Acts' cell untouched.
    print("File not found")
    continue

  # Collect every distinct act mention found in any sentence of the case.
  unique_matched_parts = set()
  # dropna() skips missing sentences; str(NaN) would otherwise be searched as
  # the literal text 'nan'.
  for sentence in df['Sentence'].dropna():
    sentence = str(sentence)
    for pattern in search_patterns:
      unique_matched_parts.update(pattern.findall(sentence))

  # Sorted so the stored string is deterministic across runs.
  acts = sorted(unique_matched_parts)
  if acts:
    print(caseid)
    print(f"{acts}")

  data.at[index, 'Acts'] = str(acts)


In [None]:
# Code to Append Acts which we extract from CITES in HTML Docs

import math
import ast
from bs4 import BeautifulSoup

def _is_case_title(title):
  """Return True when a cited title looks like a case name ("A vs B") rather than an act."""
  return any(marker in title for marker in ('vs', 'v.', 'V.', 'VS'))


for index, row in data.iterrows():
  if math.isnan(row['DocID']):
    continue
  case_id = int(row['DocID'])
  print(case_id)

  try:
    with open(f"/content/drive/My Drive/input dataset/Cases/{case_id}.html", 'r') as file:
      content = file.read()
  except FileNotFoundError:
    print("File not found")
    continue

  if 'Cites' not in content:
    continue
  # Keep the text after the first 'Cites' and, when present, before the
  # following 'Citedby'.
  result = content.split('Cites', 1)[1].split('Citedby', 1)[0]

  # Parse HTML using Beautiful Soup
  soup = BeautifulSoup(result, 'html.parser')

  # Extract the link text of every <div class="cite_title">.
  titles = []
  for cite_title in soup.find_all('div', class_='cite_title'):
    anchor = cite_title.find('a')
    if anchor is None:
      # find() returns None for a div without a link; skip it instead of
      # raising AttributeError.
      continue
    titles.append(anchor.get_text(strip=True))

  # Case names are filtered out regardless of whether the row already has
  # acts (the original only filtered when it did).
  cited_acts = [title for title in titles if not _is_case_title(title)]

  # .get() tolerates the 'Acts' column not existing yet.
  existing_acts = row.get('Acts')
  act_list = [] if pd.isna(existing_acts) else ast.literal_eval(existing_acts)
  for title in cited_acts:
    if title not in act_list:
      act_list.append(title)

  data.at[index, 'Acts'] = str(act_list)

In [None]:
import math

# String form of every non-missing DocID, in row order.
Doc_ids = [
    str(int(doc_id))
    for doc_id in data['DocID']
    if not math.isnan(doc_id)
]

In [None]:
# Code to get Precedent and their DocIDs

from bs4 import BeautifulSoup
import math

data['PrecedentCaseID'] = None
data['Precedent'] = None


def _judgment_html(content):
  """Return the part of a case page to scan for precedent links.

  Prefers the text after 'BENCH' and then 'JUDGMENT', else after 'Bench',
  else after 'JUDGMENT'. When an expected marker is absent the widest
  available text is used instead of raising IndexError.
  """
  if 'BENCH' in content:
    after_bench = content.split('BENCH', 1)[1]
    _, found, after_judgment = after_bench.partition('JUDGMENT')
    return (after_judgment if found else after_bench).strip()
  if 'Bench' in content:
    return content.split('Bench', 1)[1].strip()
  _, found, after_judgment = content.partition('JUDGMENT')
  return (after_judgment if found else content).strip()


# Set membership is O(1); Doc_ids itself is a list of DocID strings.
known_doc_ids = set(Doc_ids)
# Rows for the 2nd, 3rd, ... precedent of a case. They are collected here and
# concatenated once after the loop: DataFrame.append was removed in pandas 2.0
# and re-copied the whole frame on every call.
extra_rows = []

for index, row in data.iterrows():
  if math.isnan(row['DocID']):
    continue
  case_id = int(row['DocID'])
  print(case_id)

  try:
    with open(f"/content/drive/My Drive/input dataset/Cases/{case_id}.html", 'r') as file:
      content = file.read()
  except FileNotFoundError:
    print("File not found")
    continue

  # Parse the HTML content using BeautifulSoup
  soup = BeautifulSoup(_judgment_html(content), 'html.parser')

  results = []
  unique_links = set()
  for tag in soup.find_all('a', {'href': True}):
    link_text = tag.text.strip()
    # Only links whose text looks like a case name ("A vs B" / "A v. B").
    if 'vs' not in link_text and 'v.' not in link_text:
      continue
    href_parts = tag['href'].split('/')
    if len(href_parts) < 2:
      # An href without '/' has no second-to-last segment to read.
      continue
    linked_doc_id = href_parts[-2]
    if linked_doc_id in known_doc_ids and linked_doc_id not in unique_links:
      unique_links.add(linked_doc_id)
      results.append((link_text, linked_doc_id))

  # The first precedent is written onto the case's own row; each further
  # precedent gets a copy of that row.
  for position, (precedent_name, precedent_id) in enumerate(results):
    if position == 0:
      data.at[index, 'Precedent'] = precedent_name
      data.at[index, 'PrecedentCaseID'] = precedent_id
    else:
      new_row = row.copy()
      new_row['Precedent'] = precedent_name
      new_row['PrecedentCaseID'] = precedent_id
      extra_rows.append(new_row)

if extra_rows:
  data = pd.concat([data, pd.DataFrame(extra_rows)], ignore_index=True)

In [None]:
import math
# Map each integer DocID to a row label of `data`. When a DocID occurs on
# several rows, the last one encountered wins.
docid_index_map = {}

for row_label, doc_id in data['DocID'].items():
    if not math.isnan(doc_id):
        docid_index_map[int(doc_id)] = row_label

In [None]:
from nltk import ngrams
from nltk.tokenize import word_tokenize
import nltk
# Download NLTK's 'punkt' resource (presumably what word_tokenize below needs - confirm).
nltk.download('punkt')

def get_character_ngrams(text, n):
    """Return the set of character n-grams found inside each token of ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``; n-grams are
    taken within each token, never across token boundaries.
    """
    return {
        "".join(gram)
        for word in word_tokenize(text.lower())
        for gram in ngrams(word, n)
    }

def jaccard_similarity(str1, str2, n=2):
    """Return the Jaccard similarity of the character n-gram sets of two strings.

    Args:
        str1: First string.
        str2: Second string.
        n: N-gram length passed to ``get_character_ngrams`` (default 2).

    Returns:
        ``|A & B| / |A | B|`` as a float in [0, 1]. When neither string yields
        any n-gram (e.g. both are empty or only contain tokens shorter than
        ``n``) the result is 0.0 instead of a ZeroDivisionError, so such strings
        are never reported as similar.
    """
    ngrams1 = get_character_ngrams(str1, n)
    ngrams2 = get_character_ngrams(str2, n)
    union_size = len(ngrams1 | ngrams2)
    if union_size == 0:
        return 0.0
    return len(ngrams1 & ngrams2) / union_size

In [None]:
# Code to get common acts using HTML files
import math
import ast

data['CommonActs'] = None

# Minimum character-bigram Jaccard similarity for two act names to be treated
# as the same act.
ACT_SIMILARITY_THRESHOLD = 0.65

for index, row in data.iterrows():
  if math.isnan(row['DocID']):
    continue
  common_acts = set()

  precedent_id = row['PrecedentCaseID']
  # pd.isna covers None, NaN and string ids alike; math.isnan raises TypeError
  # on the string ids written by the precedent cell. A row without its own
  # acts cannot share any either.
  if pd.isna(precedent_id) or pd.isna(row['Acts']):
    data.at[index, 'CommonActs'] = str(list(common_acts))
    continue

  print(int(row['DocID']))
  current_row_acts = ast.literal_eval(row['Acts'])
  precedent_row_acts_data = data.at[docid_index_map[int(precedent_id)], 'Acts']
  if pd.isna(precedent_row_acts_data):
    print(row['DocID'], " iska skip hogya as nan acts in precedent")
    data.at[index, 'CommonActs'] = str(list(common_acts))
    continue
  precedent_row_acts = ast.literal_eval(precedent_row_acts_data)

  # Compare every act of this case with every act of the precedent and keep
  # the longer spelling of each sufficiently similar pair.
  for current_act in current_row_acts:
    for precedent_act in precedent_row_acts:
      if jaccard_similarity(current_act, precedent_act) >= ACT_SIMILARITY_THRESHOLD:
        if len(current_act) > len(precedent_act):
          common_acts.add(current_act)
        else:
          common_acts.add(precedent_act)

  # Sorted so the stored string is deterministic across runs.
  data.at[index, 'CommonActs'] = str(sorted(common_acts))

In [None]:
# Code to create new columns 'CommonIPC', 'CommonCPC', and 'CommonCRPC'
import math

data['CommonIPC'] = None
data['CommonCRPC'] = None
data['CommonCPC'] = None

# (column holding a case's sections, column receiving the shared sections)
SECTION_COLUMNS = (
    ('IPC', 'CommonIPC'),
    ('CRPC', 'CommonCRPC'),
    ('CPC', 'CommonCPC'),
)


def _parse_section_cell(cell):
  """Turn a stored cell such as "['302, 34', '304']" into {'302', '34', '304'}.

  Missing cells (None/NaN) and "[]" give an empty set. Items are stripped and
  empty items dropped, so two empty lists no longer "share" the element ''.
  """
  if not isinstance(cell, str):
    if cell is None or pd.isna(cell):
      return set()
    cell = str(cell)
  cleaned = cell.replace('[', '').replace(']', '').replace('\'', '')
  return {part.strip() for part in cleaned.split(',') if part.strip()}


for index, row in data.iterrows():
  if math.isnan(row['DocID']):
    continue

  precedent_id = row['PrecedentCaseID']
  # pd.isna covers None, NaN and string ids; math.isnan raises TypeError on
  # None and on the string ids written by the precedent cell.
  if pd.isna(precedent_id):
    for _, common_column in SECTION_COLUMNS:
      data.at[index, common_column] = str([])
    continue

  print(int(row['DocID']))
  precedent_index = docid_index_map[int(precedent_id)]

  # Find common values for IPC, CRPC, and CPC
  for source_column, common_column in SECTION_COLUMNS:
    precedent_cell = data.at[precedent_index, source_column]
    if not isinstance(precedent_cell, str) and pd.isna(precedent_cell):
      # The precedent has no data for this statute; the result stays empty.
      print(row['DocID'])
    shared = _parse_section_cell(data.at[index, source_column]) & _parse_section_cell(precedent_cell)
    # Always stored as the string form of a sorted list, like the other
    # list-valued columns, instead of a mix of str and list objects.
    data.at[index, common_column] = str(sorted(shared))
