In [69]:
import re

def split_into_sections(text):
  """
  Split numbered text into a list of sections.

  The text is scanned line by line and blank lines are dropped. A line that
  starts with "<number>. " opens a new section when its number is greater
  than the number of the section currently being built. A numbered line whose
  number is not greater is folded into the current section as a sub-section:
  everything up to the first space is removed and the remainder is appended
  on its own line after one leading space. Any other line is appended to the
  current section on its own line. Lines that appear before the first
  numbered line form a leading preamble section.

  Args:
      text: The input text as a string.

  Returns:
      A list of strings, one per section, in document order. Lines inside a
      section are separated by a newline character. Empty input yields an
      empty list.
  """
  # re.match already anchors at the start of the line, so no MULTILINE flag
  # or "^" is needed; compile once instead of on every line.
  heading = re.compile(r"\d+\. +")

  sections = []
  current_lines = []  # lines of the section currently being built
  current_section_number = 0

  for line in text.splitlines():
    if not line.strip():
      continue  # blank lines never become part of a section

    match = heading.match(line)
    if match is None:
      # Preamble text and section bodies are accumulated the same way, so
      # consecutive preamble lines stay separated instead of being glued
      # together without any whitespace.
      current_lines.append(line)
      continue

    new_section_number = int(match.group(0).split(".")[0])
    if new_section_number > current_section_number:
      # A higher number closes the section in progress and starts a new one.
      if current_lines:
        sections.append("\n".join(current_lines))
      current_lines = [line]
      current_section_number = new_section_number
    else:
      # Number did not increase: treat as a sub-section of the current
      # section and drop its own numbering.
      current_lines.append(f" {line.split(' ', 1)[1]}")

  if current_lines:
    sections.append("\n".join(current_lines))

  return sections

# Example usage

# NOTE(review): `text` is not defined in this cell; it must already exist in
# the kernel when this runs - confirm the intended cell order.
sections = split_into_sections(text)

# Print all sections
for section in sections:
  print(section)


2.  The present appeal is directed against the judgment and order
dated 24th April, 2019 passed by the National Consumer Disputes
Redressal Commission, New Delhi (hereinafter referred to as the
NCDRC) in Revision Petition No. 897 of 2018, whereby the NCDRC
while allowing the said Revision Petition, has set aside the order
passed by the State Commission and has confirmed the order
passed by the District Forum.
Dr. 3.  The short facts giving rise to the present appeal are that Mr.
Pradeep Kumar, the husband of the respondent herein (original
complainant) had taken/purchased a life insurance policy under the
Jeevan Suraksha Yojana on 14.04.2021 from the appellant-Life
Insurance Corporation, under which a sum of Rs. 3,75,000/- was
assured by the corporation, and in case of death by accident an
additional sum of Rs. 3,75,000/- was also assured. The insurance
premium of the said policy was to be paid six monthly. The next
premium was due to be paid by the said insured Pradeep Kumar on
14th O

In [1]:
from pathlib import Path
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
def get_data_filepath(dataset, phase, type=None, i=None):
    """
    Build the path of a dataset split inside the current working directory.

    Args:
        dataset: Dataset name; first component of the file name.
        phase: Split name (callers in this notebook pass 'train' and 'val').
        type: Currently unused. Optional so that the two-argument calls made
            by TrainDataset and ValDataset do not raise TypeError.
        i: Currently unused.

    Returns:
        The string '<cwd>/<dataset>,<phase>'.
    """
    # NOTE(review): the name is built with a comma, whereas the earlier
    # implementation kept below used '.'; confirm which matches the layout on disk.
    #filename = f'{dataset}.{phase}'
    #return DATASETS_DIR / dataset / filename
    current_dir = os.getcwd()
    file_name = f'{dataset},{phase}'
    return f'{current_dir}/{file_name}'

In [None]:
class TrainDataset(Dataset):
    """
    Training dataset of (judgment text, English summary) pairs, tokenized on access.

    Every sub-folder of the train directory contributes one example, read from
    'Judgment_Text.txt' (source) and 'English_Summary.txt' (target).
    """

    def __init__(self, dataset, tokenizer, max_len=256, sample_size=1):
        """
        Args:
            dataset: Dataset name forwarded to get_data_filepath.
            tokenizer: Callable invoked with a one-element list of text plus
                truncation/padding keyword arguments.
            max_len: Passed to the tokenizer as max_length.
            sample_size: Multiplier applied to the example count in __len__.
        """
        self.sample_size = sample_size
        self.max_len = max_len
        self.tokenizer = tokenizer

        print("init TrainDataset ...")
        self.dataset_filepath = get_data_filepath(dataset,'train')
        self.dataset_sub_folders = os.listdir(self.dataset_filepath)
        print("Initialized dataset done.....")

        self._load_data()

    def _load_data(self):
        """Read the source and target text of every example into memory."""
        self.inputs, self.targets = [], []
        for folder_name in self.dataset_sub_folders:
            example_dir = os.path.join(self.dataset_filepath, folder_name)
            self.inputs.append(read_lines(os.path.join(example_dir, 'Judgment_Text.txt')))
            self.targets.append(read_lines(os.path.join(example_dir, 'English_Summary.txt')))

    def __len__(self):
        """Number of examples scaled by sample_size, truncated to an int."""
        return int(self.sample_size * len(self.inputs))

    def _encode(self, text):
        """Tokenize one text with truncation and padding to max_len."""
        return self.tokenizer(
            [text],
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors="pt"
        )

    def __getitem__(self, index):
        """Return the encoded ids/masks and the raw texts of example `index`."""
        source = self.inputs[index]
        target = self.targets[index]

        encoded_source = self._encode(source)
        encoded_target = self._encode(target)

        # squeeze() is applied to each returned tensor, as the tokenizer is
        # called with a one-element batch.
        return {
            "source_ids": encoded_source["input_ids"].squeeze(),
            "source_mask": encoded_source["attention_mask"].squeeze(),
            "target_ids": encoded_target["input_ids"].squeeze(),
            "target_mask": encoded_target["attention_mask"].squeeze(),
            'sources': source,
            'targets': [target],
            'source': source,
            'target': target,
        }
    


In [None]:
def read_lines(filepath):
    """Return whatever yield_lines returns for `filepath` (thin delegating alias)."""
    contents = yield_lines(filepath)
    return contents


def yield_lines(filepath):
    """
    Read a UTF-8 text file and return its entire content as a single string.

    Despite the name, nothing is yielded and the text is not split into lines.
    """
    return Path(filepath).read_text(encoding='utf-8')

In [None]:
class ValDataset(Dataset):
    """
    Validation dataset that returns raw (untokenized) source and summary text.

    Every sub-folder of the val directory contributes one example, read from
    'Judgment_Text.txt' (source) and 'English_Summary.txt' (target).
    """

    def __init__(self, dataset, tokenizer, max_len=256, sample_size=1):
        """
        Args:
            dataset: Dataset name forwarded to get_data_filepath.
            tokenizer: Stored on the instance; not used by __getitem__.
            max_len: Stored on the instance; not used by __getitem__.
            sample_size: Multiplier applied to the example count in __len__.
        """
        self.sample_size = sample_size
        self.max_len = max_len
        self.tokenizer = tokenizer

        self.dataset_filepath = get_data_filepath(dataset,'val')
        self.dataset_sub_folders = os.listdir(self.dataset_filepath)
        print(self.dataset_filepath)

        self._build()

    def __len__(self):
        """Number of examples scaled by sample_size, truncated to an int."""
        return int(self.sample_size * len(self.inputs))

    def __getitem__(self, index):
        """Return the raw source text and its reference summary for `index`."""
        item = {"source": self.inputs[index]}
        item["targets"] = self.targets[index]
        return item

    def _build(self):
        """Read the source and target text of every example into memory."""
        self.inputs, self.targets = [], []
        for folder_name in self.dataset_sub_folders:
            example_dir = os.path.join(self.dataset_filepath, folder_name)
            self.inputs.append(read_lines(os.path.join(example_dir, 'Judgment_Text.txt')))
            self.targets.append(read_lines(os.path.join(example_dir, 'English_Summary.txt')))

In [None]:
def evaluate_on_WIKIDOC(phase, features_kwargs=None,  model_dirname = None):
    """
    Evaluate a model's outputs on WIKI_DOC for one phase, reusing a cached score log.

    If '<EXP_DIR>/<model_dirname>/outputs/score_<dataset>_<phase>.log.txt'
    exists and count_line does not report 0 for it, its content is printed and
    nothing is recomputed. Otherwise predictions are produced with
    simplify_file (skipped when a prediction file with the same line count as
    the input already exists) and scored with evaluate_system_output inside a
    log_stdout(output_score_filepath) context (presumably redirecting the
    printed scores into that file - confirm against log_stdout).

    Args:
        phase: Dataset split name passed to get_data_filepath.
        features_kwargs: Forwarded unchanged to simplify_file.
        model_dirname: Experiment directory name, joined onto EXP_DIR and also
            forwarded to simplify_file.

    Returns:
        scores['sari'] after a fresh evaluation; None when the cached log is
        reused.
    """
    dataset = WIKI_DOC
    output_dir = EXP_DIR / model_dirname / 'outputs'
    output_dir.mkdir(parents = True, exist_ok = True)
    output_score_filepath = output_dir / f'score_{dataset}_{phase}.log.txt'

    # Cached path first: nothing below is needed when a non-empty log exists.
    if output_score_filepath.exists() and count_line(output_score_filepath) != 0:
        print("Already exists: ", output_score_filepath)
        print("".join(read_lines(output_score_filepath)))
        return None

    start_time = time.time()
    # Looked up once, and only when an evaluation actually has to run.
    complex_filepath = get_data_filepath(dataset, phase, 'complex')
    pred_filepath = output_dir / f'{complex_filepath.stem}.txt'
    ref_filepaths = get_data_filepath(dataset, phase, 'simple')

    if pred_filepath.exists() and count_line(pred_filepath)==count_line(complex_filepath):
        print("File is already processed.")
    else:
        simplify_file(complex_filepath, pred_filepath, features_kwargs, model_dirname)

    print("Evaluate: ", pred_filepath)

    with log_stdout(output_score_filepath):
        scores  = evaluate_system_output(test_set='custom',
                                         sys_sents_path=str(pred_filepath),
                                         orig_sents_path=str(complex_filepath),
                                         refs_sents_paths=str(ref_filepaths))

        print("SARI: {:.2f}\t D-SARI: {:.2f} \t BLEU: {:.2f} \t FKGL: {:.2f} ".format(scores['sari'], scores['D-sari'], scores['bleu'], scores['fkgl']))

        print("Execution time: --- %s seconds ---" % (time.time() - start_time))
        return scores['sari']

In [1]:
from Bart_baseline_finetuned import BartBaseLineFineTuned
from contextlib import contextmanager
# import json
# from preprocessor import Preprocessor
import torch
# from transformers import T5ForConditionalGeneration, T5TokenizerFast
from preprocessor import get_data_filepath, EXP_DIR,  REPO_DIR, WIKI_DOC, D_WIKI, MILDSUM
from preprocessor import write_lines, yield_lines, count_line, read_lines, generate_hash
from easse.sari import corpus_sari
import time
from utils.D_SARI import D_SARIsent

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /data/ai22mtech11004/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /data/ai22mtech11004/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Experiment directory (joined onto EXP_DIR) and checkpoint file name used
# when the model is loaded in the next cell.
model_dirname = "exp_1713339227507721"
checkpoint_path = 'checkpoint-epoch=6.ckpt'

In [3]:
import os

# A `!export ...` line runs in a throw-away subshell and never reaches this
# kernel (and the variable name was misspelled). Set the variable in-process,
# before the first CUDA query below, so the GPU selection can take effect.
# NOTE(review): this exposes only physical GPU index 1; adjust on hosts with a
# different GPU layout.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned wrapper from its checkpoint, then expose the wrapped
# model and tokenizer as notebook-level names used by later cells.
Model = BartBaseLineFineTuned.load_from_checkpoint(EXP_DIR / model_dirname / checkpoint_path).to(device)
model = Model.model.to(device)
tokenizer = Model.tokenizer

In [4]:
# Display the tokenizer's max_len_single_sentence; the chunking cell further
# down derives its per-chunk token budget from this value.
tokenizer.max_len_single_sentence

1022

In [5]:
def generate_single(sentence, preprocessor = None):
    '''
    Generate one output text for `sentence` with the notebook-level model.

    Uses the module-level `tokenizer`, `model` and `device`. The input is
    truncated/padded to 512 tokens and passed through as-is (no task prefix).
    Decoding samples (do_sample=True) with 2 beams, top_k=70 and top_p=0.95,
    and a single sequence is returned. `preprocessor` is accepted but unused.
    '''
    encoded = tokenizer(
        sentence,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )

    generation_options = dict(
        do_sample=True,
        max_length=577,
        num_beams=2,
        top_k=70,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=1,
    )
    generated = model.generate(
        input_ids=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        **generation_options,
    )

    # Only one sequence is requested, so decode the first (and only) result.
    first_sequence = generated[0].tolist()
    return tokenizer.decode(first_sequence, skip_special_tokens=True, clean_up_tokenization_spaces=True)


In [63]:
# Read one sample judgment. The encoding is given explicitly (UTF-8, as in
# yield_lines) instead of relying on the platform default.
with open("./data/MILDSum/test/2793/Judgment_Text.txt", encoding="utf-8") as f:
    text = f.read()

In [65]:
import re

def split_into_sections(text):
  """
  Split numbered text into a list of sections.

  The text is scanned line by line and blank lines are dropped. A line that
  starts with "<number>. " opens a new section when its number is greater
  than the number of the section currently being built. A numbered line whose
  number is not greater is folded into the current section as a sub-section:
  everything up to the first space is removed and the remainder is appended
  on its own line after one leading space. Any other line is appended to the
  current section on its own line. Lines that appear before the first
  numbered line form a leading preamble section.

  Args:
      text: The input text as a string.

  Returns:
      A list of strings, one per section, in document order. Lines inside a
      section are separated by a newline character. Empty input yields an
      empty list.
  """
  # re.match already anchors at the start of the line, so no MULTILINE flag
  # or "^" is needed; compile once instead of on every line.
  heading = re.compile(r"\d+\. +")

  sections = []
  current_lines = []  # lines of the section currently being built
  current_section_number = 0

  for line in text.splitlines():
    if not line.strip():
      continue  # blank lines never become part of a section

    match = heading.match(line)
    if match is None:
      # Preamble text and section bodies are accumulated the same way, so
      # consecutive preamble lines stay separated instead of being glued
      # together without any whitespace.
      current_lines.append(line)
      continue

    new_section_number = int(match.group(0).split(".")[0])
    if new_section_number > current_section_number:
      # A higher number closes the section in progress and starts a new one.
      if current_lines:
        sections.append("\n".join(current_lines))
      current_lines = [line]
      current_section_number = new_section_number
    else:
      # Number did not increase: treat as a sub-section of the current
      # section and drop its own numbering.
      current_lines.append(f" {line.split(' ', 1)[1]}")

  if current_lines:
    sections.append("\n".join(current_lines))

  return sections

# Example usage

# Split the judgment text held in `text` (read in the preceding cell).
sections = split_into_sections(text)

# Print all sections
for section in sections:
  print(section)


2.  The present appeal is directed against the judgment and order
dated 24th April, 2019 passed by the National Consumer Disputes
Redressal Commission, New Delhi (hereinafter referred to as the
NCDRC) in Revision Petition No. 897 of 2018, whereby the NCDRC
while allowing the said Revision Petition, has set aside the order
passed by the State Commission and has confirmed the order
passed by the District Forum.
Dr. 3.  The short facts giving rise to the present appeal are that Mr.
Pradeep Kumar, the husband of the respondent herein (original
complainant) had taken/purchased a life insurance policy under the
Jeevan Suraksha Yojana on 14.04.2021 from the appellant-Life
Insurance Corporation, under which a sum of Rs. 3,75,000/- was
assured by the corporation, and in case of death by accident an
additional sum of Rs. 3,75,000/- was also assured. The insurance
premium of the said policy was to be paid six monthly. The next
premium was due to be paid by the said insured Pradeep Kumar on
14th O

In [66]:
# Inspect a single section (index 7) of the split judgment.
sections[7]

'18.        The endeavour of the court must always be\nto interpret the words in which the contract is\nexpressed by the parties. The court while construing\nthe terms of policy is not expected to venture into\nextra liberalism that may result in rewriting the\ncontract of substituting the terms which were not\nintended by the parties. The insured cannot claim\nanything more than what is covered by the insurance\npolicy. (General Assurance Society Ltd. v.\nInsurance Co. Ltd. v. Sony Cheriyan AIR 1999\nSC 3252 and United India Insurance Co. Ltd. v.\n  From the afore-stated legal position, it is clear that the terms of\ninsurance policy have to be strictly construed, and it is not\npermissible to rewrite the contract while interpreting the terms of\nthe Policy. In the instant case, condition no. 11 of the Policy clearly\nstipulated that the policy has to be in force when the accident takes\nplace. In the instant case, the policy had lapsed on 14.10.2011 and\nwas not in force on the date 

In [None]:
import nltk
nltk.download('punkt')
# Sentence-level split of a second judgment. The encoding is given explicitly
# (UTF-8, as in yield_lines) instead of relying on the platform default.
with open("./data/MILDSum/test/18/Judgment_Text.txt", encoding="utf-8") as f:
    sentences = nltk.tokenize.sent_tokenize(f.read())

In [67]:
# Read the judgment that is chunked below. The encoding is given explicitly
# (UTF-8, as in yield_lines) instead of relying on the platform default.
with open("./data/MILDSum/test/18/Judgment_Text.txt", encoding="utf-8") as f:
    file_content = f.read()

In [68]:
# Number of characters in the judgment text read above.
len(file_content)

16223

In [None]:
#tokenizer = already defined 
#model = already defined

In [70]:
# Rebinds `sections`: it now holds the sections of `file_content`, replacing
# any result from an earlier split.
sections = split_into_sections(file_content)

In [71]:
# Largest token count of any single section (per tokenizer.tokenize).
max([len(tokenizer.tokenize(section)) for section in sections])

659

## Create the chunks:

In [74]:
# Greedily pack consecutive sections into chunks whose summed per-section
# token counts stay within the budget (max_len_single_sentence / 1.5).
max_chunk_tokens = tokenizer.max_len_single_sentence / 1.5

length = 0   # tokens accumulated in the chunk being built
chunk = ""   # text of the chunk being built
chunks = []
for section in sections:
  # Tokenize each section once and reuse the count in both branches.
  section_length = len(tokenizer.tokenize(section))
  combined_length = section_length + length

  if combined_length <= max_chunk_tokens:
    # Section still fits: extend the current chunk.
    chunk += section + " "
    length = combined_length
  else:
    # Section does not fit: close the current chunk (if it has any content)
    # and start a new one with the overflowing section.
    if chunk:
      chunks.append(chunk.strip())
    chunk = section + " "
    length = section_length

# Flush the chunk still being built. Doing this after the loop also keeps the
# last section when that section is the one that overflowed the budget.
if chunk:
  chunks.append(chunk.strip())
len(chunks)

9

In [75]:
# Token count of each chunk according to tokenizer.tokenize.
[len(tokenizer.tokenize(c)) for c in chunks]

[625, 159, 659, 635, 598, 662, 618, 665, 102]

In [76]:
# Length of input_ids per chunk from a full tokenizer call. In the recorded
# output each value is 2 larger than the tokenize() count (presumably the
# added special tokens - confirm).
[len(tokenizer(c).input_ids) for c in chunks]

[627, 161, 661, 637, 600, 664, 620, 667, 104]

In [77]:
# Total input_ids length summed over all chunks.
sum([len(tokenizer(c).input_ids) for c in chunks])

4741

In [79]:
# input_ids length when the whole document is encoded in one call, for
# comparison with the per-chunk total above.
len(tokenizer(file_content).input_ids)

Token indices sequence length is longer than the specified maximum sequence length for this model (4766 > 1024). Running this sequence through the model will result in indexing errors


4766