In [1]:
import tiktoken
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from transformers import pipeline

from summary import *


# Register tqdm's pandas integration; this is what makes `.progress_apply`
# (used on the judgment column further down) available on pandas objects.
tqdm.pandas()

In [3]:
# Tokenizer used further down to count the tokens of each judgment text.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Summarization checkpoints to run, each preceded by the author's note on
# the quality of its output.
model_ids = [
    # bad with punctuation
    "sshleifer/distilbart-cnn-12-6",
    # generally good, but breaks off at the end (disabled)
    # "marianna13/flan-t5-base-summarization",
    # not bad, lack of punctuation
    "Falconsai/text_summarization",
    # a lot of misinterpreting, but well structured and concise
    "slauw87/bart_summarisation",
]

In [2]:
# Data directory shared with the parsing project.
path = Path('../parsing_easylaw_ai/data')

# Output table that the processing loop appends to.
processed_cases_path = path.joinpath('cases_processed.csv')

# Lazy reader over the raw cases, yielding 30 rows per chunk. It is a one-shot
# iterator: every chunk taken from it is gone for later consumers.
cases = pd.read_csv(
    path.joinpath('cases_base.csv'),
    chunksize=30,
    sep='\t',
    encoding='utf-8',
)

In [None]:
# Resume support: load the IDs of the cases an earlier run already wrote to
# `processed_cases_path`. The name is always bound (None when there is no
# output file yet) so the loop below can test it directly.
processed_cases_no = None
if processed_cases_path.exists():
    processed_cases_no = pd.read_csv(
        processed_cases_path,
        sep='\t',
        encoding='utf-8',
        usecols=['easylaw_case_no']
    )


for df in cases:
    # Normalise headers: lower-case, spaces -> underscores, no leading/trailing dots.
    df.columns = [col.lower().replace(' ', '_').strip('.') for col in df.columns]

    # FIX: the original condition was `processed_cases_path in globals()`, which
    # looks up a Path object among the string keys of globals() and is therefore
    # always False -- already-processed cases were summarised and appended again
    # on every run.
    if processed_cases_no is not None:
        already_done = df['easylaw_case_no'].isin(processed_cases_no['easylaw_case_no'])
        # .copy() gives an independent frame, so the column assignments below do
        # not write into a slice of the chunk.
        df = df[~already_done].copy()

        if df.empty:
            continue

    # Token count per judgment; computed after filtering so skipped rows are
    # not tokenised. Column order in the output is unchanged.
    df['tokens_of_judgment'] = df['judgment'].apply(lambda x: len(enc.encode(x)))

    for model_id in model_ids:
        # The pipeline is built per chunk and released with `del` below before
        # the next model's pipeline is created.
        summarizer = pipeline("summarization", model=model_id)

        # NOTE(review): the column suffix is the part of the model id *before*
        # the slash (the owner), not the model name; two models from the same
        # owner would overwrite each other. Kept unchanged so column names match
        # output written by previous runs.
        # NOTE(review): chunk_size=512-10 presumably leaves headroom under a
        # 512-token model limit -- confirm against `recursion_summarizing`.
        df[f'summarized_{model_id.split("/")[0]}'] = df['judgment']\
            .progress_apply(lambda x: recursion_summarizing(
                case=x,
                model=summarizer,
                chunk_size=512-10,
                min_length=30,
                max_length=150
                ))

        del summarizer

    # Append this chunk; the header row is written only when the file is created.
    df.to_csv(
        processed_cases_path,
        mode='a',
        sep='\t',
        index=False,
        header=not processed_cases_path.exists(),
        encoding='utf-8'
    )

In [8]:
# Scratch experiment: summarise a single judgment with google/pegasus-xsum.

# FIX: read an independent 30-row sample instead of `next(cases)`. `cases` is
# the one-shot chunk reader consumed by the processing loop: taking a chunk
# from it here either removes those rows from that loop (if this cell runs
# first) or raises StopIteration (if it runs after the loop has finished).
df = pd.read_csv(path / 'cases_base.csv', sep='\t', encoding='utf-8', nrows=30)

df.columns = [col.lower().replace(' ', '_').strip('.') for col in df.columns]
df['tokens_of_judgment'] = df['judgment'].apply(lambda x: len(enc.encode(x)))


# https://huggingface.co/docs/transformers/model_doc/pegasus
from transformers import AutoTokenizer, PegasusForConditionalGeneration

model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")

# Row label 2 of the sample, i.e. the third row of the file.
ARTICLE_TO_SUMMARIZE = df.loc[2, 'judgment']
# Inputs longer than 1024 tokens are truncated rather than rejected.
inputs = tokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, return_tensors="pt", truncation=True)

# Generate Summary
summary_ids = model.generate(inputs["input_ids"])
# Decode the first (only) generated sequence back to text.
output = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

: 