In [46]:
import os
import tiktoken
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from transformers import pipeline

from summary import *


# Register tqdm's `progress_apply` on pandas objects; it is used further down
# to show a progress bar while the judgments are being summarized.
tqdm.pandas()


# Directory holding the case CSV files (relative to the working directory).
path = Path('..') / 'parsing_easylaw_ai' / 'data'

# Output file that the summarized cases are appended to.
processed_cases_path = path.joinpath('cases_processed.csv')

In [48]:
# Tokenizer matching "gpt-3.5-turbo"; used below to count the tokens of each judgment.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Model ids passed to the summarization pipeline, one run per entry.
# The note above each entry records the observed quality of its summaries.
model_ids = [
    # bad with punctuation
    'sshleifer/distilbart-cnn-12-6',
    # disabled -- generally good, but breaks off at the end
    # 'marianna13/flan-t5-base-summarization',
    # not bad, lack of punctuation
    'Falconsai/text_summarization',
    # a lot of misinterpreting, but well structured and concise
    'slauw87/bart_summarisation',
]

In [49]:
# Stream the raw cases lazily: with `chunksize` set, read_csv returns an
# iterator that yields DataFrames of up to 1000 rows each instead of one frame.
cases = pd.read_csv(
    path.joinpath('cases_base.csv'),
    sep='\t',
    encoding='utf-8',
    chunksize=1000,
)

In [50]:
# Pull a single chunk off the reader to inspect it.
# NOTE(review): this advances the shared `cases` iterator, so anything that
# iterates `cases` afterwards will not see this chunk again.
df = next(cases)

# Normalize the headers: lowercase, with spaces turned into underscores.
df.columns = [column.lower().replace(' ', '_') for column in df.columns]
# Number of tokens in each judgment text according to the `enc` tokenizer.
df['tokens_of_judgment'] = df['judgment'].apply(lambda text: len(enc.encode(text)))

In [1]:
# Summarize every judgment with each model in `model_ids` and append the
# result, chunk by chunk, to `processed_cases_path`.
#
# A fresh chunked reader is opened here rather than iterating `cases`: that
# shared iterator has already had its first chunk taken by `next(cases)`, so
# looping over it would silently skip the first 1000 rows, and a second run of
# this cell would find it exhausted and do nothing.
#
# NOTE(review): the output is opened in append mode, so running this cell again
# adds the same rows to an existing file a second time -- remove the file first
# if a clean re-run is wanted.
with pd.read_csv(path / 'cases_base.csv', sep='\t', encoding='utf-8', chunksize=1000) as chunk_reader:
    for df in chunk_reader:
        # Normalize the headers: lowercase, with spaces turned into underscores.
        df.columns = [column.lower().replace(' ', '_') for column in df.columns]
        df['tokens_of_judgment'] = df['judgment'].apply(lambda x: len(enc.encode(x)))

        for model_id in model_ids:
            # Only one model is held at a time; it is released again below
            # before the next one is loaded.
            summarizer = pipeline("summarization", model=model_id)

            # One output column per model, named after the part of the model
            # id before the "/".
            # chunk_size: presumably a 512-token model input limit minus a
            # small safety margin -- TODO confirm against `recursion_summarizing`.
            df[f'summarized_{model_id.split("/")[0]}'] = df['judgment'].progress_apply(
                lambda x: recursion_summarizing(
                    case=x, model=summarizer, chunk_size=512-10, min_length=30, max_length=150
                )
            )

            del summarizer

        # Write the chunk exactly once, after all models have added their
        # column. Writing inside the model loop appended every chunk once per
        # model with a growing number of columns, which duplicated the rows and
        # left the file with rows of differing width.
        df.to_csv(
            processed_cases_path,
            mode='a',
            sep='\t',
            index=False,
            # Emit the header only when the file is being created.
            header=not processed_cases_path.exists(),
            encoding='utf-8',
        )

In [3]:
# Load the accumulated results back in to inspect them.
pd.read_csv(processed_cases_path, sep='\t', encoding='utf-8')