# Processing Issues using Gemma 7B

In [1]:
import ollama
import json
import tiktoken
from llama_index.core.node_parser import TokenTextSplitter
from functools import partial

In [2]:
# Load the issues file as JSON Lines: every non-blank line is parsed as its
# own JSON document and collected into a list of records.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding, and blank lines (e.g. a trailing
# newline at the end of the file) are skipped instead of crashing json.loads.
with open('data/prediction_data/issues.json', encoding='utf-8') as f:
    issues_data = [json.loads(line) for line in f if line.strip()]

### Text Processor

In [3]:
# Token budget: texts whose token count is at or below this value are kept
# whole by txt_splitter; longer texts are handed to TokenTextSplitter with
# this value as its chunk_size.
chunk_size = 6000
# Passed to TokenTextSplitter as its primary separator (line breaks).
separator = '\n'
# Passed to TokenTextSplitter as backup_separators (sentence end, then space).
backup_separators = [".", " "]
# Callable mapping a string to a list of token ids using tiktoken's
# "cl100k_base" encoding, with allowed_special="all" forwarded to encode().
# NOTE(review): per the tiktoken docs this makes special-token text encode
# instead of raising - confirm that is the intent.
# NOTE(review): cl100k_base is not the tokenizer of the 'gemma' model used
# further down, so token counts are presumably only an approximation of what
# the model sees - verify the budget leaves enough headroom.
tokenizer = partial(tiktoken.get_encoding("cl100k_base").encode, allowed_special="all")

def txt_splitter(text):
    """Return *text* as a list of chunks sized for the LLM.

    Args:
        text: The string to split.

    Returns:
        ``[text]`` unchanged when the text encodes to ``chunk_size`` tokens
        or fewer (measured with the module-level ``tokenizer``); otherwise
        the list produced by ``TokenTextSplitter.split_text``.
    """
    # Cheap path first: most texts fit, so avoid building a splitter for them.
    if len(tokenizer(text)) <= chunk_size:
        return [text]

    text_splitter = TokenTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        backup_separators=backup_separators,
        tokenizer=tokenizer,
    )
    return text_splitter.split_text(text)

### Prepare Issues

In [4]:
# Replace each record's raw 'issues' text with the list of chunks returned by
# txt_splitter. Records whose 'issues' value is already a list are skipped so
# that re-running this cell does not feed a list back into the tokenizer.
for issues in issues_data:
    if isinstance(issues['issues'], list):
        continue
    issues['issues'] = txt_splitter(issues['issues'])

## LLM Magic

Prompt Templates

In [None]:
def generate_summary_template(context, prev_summary=''):
    """Build the prompt asking the model to summarise one chunk of a judgment.

    Args:
        context: Text of the chunk to summarise; it is followed by a full stop.
        prev_summary: Text inserted on its own line before the chunk
            (empty by default).

    Returns:
        The prompt string, with every line indented by four spaces.
    """
    return (
        "\n"
        "    You are a Singapore Lawyer. \n"
        "\n"
        "    summarise the main issues of the legal judgment below in 1500 words:\n"
        "\n"
        f"    {prev_summary}\n"
        "\n"
        f"    {context}.\n"
        "    "
    )

def generate_final_template(context):
    """Build the prompt asking the model to restate a summary as plain sentences.

    Args:
        context: Text placed at the top of the prompt, followed by a full stop.

    Returns:
        The prompt string: the context, the instruction to emit
        full-stop-separated sentences without headers, and a three-issue
        example of the expected output shape.
    """
    return (
        "\n"
        f"    {context}.\n"
        "\n"
        "    summarise the legal issues above into sentences separated by full-stop. DO NOT give any headers.\n"
        "\n"
        "    EXAMPLE: (if there are 3 issues)\n"
        "    Issue 1. Issue 2. Issue 3.\n"
        "    "
    )

: 

Start

In [None]:
# File (relative to the working directory) that the summarisation loop dumps
# issues_data into when it checkpoints.
json_path = 'processed_issues.json'

: 

In [None]:
# Number of newly processed records between two checkpoint writes.
CHECKPOINT_EVERY = 500


def _save_checkpoint(data, path):
    """Write *data* to *path* as indented JSON, replacing any existing file."""
    with open(path, 'w') as jf:
        json.dump(data, jf, indent=4)


# Summarise every record with the 'gemma' model served by ollama:
#   1. fold the chunks into a running summary (each prompt carries the
#      previous response forward),
#   2. ask the model to restate that summary as full-stop-separated sentences,
#   3. store the sentences under the 'summarised issues' key.
checkpoint = 0  # records processed since the last checkpoint write
for issues in issues_data:
    # NOTE(review): a record with exactly three keys is skipped, presumably
    # because it already carries 'summarised issues' - confirm against the
    # input schema.
    if len(issues) == 3:
        continue

    prev_summary = ''
    checkpoint += 1
    for chunk in issues['issues']:
        context = generate_summary_template(chunk, prev_summary)
        response = ollama.generate(model='gemma', prompt=context, stream=False)
        prev_summary = response['response']

    print(prev_summary)
    overall_summary = generate_final_template(prev_summary)
    response = ollama.generate(model='gemma', prompt=overall_summary, stream=False)
    final_summary = response['response']

    # Splitting "A. B. C." on '.' leaves a trailing empty string and
    # space-padded fragments; keep only the trimmed, non-empty sentences.
    issues['summarised issues'] = [
        sentence.strip()
        for sentence in final_summary.split('.')
        if sentence.strip()
    ]

    if checkpoint == CHECKPOINT_EVERY:
        _save_checkpoint(issues_data, json_path)
        checkpoint = 0

# Persist whatever was processed after the last periodic checkpoint; without
# this the final (< CHECKPOINT_EVERY) summaries would never reach disk.
if checkpoint:
    _save_checkpoint(issues_data, json_path)

: 

: 

In [None]:
# Show the stored sentence list of every record, one record per line.
for summarised in (entry['summarised issues'] for entry in issues_data):
    print(summarised)

NameError: name 'issues_data' is not defined