## Imports

In [15]:
import tiktoken
import torch
import spacy
import pytextrank
import re
import pandas as pd
import numpy as np
from summarizer import Summarizer
import ollama
import csv

MAX_TOKENS = 600

## Preprocessing

In [None]:
# Load the raw facts table. The first CSV column is read as the index and then
# discarded in favour of a fresh positional index.
facts = pd.read_csv(
    '/Users/XXXX-1/XXXX-3/ECHR/data/facts_unclean.csv',
    header=0,
    index_col=0,
).reset_index(drop=True)
facts

In [17]:
# Which Python types occur in the 'facts' column?
types = pd.unique(facts['facts'].apply(type))
types

array([<class 'str'>, <class 'float'>], dtype=object)

In [18]:
# Boolean mask of rows holding at least one missing value, and the index
# labels of those rows (reused later to drop the same rows from document_info).
nan_rows = facts.isnull().any(axis='columns')
nan_indices = facts[nan_rows].index.tolist()
nan_indices

[212, 266, 409, 767]

In [19]:
# Keep only the rows in which every column is populated.
facts = facts[facts.notna().all(axis=1)]


In [None]:
def clean_text(text):
    # Remove markdown elements and special characters
    text = re.sub(r'#', '', text)  # Remove '###'
    text = re.sub(r'[-]', '', text)  # Remove '-'
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\\[a-zA-Z0-9]+', '', text)  # Remove any escaped sequences like \xa0
    return text

# Clean every entry of the 'facts' column, preserving row order.
cleaned_facts = list(map(clean_text, facts['facts']))
cleaned_facts

In [14]:
# Dump the cleaned facts one per row, without a header line.
# NOTE(review): a later cell writes a DataFrame to this same path.
with open('facts_clean_final.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv.writer(csvfile).writerows([fact] for fact in cleaned_facts)


In [21]:
# Save the cleaned facts as a single-column CSV with a "facts" header and no
# index column.
csv_file_path = "facts_clean_final.csv"
df = pd.DataFrame({"facts": cleaned_facts})
df.to_csv(csv_file_path, index=False)


## TextRank

In [14]:
# spaCy pipeline with the "textrank" component added as the last stage; the
# length cap is raised to 3,000,000 characters.
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 3_000_000
nlp.add_pipe("textrank", last=True)

# Token encoder and torch device (CUDA when available, otherwise CPU).
tokenizer = tiktoken.encoding_for_model("gpt-3.5")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [30]:
# Preview the first cleaned document as a sanity check.
cleaned_facts[0]

' THE FACTS 2. The applicant was born in 1971 and lives in Blagoevgrad. He was represented by Mr M. Ekimdzhiev and Ms K. Boncheva, lawyers practising in Plovdiv. 3. The Government were represented by their Agent, Ms I. StanchevaChinova of the Ministry of Justice. Background to the case Background to the case 4. Following parliamentary elections on 12 May 2013, on 29 May 2013 a new government was formed, led by Mr Plamen Oresharski and supported in Parliament by the Coalition for Bulgaria (whose main member was the Bulgarian Socialist Party – see paragraph 8 below), which had eightyfour members of Parliament, the Movement for Rights and Freedoms, which had thirtysix members of Parliament, and the chairman of another political party, Ataka, Mr Volen Siderov, who was also a member of Parliament. Together, these provided a majority of one hundred twentyone out of the total of two hundred and forty members of Parliament. 4. Following parliamentary elections on 12 May 2013, on 29 May 2013 a 

In [31]:
def get_extractive_summary_textrank(parsed_doc, limit_phrases=4, limit_sentences=10):
    """Select the sentences that best cover a document's top-ranked phrases.

    The first ``limit_phrases`` entries of ``parsed_doc._.phrases`` each
    contribute their ``rank`` as a weight; the weights are normalised to sum
    to 1. A sentence is scored by the Euclidean norm of the weights of the
    phrases it does NOT contain, so a lower score means better coverage.
    Sentences are returned best-first, skipping any whose text was already
    selected.

    Args:
        parsed_doc: Document exposing ``sents`` (items with ``start``, ``end``
            and ``text``) and ``_.phrases`` (items with ``rank`` and
            ``chunks``, each chunk having ``start`` and ``end``).
        limit_phrases: Number of phrases to consider; ``None`` or 0 means all.
        limit_sentences: Maximum number of sentences to return. ``None`` means
            no limit; 0 or a negative value yields an empty list.

    Returns:
        A list of sentence texts ordered from best to worst score. Ties keep
        document order.
    """
    if limit_sentences is not None and limit_sentences <= 0:
        return []

    sentences = list(parsed_doc.sents)
    if not sentences:
        return []

    bounds = [(sentence.start, sentence.end) for sentence in sentences]
    # One set per sentence holding the ids of the phrases found inside it.
    phrases_in_sentence = [set() for _ in sentences]

    phrase_ranks = []
    for phrase_id, phrase in enumerate(parsed_doc._.phrases):
        phrase_ranks.append(phrase.rank)
        for chunk in phrase.chunks:
            # Credit only the first sentence that fully contains the chunk.
            for (sent_start, sent_end), members in zip(bounds, phrases_in_sentence):
                if chunk.start >= sent_start and chunk.end <= sent_end:
                    members.add(phrase_id)
                    break
        if limit_phrases and phrase_id + 1 >= limit_phrases:
            break

    # Force a float array so integer ranks normalise cleanly, and skip the
    # division when the ranks sum to zero (it would only produce NaN/inf).
    weights = np.asarray(phrase_ranks, dtype=float)
    total = weights.sum()
    if total:
        weights = weights / total
    squared = weights * weights

    # Score = norm of the weights of the phrases missing from the sentence.
    scores = []
    for members in phrases_in_sentence:
        missing = np.ones(len(squared), dtype=bool)
        if members:
            missing[list(members)] = False
        scores.append(np.sqrt(squared[missing].sum()))

    # sorted() is stable, so equal scores stay in document order.
    ranked = sorted(range(len(sentences)), key=scores.__getitem__)

    summary = []
    seen_sentences = set()
    for index in ranked:
        if limit_sentences is not None and len(summary) >= limit_sentences:
            break
        text = sentences[index].text
        if text not in seen_sentences:
            seen_sentences.add(text)
            summary.append(text)

    return summary

In [None]:
# TextRank summary (up to 10 sentences, all phrases considered) for each
# cleaned document, in the same order as cleaned_facts.
textrank_summaries = [
    get_extractive_summary_textrank(nlp(fact), limit_phrases=None, limit_sentences=10)
    for fact in cleaned_facts
]
textrank_summaries

## BERT

In [39]:
# Extractive summarizer built on the "distilbert-base-uncased" checkpoint.
# NOTE(review): hidden=[-1, -2] with hidden_concat=True presumably concatenates
# the last two hidden layers for the sentence embeddings, and gpu_id=0 selects
# the first GPU — confirm against the bert-extractive-summarizer docs.
model_summ = Summarizer("distilbert-base-uncased", hidden_concat = True, hidden = [-1, -2], gpu_id = 0)

In [40]:
def get_extractive_summary_bert(doc, limit_sentences = 10):
    """Run the module-level ``model_summ`` summarizer on ``doc``.

    Args:
        doc: The text to summarise.
        limit_sentences: Value forwarded as ``num_sentences``.

    Returns:
        Whatever ``model_summ`` returns when called with
        ``return_as_list=True`` and ``use_first=False``.
    """
    summary_sentences = model_summ(
        doc,
        num_sentences=limit_sentences,
        return_as_list=True,
        use_first=False,
    )
    return summary_sentences

In [None]:
# BERT extractive summary (10 sentences requested) for each cleaned document,
# in the same order as cleaned_facts.
bert_summaries = [
    get_extractive_summary_bert(fact, limit_sentences=10)
    for fact in cleaned_facts
]
bert_summaries

In [None]:
# Load the per-document metadata table and replace its stored index with a
# fresh positional one.
document_info = pd.read_csv(
    '../data/df_dignity_facts_unclean.csv',
    index_col=0,
).reset_index(drop=True)
document_info

In [None]:
# Remove the rows whose index labels were recorded in nan_indices.
# NOTE(review): this assumes document_info is row-aligned with facts — confirm.
document_info = document_info.drop(index=nan_indices)
document_info

In [None]:
# Attach both extractive summaries as new columns, textrank first.
document_info = document_info.assign(
    textrank_summary=textrank_summaries,
    bert_summary=bert_summaries,
)
document_info

In [43]:
# Save the metadata table, now carrying the TextRank and BERT summaries.
document_info.to_csv('textrank_bert_extractive_document_info.csv')

## Phi-3 Ollama

In [51]:
def generate_summary_phi3(fact, num_ctx=None):
    """Summarise one legal text with the 'phi3:mini-128k' model via Ollama.

    Args:
        fact: The legal text inserted into the prompt.
        num_ctx: Optional context-window size (in tokens) forwarded to Ollama
            as the ``num_ctx`` option. With the default ``None`` the option is
            not sent and the server's own default window applies.
            NOTE(review): that default is much smaller than this model's 128k
            capacity, so long texts are likely truncated unless ``num_ctx``
            is raised — confirm against the Ollama version in use.

    Returns:
        The ``'response'`` field of the reply from ``ollama.generate``.
    """
    # Same text as the original triple-quoted template, indentation included;
    # written out line by line so the significant whitespace stays visible.
    prompt = (
        "\n"
        "    ## Instructions\n"
        "    You are a legal expert. Write a concise summary of the following legal text.\n"
        "    \n"
        "    ## Legal Text\n"
        f"    Legal Text: {fact}\n"
        "    "
    )

    # temperature 0 and a num_predict cap of MAX_TOKENS, as before.
    options = {"temperature" : 0, "num_predict" : MAX_TOKENS}
    if num_ctx is not None:
        options["num_ctx"] = num_ctx

    response = ollama.generate(
        model = 'phi3:mini-128k',
        prompt = prompt,
        options = options
    )

    return response['response']

In [52]:
# One Phi-3 summary per cleaned document, in the same order as cleaned_facts.
phi3_summaries = [generate_summary_phi3(fact) for fact in cleaned_facts]

In [53]:
# Preview the first generated summary as a sanity check.
phi3_summaries[0]

" On 28 December 2012, a man from Blagoevgrad painted over the statue of Todor Băgea in the town centre and placed a cap and a sack on it as part of his act. The court found that this was an indecent act amounting to minor hooliganism which breached public order. It also held that fundamental rights could not be exercised by committing acts contrary to the 1963 Decree, thus rejecting the applicant's argument that he had been exercising his right to protest against the government. The court upheld a fine of BGN 50 and costs amounting to BGN 248 imposed by the first-instance court."

In [None]:
# Attach the Phi-3 summaries as a new column, then display the frame.
# NOTE(review): this column name is plural while 'textrank_summary' and
# 'bert_summary' are singular — confirm downstream readers expect that.
document_info['phi3_summaries'] = phi3_summaries
document_info

In [55]:
# Save the final table containing all three summary columns.
document_info.to_csv('phi3_textrank_bert_dataset.csv')