In [None]:
import numpy as np
import pandas as pd
import nltk, re
import json
import torch

In [None]:
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

Load a sentence transformer model

In [None]:
# Load the all-MiniLM-L6-v2 sentence-transformers checkpoint once; later cells
# call `model.encode` on it to embed sentences.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Smoke test: encode two sample sentences and print the resulting embeddings
# to confirm the model loaded and produces output.
sentences = ["This is an example sentence", "Each sentence is converted"]
embeddings = model.encode(sentences)
print(embeddings)

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
# Read the line-delimited JSON file (one record per line, UTF-8) into a DataFrame.
# Later cells read its 'ID' and 'Judgment' columns.
df_val = pd.read_json("val_judg.jsonl", lines = True, encoding = "utf-8")
# Display the loaded frame.
df_val

In [None]:
def preprocess_and_split(text):
    """Clean a raw document and split it into sentences.

    URLs are removed first, then every run of whitespace is collapsed to a
    single space, and the cleaned text is handed to ``sent_tokenize``.

    Args:
        text: The document text, or a missing value (``None`` / NaN).

    Returns:
        A list of sentence strings; an empty list when ``text`` is missing.
    """
    if pd.isna(text):
        return []

    # Strip URLs. The dot after "www" is escaped so that only literal "www."
    # prefixes match; unescaped, it would also delete ordinary words that
    # merely start with "www".
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Normalise whitespace after URL removal so the gaps left behind by the
    # removed URLs do not survive as double or trailing spaces.
    text = re.sub(r'\s+', ' ', text).strip()

    return sent_tokenize(text)


In [None]:
# Split every judgment into a list of cleaned sentences, stored per row in a
# new 'sentences' column.
df_val['sentences'] = df_val['Judgment'].apply(preprocess_and_split)

Generate embeddings for the sentences

In [None]:
def get_embeddings_for_sentences(sentences_list):
    """Embed a list of sentences with the notebook-level ``model``.

    Args:
        sentences_list: The sentences to encode.

    Returns:
        Whatever ``model.encode`` returns for the given list.
    """
    # The whole list goes through a single encode call rather than one call
    # per sentence.
    return model.encode(sentences_list)

# Encode each row's sentence list (one encode call per row) and keep the
# result per row in a new 'embeddings' column.
df_val['embeddings'] = df_val['sentences'].apply(get_embeddings_for_sentences)

Compute the cosine similarity between each pair of sentences

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def create_similarity_matrix(sentence_vectors):
    """Build the pairwise cosine-similarity matrix for a set of sentence vectors.

    Args:
        sentence_vectors: A sequence of ``n`` embedding vectors (for example a
            2-D array with one row per sentence). May be empty.

    Returns:
        An ``(n, n)`` float64 NumPy array where entry ``[i, j]`` is the cosine
        similarity of vectors ``i`` and ``j``. The diagonal is zero, so no
        sentence is similar to itself. An empty input yields a ``(0, 0)`` array.
    """
    n_sentences = len(sentence_vectors)
    if n_sentences == 0:
        # Nothing to compare; cosine_similarity would reject an empty input.
        return np.zeros([0, 0])

    # One row per sentence. The reshape keeps an (n, d) input as is and lifts
    # a flat length-n input to (n, 1), matching per-element reshape(1, -1).
    vectors = np.asarray(sentence_vectors).reshape(n_sentences, -1)

    # A single call computes all pairs at once instead of n*n separate calls.
    # Copy into float64 so the result has the dtype np.zeros would have given.
    sim_mat = np.array(cosine_similarity(vectors), dtype=np.float64)

    # Self-similarity is excluded from the matrix.
    np.fill_diagonal(sim_mat, 0.0)

    return sim_mat

In [None]:
# Build one pairwise similarity matrix per row from that row's sentence
# embeddings, stored in a new 'similarity_matrix' column.
df_val['similarity_matrix'] = df_val['embeddings'].apply(create_similarity_matrix)

In [None]:
# Display the frame with the 'sentences', 'embeddings' and 'similarity_matrix'
# columns added by the cells above.
df_val

Apply the TextRank algorithm

In [None]:
import networkx as nx

In [None]:
def get_ranked_sentences(sentences, sim_mat):
    """Order sentences by their PageRank score on the similarity graph.

    Args:
        sentences: The sentences, in the same order as the rows and columns
            of ``sim_mat``.
        sim_mat: Square matrix of pairwise sentence similarities, used as the
            weighted adjacency matrix of the graph.

    Returns:
        A list of ``(score, sentence)`` tuples sorted in descending order.
    """
    # Node i of the graph corresponds to sentences[i].
    graph = nx.from_numpy_array(sim_mat)
    pagerank_scores = nx.pagerank(graph)

    # Pair every sentence with its node's score, then sort highest first.
    scored = [
        (pagerank_scores[node], sentence)
        for node, sentence in enumerate(sentences)
    ]
    scored.sort(reverse=True)
    return scored

In [None]:
def extract_top_n_sentences(ranked_sentences, n=20):
    """Return the sentence text of the first ``n`` ranked entries.

    Args:
        ranked_sentences: Sequence of entries whose element at index 1 is the
            sentence, already in the desired order.
        n: Maximum number of sentences to keep (default 20).

    Returns:
        A list of at most ``n`` sentences, in the order given; fewer when
        ``ranked_sentences`` is shorter than ``n``.
    """
    # Never read past the end when fewer than n entries are available.
    limit = min(n, len(ranked_sentences))

    top_sentences = []
    for position in range(limit):
        entry = ranked_sentences[position]
        top_sentences.append(entry[1])
    return top_sentences

In [None]:
# Rank each row's sentences using that row's similarity matrix. The row-wise
# apply (axis=1) is needed because the function takes two columns at once.
df_val['ranked_sentences'] = df_val.apply(
    lambda row: get_ranked_sentences(row['sentences'], row['similarity_matrix']),
    axis=1
)

In [None]:
# Spot-check the ranked (score, sentence) pairs of one row.
# NOTE(review): [5] is looked up on the Series index; this assumes df_val has
# the default integer index - confirm.
df_val['ranked_sentences'][5]

In [None]:

# Keep at most the 20 top-ranked sentences of each row as its summary.
df_val['summary'] = df_val['ranked_sentences'].apply(lambda x: extract_top_n_sentences(x, n=20))

In [None]:
# Spot-check the selected summary sentences of one row.
# NOTE(review): [7] is looked up on the Series index; this assumes df_val has
# the default integer index - confirm.
df_val['summary'][7]

In [None]:
# Assemble the output frame: each ID paired with its summary sentences joined
# by single spaces. The joined summary is stored under the 'Judgment' key.
summary_df = pd.DataFrame({
    'ID': df_val['ID'],
    'Judgment': df_val['summary'].apply(lambda x: ' '.join(x))
})

In [None]:
# Display the assembled ID / summary frame before it is written out.
summary_df

In [None]:
# Write the summaries to 'Validation.jsonl' as line-delimited JSON: one record
# (object) per line. force_ascii=False writes non-ASCII characters as-is
# instead of escaping them.
summary_df.to_json('Validation.jsonl', orient='records', lines=True, force_ascii=False)