<a href="https://colab.research.google.com/github/Cody9494/LEGALSKEPSIS-DATA/blob/main/LAWSKPEPSIS_STEP5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset read later in this
# notebook lives under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Load the chunk dataset that already carries its precomputed embeddings
df_chunks = pd.read_parquet(
    "/content/drive/MyDrive/LAWSKEPSIS/df_chunks_with_embeddings.parquet"
)

# Make sure every stored embedding is a torch tensor
df_chunks["embedding"] = df_chunks["embedding"].map(torch.tensor)


In [None]:
# Load the sentence-embedding model that is later used to encode search queries.
# NOTE(review): presumably the same model that produced the stored chunk
# embeddings — confirm, otherwise similarities are not comparable.
model = SentenceTransformer("intfloat/e5-mistral-7b-instruct")


In [None]:
# Inspect the loaded chunk table.
df_chunks

In [None]:
def semantic_search_grouped_by_act(query, model, df_chunks, top_k=5, aggregation="max"):
    """Rank acts by semantic similarity to ``query`` and return each top act's best chunk.

    Every chunk is scored with the cosine similarity between its stored
    embedding and the query embedding. Chunk scores are aggregated per
    ``act_id``, the ``top_k`` acts are selected, and for each of them the single
    most similar chunk is returned.

    Args:
        query: Text to search for; passed to ``model.encode``.
        model: Object exposing ``encode(text, convert_to_tensor=True,
            normalize_embeddings=True)`` that returns a 1-D torch tensor.
        df_chunks: DataFrame with columns ``act_id``, ``chunk_index``,
            ``chunk_text`` and ``embedding`` (one torch tensor per row, all of
            the same length). The frame is not modified.
        top_k: Number of acts to return.
        aggregation: How chunk scores are combined into an act score:
            ``"max"``, ``"mean"`` or ``"top3mean"`` (mean of the three best
            chunks of the act).

    Returns:
        DataFrame with columns ``act_id``, ``chunk_index``, ``chunk_text`` and
        ``similarity``, one row per selected act, ordered from the best-ranked
        act to the worst. ``similarity`` is the similarity of the returned
        chunk itself, not the aggregated act score.

    Raises:
        ValueError: If ``aggregation`` is not one of the supported methods.
    """
    result_columns = ["act_id", "chunk_index", "chunk_text", "similarity"]

    # Fail fast, before paying for the query encoding.
    if aggregation not in ("max", "mean", "top3mean"):
        raise ValueError("Unknown aggregation method")

    # Nothing to search; torch.stack would raise on an empty list.
    if len(df_chunks) == 0:
        return pd.DataFrame(columns=result_columns)

    # 1. Embed the query. Cast to float32 so the score tensor can always be
    #    converted to NumPy, even if the model emits a reduced-precision dtype.
    query_emb = model.encode(query, convert_to_tensor=True, normalize_embeddings=True).float()

    # 2. Embedding matrix, moved to the query's device and dtype so that both
    #    operands of the similarity agree.
    all_embeddings = torch.stack(df_chunks["embedding"].to_list()).to(
        device=query_emb.device, dtype=query_emb.dtype
    )

    # 3. Cosine similarity of the query against every chunk
    cosine_scores = F.cosine_similarity(query_emb.unsqueeze(0), all_embeddings)

    # Work on a copy so the caller's frame never gains a "similarity" column.
    scored = df_chunks.copy()
    scored["similarity"] = cosine_scores.cpu().numpy()

    # 4. Aggregate chunk scores per act_id
    if aggregation == "max":
        act_scores = scored.groupby("act_id")["similarity"].max()
    elif aggregation == "mean":
        act_scores = scored.groupby("act_id")["similarity"].mean()
    else:  # "top3mean"
        act_scores = (
            scored.sort_values("similarity", ascending=False)
            .groupby("act_id")
            .head(3)
            .groupby("act_id")["similarity"]
            .mean()
        )

    # 5. Pick the top-k acts, best first (stable sort keeps ties deterministic)
    top_acts = (
        act_scores.sort_values(ascending=False, kind="stable").head(top_k).reset_index()
    )

    # 6. Best chunk of each selected act: after sorting by similarity the first
    #    row seen for an act is its most similar chunk.
    best_chunks = (
        scored[scored["act_id"].isin(top_acts["act_id"])]
        .sort_values("similarity", ascending=False, kind="stable")
        .drop_duplicates("act_id", keep="first")
    )

    # A left merge preserves the order of the left keys, so rows come back in
    # ranking order rather than in act_id order.
    ranked = top_acts[["act_id"]].merge(best_chunks, on="act_id", how="left")

    return ranked[result_columns].reset_index(drop=True)


In [None]:
# 2. Drop chunks that contain too few tokens
df_filtered = df_chunks.loc[df_chunks["token_len"] > 30]


# 3. Keywords that mark a word as noise
noise_keywords = [
    "http",
    "www.",
    "europa.eu",
    "italic",
    "ELI:",
    "OJ L",
    "data.europa.eu",
    "screen/expert-groups",
]

# 4. Compute the share of noisy words per chunk
def noise_ratio(text, noise_keywords):
    """Return the fraction of whitespace-separated words in ``text`` that are noise.

    A word is noise when it contains one of ``noise_keywords`` as a
    case-insensitive substring. A keyword that itself contains whitespace
    (e.g. ``"OJ L"``) is matched against runs of consecutive words, and every
    word of a matching run is counted as noise; matching such keywords word by
    word could never succeed.

    Args:
        text: Chunk text. Anything that is not a string (e.g. None/NaN) is
            treated like empty text.
        noise_keywords: Iterable of keyword strings. Empty or whitespace-only
            keywords are ignored.

    Returns:
        A float in [0, 1]; 1.0 for empty or non-string text.
    """
    if not isinstance(text, str):
        return 1.0

    words = text.lower().split()
    if not words:
        return 1.0

    is_noise = [False] * len(words)
    for keyword in noise_keywords:
        # Lower-case and tokenize each keyword once, not once per word.
        kw_tokens = keyword.lower().split()
        if not kw_tokens:
            continue
        span = len(kw_tokens)
        needle = " ".join(kw_tokens)
        # Slide a window as wide as the keyword; for single-word keywords this
        # is a plain per-word substring test.
        for start in range(len(words) - span + 1):
            if needle in " ".join(words[start:start + span]):
                for pos in range(start, start + span):
                    is_noise[pos] = True

    return sum(is_noise) / len(words)

# Score every chunk. ``assign`` returns a new frame instead of writing a column
# into a filtered slice of df_chunks, which avoids pandas'
# SettingWithCopyWarning and keeps this cell safe to re-run.
df_filtered = df_filtered.assign(
    noise_ratio=df_filtered["chunk_text"].apply(lambda txt: noise_ratio(txt, noise_keywords))
)

# 5. Keep only chunks whose share of noisy words is below 30%
df_filtered = df_filtered[df_filtered["noise_ratio"] < 0.3].reset_index(drop=True)

# 6. Show a sample (capped at the number of surviving rows, so a small
#    filtered set does not make ``sample`` raise)
import random
sample = df_filtered.sample(min(3, len(df_filtered)), random_state=42)
pd.set_option("display.max_colwidth", None)
display(sample[["act_id", "chunk_index", "chunk_text", "token_len", "noise_ratio"]])

In [None]:
# Number of chunks remaining after the token-length and noise filters.
len(df_filtered)

In [None]:
# Inspect the filtered chunk table.
df_filtered

In [None]:
query = "Can the EU freeze a person’s assets as part of sanctions?"

# Search only the noise-filtered chunks; besides "max", the supported
# aggregation modes are "mean" and "top3mean".
top_acts = semantic_search_grouped_by_act(
    query, model, df_filtered, top_k=5, aggregation="max"
)

# Show the full chunk text rather than pandas' truncated column preview.
pd.set_option("display.max_colwidth", None)
display(top_acts)
