1. IMPORTURI ȘI CONFIGURARE

In [1]:
import os
import re
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# File paths (absolute and machine-specific; adjust before running elsewhere)
FILE_PATH_COMPANIES = r"C:\Users\...\ml_insurance_challenge.csv"
FILE_PATH_TAXONOMY  = r"C:\Users\...\insurance_taxonomy.txt"
OUTPUT_FILE_PATH    = r"C:\Users\...\output_refined.csv"

# Sentence-embedding model used to encode both company texts and taxonomy labels
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# Classification parameters
THRESHOLD = 0.4      # a label is assigned when its score is strictly above this value
TOP_K = 2            # number of best-scoring labels reported in "assigned_topk"
BOOST_VALUE = 0.015  # amount added to a label's score when one of its keywords occurs in the text

# Synonym normalisation: regex pattern -> canonical replacement.
# Patterns are applied to lower-cased text, in insertion order.
SYNONYMS = {
    r"\binsurance\b": "insurance",
    r"\binsured\b":  "insurance",
    # "polic(?:y|ies)" covers singular and plural. The earlier "policy(ies)?"
    # only matched "policy" and the non-word "policyies", so "policies" was
    # never normalised.
    r"\bpolic(?:y|ies)\b": "policy",
    r"\bclaims?\b": "claims"
}

# LABEL_BOOST (label -> keywords mapping) is loaded from the external label_boost module because the mapping is large
from label_boost import LABEL_BOOST

2. CITIRE FIȘIER

In [5]:
def load_data():
    """Read the company table and the taxonomy label list from disk.

    Both paths come from the module-level constants FILE_PATH_COMPANIES and
    FILE_PATH_TAXONOMY. The companies path is checked first, then the
    taxonomy path.

    Returns:
        A ``(df, taxonomy_labels)`` tuple: the DataFrame produced by
        ``pd.read_csv`` and a list of the non-blank, whitespace-stripped
        lines of the taxonomy file (read as UTF-8).

    Raises:
        FileNotFoundError: if either configured path does not exist.
    """
    companies_present = os.path.exists(FILE_PATH_COMPANIES)
    if not companies_present:
        raise FileNotFoundError(f"Fișier companii inexistent: {FILE_PATH_COMPANIES}")

    taxonomy_present = os.path.exists(FILE_PATH_TAXONOMY)
    if not taxonomy_present:
        raise FileNotFoundError(f"Fișier taxonomie inexistent: {FILE_PATH_TAXONOMY}")

    companies = pd.read_csv(FILE_PATH_COMPANIES)

    labels = []
    with open(FILE_PATH_TAXONOMY, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            label = raw_line.strip()
            if label:  # skip blank / whitespace-only lines
                labels.append(label)

    return companies, labels

# Load the company table and the taxonomy labels; load_data raises
# FileNotFoundError if either configured path does not exist.
df_companies, taxonomy_labels = load_data()

3. CURĂȚARE ȘI ENRICHMENT TEXT

In [6]:
# Columns whose values are concatenated, in this order, into each company's text;
# clean_and_enrich_text skips any column that is absent from the row or holds NaN.
TEXT_COLUMNS = ["description", "business_tags", "sector", "category", "niche"]

def clean_and_enrich_text(row):
    """Build one normalised text string from a company row.

    Steps, in order:
      1. Take every column listed in TEXT_COLUMNS that exists in ``row`` and
         is not NaN; stringify, strip and lower-case it.
      2. Join the pieces with single spaces.
      3. Apply each SYNONYMS regex -> replacement pair.
      4. Replace every character outside ``a-z``, ``0-9`` and whitespace
         with a space.
      5. Collapse whitespace runs to a single space and strip the ends.

    Args:
        row: a mapping-like record supporting ``in`` and ``[]`` (used here
            with DataFrame rows via ``df.apply(..., axis=1)``).

    Returns:
        The cleaned text; an empty string when no column contributes.
    """
    fragments = [
        str(row[name]).strip().lower()
        for name in TEXT_COLUMNS
        if name in row and pd.notna(row[name])
    ]
    text = " ".join(fragments)

    # Synonym normalisation runs before punctuation removal, on lower-case text.
    for regex, canonical in SYNONYMS.items():
        text = re.sub(regex, canonical, text)

    text = re.sub(r"[^a-z0-9\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

4. APLICĂ CURĂȚAREA PE TOT DF

In [8]:
def preprocess_companies(df):
    """Add a "combined_text" column holding the cleaned text of every row.

    The column is written onto the DataFrame that was passed in (no copy is
    made), and that same object is returned.

    Args:
        df: company DataFrame; each row is passed to clean_and_enrich_text.

    Returns:
        ``df`` itself, now containing "combined_text".
    """
    cleaned_rows = df.apply(clean_and_enrich_text, axis=1)
    df["combined_text"] = cleaned_rows
    return df

# Add the "combined_text" column; preprocess_companies writes it onto the
# DataFrame object it receives and returns that same object.
df_companies = preprocess_companies(df_companies)

5. EMBEDDINGS ȘI BOOST

In [9]:
def compute_embeddings_and_scores(df, taxonomy_labels):
    """Embed company texts and taxonomy labels and return their cosine similarities.

    A SentenceTransformer identified by MODEL_NAME is instantiated on every
    call. Company texts are encoded first, then the labels; both with a
    progress bar and tensor output.

    Args:
        df: DataFrame with a "combined_text" column.
        taxonomy_labels: list of label strings.

    Returns:
        The result of ``util.cos_sim(company_embeddings, label_embeddings)``;
        downstream code indexes it as [company row, label position].
    """
    encoder = SentenceTransformer(MODEL_NAME)
    encode_options = {"convert_to_tensor": True, "show_progress_bar": True}

    company_vectors = encoder.encode(df["combined_text"].tolist(), **encode_options)
    label_vectors = encoder.encode(taxonomy_labels, **encode_options)

    return util.cos_sim(company_vectors, label_vectors)

# Encode every company text and taxonomy label and compute their cosine
# similarities. In the recorded run the company batches took about 18 minutes.
cos_sim_matrix = compute_embeddings_and_scores(df_companies, taxonomy_labels)

def apply_rule_based_boost(df, cos_sim_matrix, taxonomy_labels, label_boost_map, boost_value=BOOST_VALUE):
    """Return a boosted CPU copy of the similarity matrix.

    For each company row and each label in ``label_boost_map`` that is also
    present in ``taxonomy_labels``, ``boost_value`` is added once to that
    row/label score when at least one of the label's keywords occurs as a
    substring of the row's "combined_text".

    Args:
        df: DataFrame with a "combined_text" column; row order must match the
            row order of ``cos_sim_matrix``.
        cos_sim_matrix: 2-D tensor indexed as [company row, label position].
            It is not modified.
        taxonomy_labels: label strings; position gives the matrix column.
        label_boost_map: mapping of label -> iterable of keyword strings.
            Labels missing from ``taxonomy_labels`` are ignored.
        boost_value: amount added per matching (row, label) pair.

    Returns:
        A new CPU tensor with the boosts applied.
    """
    # Tensor.cpu() hands back the very same tensor when the input already
    # lives on the CPU, so clone explicitly; otherwise the in-place additions
    # below would silently modify the caller's matrix.
    boosted = cos_sim_matrix.cpu().clone()

    label_to_index = {label: idx for idx, label in enumerate(taxonomy_labels)}

    # Resolve the boost map to (column, keywords) pairs once instead of
    # re-checking label membership for every row.
    boost_rules = [
        (label_to_index[label], keywords)
        for label, keywords in label_boost_map.items()
        if label in label_to_index
    ]

    # enumerate() yields positional row numbers. DataFrame.iterrows() yields
    # index *labels*, which only coincide with tensor rows for a default
    # RangeIndex and would mis-address rows after any filtering/re-indexing.
    for row_pos, text in enumerate(df["combined_text"]):
        for label_idx, keywords in boost_rules:
            if any(keyword in text for keyword in keywords):
                boosted[row_pos, label_idx] += boost_value

    return boosted

# Apply the keyword boosts from LABEL_BOOST. The name cos_sim_matrix is rebound
# to the boosted scores, so re-running this line alone adds the boost again.
cos_sim_matrix = apply_rule_based_boost(df_companies, cos_sim_matrix, taxonomy_labels, LABEL_BOOST)

Batches: 100%|██████████| 297/297 [18:01<00:00,  3.64s/it]
Batches: 100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


6. CLASIFICARE

In [10]:
def classify_companies(df, cos_sim_matrix, taxonomy_labels, threshold=THRESHOLD, top_k=TOP_K):
    """Assign taxonomy labels to every company from its row of scores.

    Three columns are written onto ``df`` (in place) and ``df`` is returned:

    * "assigned_hybrid": every label whose score is strictly greater than
      ``threshold``, comma-joined in taxonomy order; when none qualifies,
      the single highest-scoring label.
    * "assigned_topk": the ``top_k`` highest-scoring labels, comma-joined in
      descending score order.
    * "best_score": the row's maximum score as a Python float.

    Args:
        df: company DataFrame; ``len(df)`` rows of ``cos_sim_matrix`` are read.
        cos_sim_matrix: 2-D tensor indexed as [company row, label position].
        taxonomy_labels: label strings aligned with the matrix columns.
        threshold: score cut-off for the hybrid assignment.
        top_k: number of labels kept for the top-k assignment.

    Returns:
        ``df`` with the three columns added.
    """
    hybrid_col = []
    topk_col = []
    score_col = []

    for row_pos in range(len(df)):
        row_scores = cos_sim_matrix[row_pos]

        # Hybrid rule: all labels above the threshold, else fall back to the argmax.
        passing = torch.where(row_scores > threshold)[0].tolist()
        if passing:
            hybrid_value = ", ".join(taxonomy_labels[pos] for pos in passing)
        else:
            _, fallback_idx = torch.max(row_scores, dim=0)
            hybrid_value = taxonomy_labels[fallback_idx.item()]
        hybrid_col.append(hybrid_value)

        # Top-k rule: best-scoring labels in descending order.
        ranked = torch.argsort(row_scores, descending=True)[:top_k].tolist()
        topk_col.append(", ".join(taxonomy_labels[pos] for pos in ranked))

        score_col.append(float(row_scores.max().item()))

    df["assigned_hybrid"] = hybrid_col
    df["assigned_topk"] = topk_col
    df["best_score"] = score_col
    return df

# Add the "assigned_hybrid", "assigned_topk" and "best_score" columns using the
# default THRESHOLD and TOP_K from the configuration cell.
df_companies = classify_companies(df_companies, cos_sim_matrix, taxonomy_labels)

7. SALVARE DATE ÎN FORMAT CSV

In [13]:
# Write the classified table to OUTPUT_FILE_PATH as UTF-8 CSV without the
# index column, then print the destination path.
df_companies.to_csv(OUTPUT_FILE_PATH, index=False, encoding="utf-8")
print(f"Rezultate salvate în: {OUTPUT_FILE_PATH}")

Rezultate salvate în: C:\Users\dcata\Desktop\FACULTA\ANUL 3 SEM 2\Internship Veridion\Veridion_challenge\output_refined.csv
