In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data = pd.read_csv('data/ml_insurance_challenge.csv')
labels = pd.read_csv('data/insurance_taxonomy - insurance_taxonomy.csv')
print(data.shape)
print(labels.shape)

(9494, 5)
(220, 1)


In [4]:
import re
from html import unescape

def soft_clean(s: str) -> str:
    if not s:
        return ""
    s = unescape(s)  # decode HTML entities
    s = re.sub(r'https?://\S+', '<URL>', s)
    s = re.sub(r'\S+@\S+', '<EMAIL>', s)
    # înlocuiește doar multiplele spații, fără a atinge newline-urile
    s = re.sub(r'[ \t]+', ' ', s)
    # normalizează liniile multiple: mai mult de 2 -> doar 2
    s = re.sub(r'\n{3,}', '\n\n', s)
    # curăță spații la capete de linii
    s = re.sub(r' *\n *', '\n', s)
    return s.strip()

In [5]:
text = ''
text += "Passage: This is a company profile\n"
for index, column in enumerate(data.iloc[1]):
    # dacă e listă de stringuri în formă text (ex: "['a','b','c']")
    if isinstance(column, str) and column.startswith('[') and column.endswith(']'):
        items = column[1:-1].split(',')
        text += "Business Tags: "
        for item in items:
            text += f"{item.strip().strip('\"').strip('\'')}, "  # eliminăm spații și ghilimele
        text = text[:-2] + '\n'  # scoate ultima virgulă, adaugă newline

    else:
        text += f"{data.columns[index].capitalize()}: {column}\n"

text = soft_clean(text)
print(text)


Passage: This is a company profile
Description: Kyoto Vegetable Specialists Uekamo, also known as Iwa-machi, is a company based in Kyoto, Japan that specializes in the sale of vegetables. They have been in business for ten years and offer a collection of vegetable recipes through their Keiō Vegetable Recipe Collection and Online Shop. The company is directly owned by Uekamoo Farm, Uekame Farm, and Lobechi Shijo-hara Farm. They offer a variety of vegetable products, including suguki-zuke and Kamoo eggplant, and also accept production cultivation according to customer requests. Iwaichi Limited Company uses their experience in production and sales to provide tailored vegetables to meet customer needs and also accepts cultivation of products according to their requirements.
Business Tags: Wholesale, Dual-task Movement Products, Cast Iron Products Manufacturer, Manufacturing Technology, Food and Beverage, Rice And Noodles, High-quality Gloss of Cast Iron, Rice Wholesaler, Miscellaneous Crop

In [6]:
label_text = ''
for index, column in enumerate(labels.iloc[0]):
    label_text += "Query: This is an insurance taxonomy describing a type of business. "
    label_text += "Label: " + column
    label_text = soft_clean(label_text)
print(label_text)

Query: This is an insurance taxonomy describing a type of business. Label: Agricultural Equipment Services


In [7]:
# alegem 5 numere aleatoare pentru a vedea cum arată textele
random_indices = np.random.choice(data.index, size=5, replace=False)
for idx in random_indices:
    text = ''
    for index, column in enumerate(data.iloc[idx]):
        text += "Passage: This is a company profile\n"
        # dacă e listă de stringuri în formă text (ex: "['a','b','c']")
        if isinstance(column, str) and column.startswith('[') and column.endswith(']'):
            items = column[1:-1].split(',')
            text += "Business Tags: "
            for item in items:
                text += f"{item.strip().strip('\"').strip('\'')}, "  # eliminăm spații și ghilimele
            text = text[:-2] + '\n'  # scoate ultima virgulă, adaugă newline

        else:
            text += f"{data.columns[index].capitalize()}: {column}\n"
    text = soft_clean(text)
    print(text)


Passage: This is a company profile
Description: Jesús Guerra is a company that specializes in art, particularly paintings of Jesus Guerra. They offer a variety of art forms including oil, acrylic, watercolor, and acrylic on watercolor. In addition to art, they also provide illustration and design services. Customers can select a section of the "Art" tab to select a specific piece of artwork.
Passage: This is a company profile
Business Tags: Illustration and Design Services, Artistic Services, Watercolor Paintings, Artwork Creation, Acrylic Paintings and Sculptures
Passage: This is a company profile
Sector: Retail
Passage: This is a company profile
Category: Art Galleries
Passage: This is a company profile
Niche: Art Dealers
Passage: This is a company profile
Description: Renderiet is a Swedish company that specializes in creating sophisticated and detailed visualizations and animations with a focus on architecture. Their primary service is to create interior and exterior photorealistic

In [8]:
# la labels
random_indices = np.random.choice(labels.index, size=5, replace=False)
for idx in random_indices:
    label_text = ''
    for index, column in enumerate(labels.iloc[idx]):
        label_text += "Query: This is an insurance taxonomy describing a type of business. "
        label_text += "Label: " + column
    print(label_text)

Query: This is an insurance taxonomy describing a type of business. Label: Social Media Services
Query: This is an insurance taxonomy describing a type of business. Label: Arts Services
Query: This is an insurance taxonomy describing a type of business. Label: Commercial Drain Cleaning
Query: This is an insurance taxonomy describing a type of business. Label: High-Rise Signage Installation
Query: This is an insurance taxonomy describing a type of business. Label: Residential Snow Removal


In [9]:
model = SentenceTransformer("intfloat/e5-base-v2")

In [21]:
labels_dict = {}
for idx in range(labels.shape[0]):
    label_text = ''
    for index, column in enumerate(labels.iloc[idx]):
        label_text += "This is an insurance taxonomy describing a type of business:  "
        label_text += column
    label_text = soft_clean(label_text)
    print(label_text)
    embedding = model.encode(label_text)
    labels_dict[label_text] = embedding

This is an insurance taxonomy describing a type of business: Agricultural Equipment Services
This is an insurance taxonomy describing a type of business: Soil Nutrient Application Services
This is an insurance taxonomy describing a type of business: Pesticide Application Services
This is an insurance taxonomy describing a type of business: Ornamental Plant Nurseries
This is an insurance taxonomy describing a type of business: Landscaping Services
This is an insurance taxonomy describing a type of business: Gardening Services
This is an insurance taxonomy describing a type of business: Tree Services - Pruning / Removal
This is an insurance taxonomy describing a type of business: Veterinary Services
This is an insurance taxonomy describing a type of business: Veterinary Clinics
This is an insurance taxonomy describing a type of business: Pet Boarding Services
This is an insurance taxonomy describing a type of business: Animal Day Care Services
This is an insurance taxonomy describing a t

KeyboardInterrupt: 

In [11]:
labels_dict[list(labels_dict.keys())[0]].shape

(768,)

In [28]:
text = ''
text += "Passage: This is a company profile\n"
for index, column in enumerate(data.iloc[5000]):
    # dacă e listă de stringuri în formă text (ex: "['a','b','c']")
    if isinstance(column, str) and column.startswith('[') and column.endswith(']'):
        items = column[1:-1].split(',')
        text += "Business Tags: "
        for item in items:
            text += f"{item.strip().strip('\"').strip('\'')}, "  # eliminăm spații și ghilimele
        text = text[:-2] + '\n'  # scoate ultima virgulă, adaugă newline

    else:
        text += f"{data.columns[index].capitalize()}: {column}\n"

text = soft_clean(text)
print(text)
embedding_company = model.encode(text)
print(embedding_company.shape)
print(embedding_company)

Passage: This is a company profile
Description: Medex is a company that specializes in the production and distribution of various skin care products, including the Ultra Svelt patch, dermo patch, Celu-slim model, and Mascarillas. They also offer a range of hair repair products, such as the Montagne Jeunesse Mascarllas, as well as facial and body care products like facial tonics, hair repair tools, and educational games.
Business Tags: Dermo Patches Manufacturer, One Night of Color, Hair Straightener, Product Distribution, Educational Games, Personal Care Products, Label Pins, Facial Treatments
Sector: Manufacturing
Category: Medical Supply Manufacturers
Niche: Surgical and Medical Instrument Manufacturing
(768,)
[-3.74617963e-03 -3.23380567e-02 -2.65102014e-02 -2.04638764e-03
  7.76242986e-02 -2.00053789e-02  1.85155757e-02  4.95621637e-02
 -1.42809711e-02 -4.71715629e-02 -5.82880806e-03  7.57706389e-02
 -2.82986891e-02 -1.28238741e-02 -5.84135912e-02  3.66536267e-02
 -1.09948532e-03  

In [29]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# calculează similaritățile
sims = np.array([cosine_similarity([embedding_company], [emb])[0][0] for emb in labels_dict.values()])

# aplicăm softmax
def softmax(x, temperature=0.1):
    x = np.array(x)
    x = x / temperature  # scade temperatura => distribuție mai "sharp"
    exp_x = np.exp(x - np.max(x))  # stabil numeric
    return exp_x / np.sum(exp_x)

probs = softmax(sims, temperature=0.1)  # ajustează temperatura între 0.05-0.3

# sortare și afișare
prediction = list(labels_dict.keys())
sorted_idx = np.argsort(-probs)

for i in sorted_idx:
    print(f"Label: {prediction[i]}\nSimilarity: {sims[i]:.4f}\nProbability: {probs[i]:.4f}\n")


Label: This is an insurance taxonomy describing a type of business: Pet Grooming Services
Similarity: 0.7874
Probability: 0.0085

Label: This is an insurance taxonomy describing a type of business: Medical Gas Installation Services
Similarity: 0.7835
Probability: 0.0082

Label: This is an insurance taxonomy describing a type of business: Ink Production Services
Similarity: 0.7826
Probability: 0.0081

Label: This is an insurance taxonomy describing a type of business: Carpet Manufacturing Services
Similarity: 0.7802
Probability: 0.0079

Label: This is an insurance taxonomy describing a type of business: Bedding Manufacturing
Similarity: 0.7776
Probability: 0.0077

Label: This is an insurance taxonomy describing a type of business: Window Treatment Manufacturing
Similarity: 0.7770
Probability: 0.0076

Label: This is an insurance taxonomy describing a type of business: Veterinary Health Centers
Similarity: 0.7767
Probability: 0.0076

Label: This is an insurance taxonomy describing a type 