In [1]:
import pandas as pd
import torch

In [2]:
# Load the challenge dataset and the taxonomy labels from the local data folder.
data = pd.read_csv('data/ml_insurance_challenge.csv')
labels = pd.read_csv('data/insurance_taxonomy - insurance_taxonomy.csv')
# Show both (rows, columns) shapes, one per line, as a quick sanity check.
print(data.shape, labels.shape, sep='\n')

(9494, 5)
(220, 1)


In [3]:
import re
from html import unescape

def soft_clean(s: str) -> str:
    """Lightly normalise free text while keeping its line structure.

    Steps, in order: decode HTML entities, replace URLs with ``<URL>`` and
    e-mail-like tokens with ``<EMAIL>``, squeeze runs of spaces/tabs into a
    single space, strip spaces around line breaks, and collapse three or more
    consecutive newlines into exactly two.

    Args:
        s: Text to clean. Any falsy value (``""``, ``None``) yields ``""``.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    if not s:
        return ""
    s = unescape(s)  # decode HTML entities
    s = re.sub(r'https?://\S+', '<URL>', s)
    s = re.sub(r'\S+@\S+', '<EMAIL>', s)
    # Squeeze repeated spaces/tabs only; newlines are left untouched here.
    s = re.sub(r'[ \t]+', ' ', s)
    # Strip spaces at both ends of every line. This has to happen before the
    # blank-line collapse below: otherwise whitespace-only lines ("\n \n \n")
    # hide the newline run from the `\n{3,}` pattern and survive as 3+ newlines.
    s = re.sub(r' *\n *', '\n', s)
    # Normalise blank-line runs: more than two newlines become exactly two.
    s = re.sub(r'\n{3,}', '\n\n', s)
    return s.strip()

In [69]:
# List every taxonomy label, one per line.
for taxonomy_label in labels['label']:
    print(taxonomy_label)

Agricultural Equipment Services
Soil Nutrient Application Services
Pesticide Application Services
Ornamental Plant Nurseries
Landscaping Services
Gardening Services
Tree Services - Pruning / Removal
Veterinary Services
Veterinary Clinics
Pet Boarding Services
Animal Day Care Services
Pet Grooming Services
Animal Training Services
Veterinary Health Centers
Animal Trainers
Livestock Dealer Services
Timber Harvesting Operations
Fishing and Hunting Services
Well Maintenance Services
Field Welding Services
Sand and Gravel Mining
Residential Driveway Construction
Commercial Driveway Construction
Fencing Construction Services
Sidewalk Construction Services
Commercial Irrigation Systems
Residential Drainage Systems
Residential Snow Removal
Commercial Snow Removal
General Snow Removal Services
Land Leveling Services
Residential Drain Cleaning
Commercial Drain Cleaning
Street Cleaning Operations
Conveyor System Installation
Low-Rise Signage Installation
High-Rise Signage Installation
Tank Instal

In [71]:
# Build a plain-text profile for one row of `data`; this is the text that gets
# embedded and compared against the taxonomy labels.
profile_lines = ["This is a detailed company profile"]
for index, column in enumerate(data.iloc[5]):
    # A value that looks like a stringified list (e.g. "['a','b','c']") is
    # rendered as a comma-separated "Business Tags" line.
    if isinstance(column, str) and column.startswith('[') and column.endswith(']'):
        # Strip surrounding whitespace and quote characters from each entry.
        # Done outside an f-string on purpose: backslash escapes inside an
        # f-string replacement field are a SyntaxError before Python 3.12.
        tags = [item.strip().strip('"').strip("'") for item in column[1:-1].split(',')]
        profile_lines.append("Business Tags: " + ", ".join(tags))
    else:
        # Every other value becomes a "<Column name>: <value>" line.
        profile_lines.append(f"{data.columns[index].capitalize()}: {column}")

text = soft_clean("\n".join(profile_lines) + "\n")
sentence_to_classify = text
print(sentence_to_classify)
candidate_labels = labels['label'].tolist()
# Experiment idea (currently disabled): prefix every label with some context, e.g.
# candidate_labels = [f"Type of business: {label}" for label in candidate_labels]

This is a detailed company profile
Description: BIQ Benefícios is a Brazilian company that specializes in generating benefits for both individuals and businesses. With a structure of three large companies in the food sector and 20 years of experience, BIQ Beneficios has extensive knowledge in this field. The company aims to revolutionize the market through quality services for its customers, accredited employees, and collaborators. BIQ Benefits is duly accredited by the PAT (Worker Food Program) and offers a wide network of accredited establishments that provides employees with the freedom to choose healthy food options. The BIQ Duo card is perfect for those who travel frequently. The app allows users to perform balance queries and statements of BIQ cards, as well as see the average daily spending so that their balance lasts until the end of the month.
Business Tags: Healthy Food Options, Accredited Establishments, Daily Spending Tracking Service, Benefits and Payment Solutions, Balanc

In [72]:
from sentence_transformers import SentenceTransformer, util
# Load the embedding model on the first GPU when CUDA is available and fall
# back to the CPU otherwise, so the notebook also runs on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("intfloat/multilingual-e5-base", device=device)
# Embed every taxonomy label once; the scoring cells reuse this tensor.
# NOTE(review): the E5 model card asks for a "query: " / "passage: " prefix on
# every input text — confirm whether to add it here and to the company text.
embeddings_labels = model.encode(candidate_labels, convert_to_tensor=True)
candidate_labels

['Agricultural Equipment Services',
 'Soil Nutrient Application Services',
 'Pesticide Application Services',
 'Ornamental Plant Nurseries',
 'Landscaping Services',
 'Gardening Services',
 'Tree Services - Pruning / Removal',
 'Veterinary Services',
 'Veterinary Clinics',
 'Pet Boarding Services',
 'Animal Day Care Services',
 'Pet Grooming Services',
 'Animal Training Services',
 'Veterinary Health Centers',
 'Animal Trainers',
 'Livestock Dealer Services',
 'Timber Harvesting Operations',
 'Fishing and Hunting Services',
 'Well Maintenance Services',
 'Field Welding Services',
 'Sand and Gravel Mining',
 'Residential Driveway Construction',
 'Commercial Driveway Construction',
 'Fencing Construction Services',
 'Sidewalk Construction Services',
 'Commercial Irrigation Systems',
 'Residential Drainage Systems',
 'Residential Snow Removal',
 'Commercial Snow Removal',
 'General Snow Removal Services',
 'Land Leveling Services',
 'Residential Drain Cleaning',
 'Commercial Drain Cleanin

In [73]:
# Embed the company profile text.
embeddings_company = model.encode(sentence_to_classify, convert_to_tensor=True)

# Cosine similarity of the profile against every label embedding.
scores = util.cos_sim(embeddings_company, embeddings_labels)

# Keep the five highest-scoring labels.
top5 = torch.topk(scores, k=5)
# Print each selected label together with its similarity score.
best_indices = top5.indices[0].tolist()
best_scores = top5.values[0].tolist()
for label_idx, similarity in zip(best_indices, best_scores):
    print(f"Label: {candidate_labels[label_idx]}, Score: {similarity:.4f}")

Label: Food Processing Services, Score: 0.8199
Label: Business Development Services, Score: 0.8197
Label: Data Analysis Services, Score: 0.8126
Label: Financial Services, Score: 0.8113
Label: Grain Processing Services, Score: 0.8103


In [45]:
# NOTE(review): this cell repeats the scoring cell directly above it; its
# execution count (45) and differing stored output suggest it comes from an
# earlier run — confirm and remove the duplicate.
# Embed the company profile text.
embeddings_company = model.encode(sentence_to_classify, convert_to_tensor=True)

# Cosine similarity between the profile and each label embedding.
scores = util.cos_sim(embeddings_company, embeddings_labels)

# Select the five best matches and print them as "Label: ..., Score: ...".
top5 = torch.topk(scores, k=5)
for rank in range(top5.indices.shape[1]):
    label_name = candidate_labels[top5.indices[0, rank]]
    print(f"Label: {label_name}, Score: {top5.values[0, rank].item():.4f}")

Label: Technology Consulting, Score: 0.3824
Label: Media Production Services, Score: 0.3469
Label: Welding Services, Score: 0.3168
Label: Field Welding Services, Score: 0.3106
Label: Corporate Training Services, Score: 0.2967
