In [1]:
import pandas as pd
import torch

# Load the company records and the insurance taxonomy, then report the
# (rows, columns) shape of each frame.
data = pd.read_csv('data/ml_insurance_challenge.csv')
labels = pd.read_csv('data/insurance_taxonomy - insurance_taxonomy.csv')
for frame in (data, labels):
    print(frame.shape)

(9494, 5)
(220, 1)


In [2]:
# Taxonomy labels as a plain list; position i here corresponds to row i of
# `embeddings_labels` because the list is encoded in this order.
candidate_labels = labels['label'].tolist()

from sentence_transformers import SentenceTransformer, util

# Use the first GPU when one is available, otherwise fall back to the CPU so
# the notebook still runs (slowly) on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("intfloat/multilingual-e5-base", device=device)

# NOTE(review): the E5 model card recommends prefixing every input with
# "query: " or "passage: "; no prefix is added anywhere in this notebook.
# Confirm whether that is intentional before changing it, since it would
# alter every similarity score.
embeddings_labels = model.encode(candidate_labels, convert_to_tensor=True)

In [3]:
import re
from html import unescape


def soft_clean(s: str) -> str:
    """Lightly normalise free text while keeping its line structure.

    Steps, in order:
      1. decode HTML entities,
      2. replace http(s) URLs with ``<URL>`` and e-mail-like tokens with ``<EMAIL>``,
      3. squeeze runs of spaces/tabs into a single space,
      4. remove spaces that touch a newline,
      5. collapse three or more consecutive newlines into exactly two.

    Step 4 must run before step 5: otherwise "blank" lines that contain only
    spaces hide the newline run from the collapse and survive as 3+ newlines.

    Args:
        s: Text to clean. Any falsy value (``""``, ``None``) yields ``""``.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    if not s:
        return ""
    s = unescape(s)  # decode HTML entities
    s = re.sub(r'https?://\S+', '<URL>', s)
    s = re.sub(r'\S+@\S+', '<EMAIL>', s)
    # squeeze horizontal whitespace only; newlines are left untouched here
    s = re.sub(r'[ \t]+', ' ', s)
    # trim spaces at line ends/starts so whitespace-only lines become empty
    s = re.sub(r' *\n *', '\n', s)
    # now that empty lines are truly empty, cap blank-line runs at one blank line
    s = re.sub(r'\n{3,}', '\n\n', s)
    return s.strip()

# Build a plain-text profile for the first company (row 0), one line per column.
profile_lines = ["This is a detailed company profile"]
for column_name, column in zip(data.columns, data.iloc[0]):
    # A stringified list of strings (e.g. "['a','b','c']") is rendered as a
    # comma-separated tag line.
    if isinstance(column, str) and column.startswith('[') and column.endswith(']'):
        # Strip whitespace and surrounding quotes from each item. This is done
        # outside the f-string because backslash escapes inside f-string
        # expressions are a SyntaxError before Python 3.12.
        tags = [item.strip().strip('"').strip("'") for item in column[1:-1].split(',')]
        profile_lines.append("Business Tags: " + ", ".join(tags))
    else:
        profile_lines.append(f"{column_name.capitalize()}: {column}")

text = soft_clean("\n".join(profile_lines) + "\n")
sentence_to_classify = text
print(sentence_to_classify)

embeddings_company = model.encode(sentence_to_classify, convert_to_tensor=True)

# Cosine similarity between the company profile and every taxonomy label.
scores = util.cos_sim(embeddings_company, embeddings_labels)

# Keep the 10 best-scoring labels (the variable is called `top5` for
# historical reasons; it holds ten entries).
top5 = torch.topk(scores, k=10)

# Print each shortlisted label with its score and collect the labels.
top_labels = []
for score, idx in zip(top5.values[0], top5.indices[0]):
    print(f"Label: {candidate_labels[idx]}, Score: {score.item():.4f}")
    top_labels.append(candidate_labels[idx])

This is a detailed company profile
Description: Welchcivils is a civil engineering and construction company that specializes in designing and building utility network connections across the UK. They offer multi-utility solutions that combine electricity, gas, water, and fibre optic installation into a single contract. Their design engineer teams are capable of designing electricity, water and gas networks from existing network connection points to meter locations at the development, as well as project management of reinforcements and diversions. They provide custom connection solutions that take into account any existing assets, maximize the usage of every trench, and meet project deadlines. Welchcivils has considerable expertise installing gas and electricity connections in a variety of market categories, including residential, commercial, and industrial projects, as as well.
Business Tags: Construction Services, Multi-utilities, Utility Network Connections Design and Construction, Wa

In [4]:
from transformers import pipeline
# Zero-shot classifier that is later applied to the company profile text
# together with the shortlist of candidate labels.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

Device set to use cuda:0


In [5]:
# Score the profile against the shortlisted labels; multi_label=True scores
# each label independently.
classifier(sentence_to_classify, candidate_labels=top_labels, multi_label=True)

{'sequence': 'This is a detailed company profile\nDescription: Welchcivils is a civil engineering and construction company that specializes in designing and building utility network connections across the UK. They offer multi-utility solutions that combine electricity, gas, water, and fibre optic installation into a single contract. Their design engineer teams are capable of designing electricity, water and gas networks from existing network connection points to meter locations at the development, as well as project management of reinforcements and diversions. They provide custom connection solutions that take into account any existing assets, maximize the usage of every trench, and meet project deadlines. Welchcivils has considerable expertise installing gas and electricity connections in a variety of market categories, including residential, commercial, and industrial projects, as as well.\nBusiness Tags: Construction Services, Multi-utilities, Utility Network Connections Design and 

In [6]:
def classify_company_emb_zs(index, top_k=10, top_n=3):
    """Predict insurance taxonomy labels for one company.

    The row at position ``index`` of ``data`` is rendered as a text profile,
    embedded with ``model`` and compared (cosine similarity) against
    ``embeddings_labels``. The ``top_k`` most similar labels are then passed to
    the zero-shot ``classifier`` and the first ``top_n`` entries of its
    ``'labels'`` result are returned.

    Args:
        index: Positional row number, used with ``data.iloc``.
        top_k: Size of the embedding shortlist handed to the classifier.
            Clamped to the number of candidate labels.
        top_n: How many labels to return from the classifier result.

    Returns:
        A list with at most ``top_n`` label strings.
    """
    row = data.iloc[index]

    # One line per column; the previous implementation reused the name
    # `index` for the loop counter, shadowing the parameter.
    profile_lines = ["This is a detailed company profile"]
    for column_name, value in zip(data.columns, row):
        # A stringified list of strings (e.g. "['a','b','c']") becomes a tag line.
        if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
            # Quote stripping happens outside any f-string: backslash escapes
            # inside f-string expressions are a SyntaxError before Python 3.12.
            tags = [item.strip().strip('"').strip("'") for item in value[1:-1].split(',')]
            profile_lines.append("Business Tags: " + ", ".join(tags))
        else:
            profile_lines.append(f"{column_name.capitalize()}: {value}")

    sentence_to_classify = soft_clean("\n".join(profile_lines) + "\n")

    embeddings_company = model.encode(sentence_to_classify, convert_to_tensor=True)

    # Cosine similarity of this company against every label embedding.
    scores = util.cos_sim(embeddings_company, embeddings_labels)

    # torch.topk raises if k exceeds the number of available labels.
    k = min(top_k, len(candidate_labels))
    shortlist_ids = torch.topk(scores, k=k).indices[0].tolist()
    top_labels = [candidate_labels[label_id] for label_id in shortlist_ids]

    result = classifier(sentence_to_classify, top_labels, multi_label=True)

    return result['labels'][:top_n]

In [7]:
# Smoke test: classify the first company in the dataset.
classify_company_emb_zs(index=0)

['Gas Installation Services',
 'Commercial Construction Services',
 'Commercial Electrical Services']

In [45]:
# gt = pd.read_csv('data/handmade_gt_10.csv')
# indexes  = gt['index'].to_list()
# predictions = pd.DataFrame(columns=["index", "predicted"])
# for index in indexes:
#     print(f"Index: {index}")
#     prediction = classify_company_emb_zs(index)
#     print(f"Predicted: {prediction}")
#
#     predictions = pd.concat([predictions, pd.DataFrame({"index": [index], "predicted": [prediction] })], ignore_index=True)
#
#     print(f"Ground truth: {gt[gt['index']==index]['labels'].values[0]}")
#     print("--------------------------------------------------")

Index: 4335
Predicted: ['Residential Roofing Services', 'Roofing Services with Heat Application', 'Cable Installation Services']
Ground truth: ['Residential Roofing Services', 'Roofing Services with Heat Application', 'Waterproofing Services']
--------------------------------------------------
Index: 7910
Predicted: ['Accessory Manufacturing', 'Bedding Manufacturing', 'Textile Manufacturing Services']
Ground truth: ['Accessory Manufacturing', 'Apparel Manufacturing', 'Textile Manufacturing Services']
--------------------------------------------------
Index: 7758
Predicted: ['Real Estate Services', 'Testing and Inspection Services', 'HVAC Inspections']
Ground truth: ['Testing and Inspection Services', 'Building Cleaning Services', 'Real Estate Services']
--------------------------------------------------
Index: 3050
Predicted: ['Laboratory Services', 'Health Promotion Services', 'Occupational Health Services']
Ground truth: ['Laboratory Services', 'Health Promotion Services', 'Occupatio

In [47]:
# predictions.to_csv('data/predictions_emb_zs.csv', index=False)

In [8]:
# Work on a copy of the raw data with an empty column that will receive the
# predicted labels; `data` itself is left unchanged.
data_copy = data.assign(insurance_label='')
data_copy

Unnamed: 0,description,business_tags,sector,category,niche,insurance_label
0,Welchcivils is a civil engineering and constru...,"['Construction Services', 'Multi-utilities', '...",Services,Civil Engineering Services,Other Heavy and Civil Engineering Construction,
1,"Kyoto Vegetable Specialists Uekamo, also known...","['Wholesale', 'Dual-task Movement Products', '...",Manufacturing,Fruit & Vegetable - Markets & Stores,"Frozen Fruit, Juice, and Vegetable Manufacturing",
2,Loidholdhof Integrative Hofgemeinschaft is a c...,"['Living Forms', 'Farm Cafe', 'Fresh Coffee', ...",Manufacturing,Farms & Agriculture Production,All Other Miscellaneous Crop Farming,
3,PATAGONIA Chapa Y Pintura is an auto body shop...,"['Automotive Body Repair Services', 'Interior ...",Services,Auto Body Shops,"Automotive Body, Paint, and Interior Repair an...",
4,Stanica WODNA PTTK Swornegacie is a cultural e...,"['Cultural Activities', 'Accommodation Service...",Services,Boat Tours & Cruises,"Scenic and Sightseeing Transportation, Water",
...,...,...,...,...,...,...
9489,"Anhui Zhongxin Electric Co., Ltd. is a high-te...","['Automation Equipment', 'Technical Consulting...",Manufacturing,Electric Supplies & Power Generation,All Other Miscellaneous Electrical Equipment a...,
9490,"TP Material Co.,Ltd. is a company based in the...","['Construction Materials Supplier', 'Construct...",Services,Construction Services,Commercial and Institutional Building Construc...,
9491,Aladiner Cherag is a company that offers a var...,"['Fruit And Vegetables', 'Hand Wash Products',...",Manufacturing,Dairy Products - Farms & Stores,Fluid Milk Manufacturing,
9492,Candor Eeg is a medical care company located i...,"['Stress Tests', 'Hyperventilation', 'Holter M...",Services,Radiology Clinic,Diagnostic Imaging Centers,


In [9]:
# Sanity check: same number of rows as `data`, plus the added
# `insurance_label` column.
data_copy.shape

(9494, 6)

In [19]:
import ast

# Label every company one at a time and checkpoint the result periodically.
for i in range(data.shape[0]):
    # classify_company_emb_zs already returns a list of label strings, so it
    # can be joined directly (no str()/literal_eval round-trip needed).
    prediction = classify_company_emb_zs(i)
    text = ', '.join(prediction)
    data_copy.loc[i, "insurance_label"] = text
    if i % 100 == 0:
        print(f"Processed {i} records")
        # save progress so an interrupted run does not lose everything
        data_copy.to_csv('data/challenge_result.csv', index=False)

# Final save: the checkpoint above only fires on multiples of 100, so without
# this the rows after the last checkpoint would never reach the CSV.
data_copy.to_csv('data/challenge_result.csv', index=False)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed 0 records
Processed 100 records
Processed 200 records


KeyboardInterrupt: 

In [22]:
def classify_companies_batch(indices, batch_size=8):
    """Predict insurance taxonomy labels for several companies at once.

    Each row of ``data`` selected by ``indices`` is rendered as a text profile.
    All profiles are embedded in one ``model.encode`` call, compared (cosine
    similarity) against ``embeddings_labels``, and the 10 most similar labels
    per company (fewer if the taxonomy is smaller) are passed to the zero-shot
    ``classifier``. The first three entries of each ``'labels'`` result are kept.

    Args:
        indices: Iterable of positional row numbers (used with ``data.iloc``).
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        A list, aligned with ``indices``, of lists of up to three label strings.
        An empty ``indices`` yields an empty list.
    """

    def _profile_text(row):
        # Render one row as "Column: value" lines, with tag lists flattened.
        lines = ["This is a detailed company profile"]
        for column_name, value in zip(data.columns, row):
            if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
                # Quote stripping happens outside any f-string: backslash escapes
                # inside f-string expressions are a SyntaxError before Python 3.12.
                tags = [item.strip().strip('"').strip("'") for item in value[1:-1].split(',')]
                lines.append("Business Tags: " + ", ".join(tags))
            else:
                lines.append(f"{column_name.capitalize()}: {value}")
        return soft_clean("\n".join(lines) + "\n")

    indices = list(indices)
    if not indices:
        # Nothing to encode or classify.
        return []

    texts = [_profile_text(data.iloc[idx]) for idx in indices]

    # Embed all profiles in one batched call.
    embeddings_companies = model.encode(texts, convert_to_tensor=True, batch_size=batch_size)

    # One similarity matrix for the whole batch instead of one call per row;
    # util.cos_sim returns a (len(texts), len(labels)) matrix.
    scores = util.cos_sim(embeddings_companies, embeddings_labels)

    # torch.topk raises if k exceeds the number of available labels.
    k = min(10, len(candidate_labels))
    shortlist_ids = torch.topk(scores, k=k, dim=1).indices.tolist()

    # The zero-shot step stays per company because every text has its own
    # set of candidate labels.
    results = []
    for text, label_ids in zip(texts, shortlist_ids):
        top_labels = [candidate_labels[label_id] for label_id in label_ids]
        classification = classifier(text, top_labels, multi_label=True)
        results.append(classification['labels'][:3])

    return results


In [31]:
# Label the whole dataset in chunks of `batch_size` rows, checkpointing to CSV
# whenever the chunk start is a multiple of 512.
batch_size = 16
total_rows = data.shape[0]
for start_idx in range(0, total_rows, batch_size):
    batch_indices = list(range(start_idx, min(start_idx + batch_size, total_rows)))

    predictions = classify_companies_batch(batch_indices, batch_size=batch_size)

    # Store each prediction as a single comma-separated string.
    data_copy.loc[batch_indices, "insurance_label"] = [', '.join(pred) for pred in predictions]

    if start_idx % 512 == 0:
        print(f"Processed {start_idx} records")
        data_copy.to_csv('data/challenge_result_par.csv', index=False)

Processed 0 records
Processed 512 records
Processed 1024 records
Processed 1536 records
Processed 2048 records
Processed 2560 records
Processed 3072 records
Processed 3584 records
Processed 4096 records
Processed 4608 records
Processed 5120 records
Processed 5632 records
Processed 6144 records
Processed 6656 records
Processed 7168 records
Processed 7680 records
Processed 8192 records
Processed 8704 records
Processed 9216 records


In [32]:
# Write the labelled frame to disk without the index column.
data_copy.to_csv(path_or_buf='data/challenge_result_par.csv', index=False)


In [37]:
# Inspect row 4301 of data_copy.
#
# Full description of that company, kept here for reference. It is written as
# a comment rather than a trailing string literal so that the Series below
# stays the cell's last expression and is what gets displayed:
#
# Wuhan Huaxi Airsprings Co., Ltd. is a Chinese company that specializes in the sale of metal products, including colored metal alloy, new metal functional materials, high-performance metal and alloy materials, metal tools, and hardware products. The company also engages in wholesale and retailing of hardware products, with a focus on non-permanent and limited-liability projects that comply with self-employment laws. Huaxin Airsprays Co., Limited, the parent company, has been committed to continuously improving its product services and promoting innovation in the industry since its establishment, and has attracted the attention and cooperation of many international well-known suppliers. The main products offered by the company include metal tools and hardware tools.

data_copy.iloc[4301]

description        Wuhan Huaxi Airsprings Co., Ltd. is a Chinese ...
business_tags      ['Metal Tooling Sales Services Services', 'Sal...
sector                                                 Manufacturing
category                                                Scrap Metals
niche              Nonferrous Metal (except Copper and Aluminum) ...
insurance_label    Non-Structural Steel Fabrication, Sheet Metal ...
Name: 4301, dtype: object