## Recommend exhibitors by answers

Import libraries

In [1]:
import sys
import os

# Make the parent of the working directory importable so the `utils` and
# `models` packages used below can be resolved.
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
import torch
import pandas as pd
from utils.visitors import (
    load_visitors_answers,
    preprocess_visitor_answers,
    merge_answers_questions
)
from utils.exhibitors import (
    load_exhibitor_categories,
    preprocess_exhibitor_categories,
    load_exhibitors
)
from utils.embedding_model import EmbeddingModel
from sentence_transformers.util import cos_sim
from models.recommend_exhibitors_by_answers import recommend_exhibitors_for_answer

  from .autonotebook import tqdm as notebook_tqdm


Load the required input files

In [3]:
# Load the raw visitor answers and the exhibitor category table.
# No paths are passed here, so the data sources are presumably fixed inside
# the loader helpers (utils.visitors / utils.exhibitors) — verify there.
visitors_answers_df = load_visitors_answers()
exhibitor_categories_df = load_exhibitor_categories()

Preprocess visitor answers for generating embeddings

In [4]:
# Clean the visitor answers; the helper's output carries an `answer_cleaned`
# column next to the original `answer` text.
preprocess_visitor_answers_df = preprocess_visitor_answers(visitors_answers_df)
# Only the last expression of a cell is rendered, so a preceding bare
# `.head()` would be silently discarded; show one preview slice instead.
preprocess_visitor_answers_df.iloc[16:20]

Unnamed: 0,id,answer,questionId,answer_cleaned
16,611506abc0d46618f87aadef,Sightseeing / Museum / Excursions & Activities,5c8a78336d41a10da4f73253,sightseeing or museum or excursions and activi...
17,5c8a78336d41a10da4f732d6,Other,5c8a78336d41a10da4f73253,other
18,5c8a78336d41a10da4f732be,Educational institution,5c8a78336d41a10da4f73253,educational institution
19,611506a1c0d46618f87aadee,Insurance,5c8a78336d41a10da4f73253,insurance


Preprocess exhibitor categories for generating embeddings

In [5]:
# Clean the category names; the helper's output carries a
# `categoryName_cleaned` column next to `categoryId` / `categoryName`.
normalize_exhibitor_categories_df = preprocess_exhibitor_categories(exhibitor_categories_df)
# Preview the first rows to sanity-check the cleaning
normalize_exhibitor_categories_df.head()

Unnamed: 0,categoryId,categoryName,categoryName_cleaned
0,52271,1. Accomodation providers,accomodation providers
1,52272,1.1 Hotel / Hotel chain / Inn,hotel or hotel chain or inn
2,52273,1.2 Apartments / Residential hotel,apartments or residential hotel
3,52274,1.3 Hostel / Motel,hostel or motel
4,52275,1.4 Boarding house,boarding house


Convert the cleaned answers and category names to lists for embedding

In [6]:
# Deduplicate on the cleaned text *before* extracting ids and texts, so that
# answer_ids[i] and answer_texts[i] always come from the same row.
# Taking ids from the full frame and texts from a de-duplicated Series makes the
# two lists drift apart as soon as any cleaned answer repeats, and every later
# match would then be attributed to the wrong answer id.
# When several answers share the same cleaned text, the first row's id is kept.
unique_answers_df = preprocess_visitor_answers_df.drop_duplicates(subset='answer_cleaned')
answer_ids = unique_answers_df['id'].tolist()
answer_texts = unique_answers_df['answer_cleaned'].tolist()
# Category names are embedded in frame order; positions in this list are later
# used as row positions when looking up the matched category.
category_names = normalize_exhibitor_categories_df['categoryName_cleaned'].tolist()

Initialize embedding_model and generate embeddings

In [7]:
# One shared model instance embeds both text collections, answers first and
# categories second.
embedding_model = EmbeddingModel()
answer_embeddings, category_embeddings = [
    embedding_model.embedList(texts) for texts in (answer_texts, category_names)
]


Compute the cosine similarity between the answer embeddings and the category embeddings to measure their semantic similarity; the top 3 most similar categories per answer are retained

In [8]:
# Pairwise cosine similarity between answers and categories; it is indexed
# below as ans_sim[i][category_position] for the i-th answer.
ans_sim = cos_sim(answer_embeddings, category_embeddings)

  a = torch.tensor(a)


Store the top 3 category matches per answer (ranked by similarity score) in a CSV file under `results/`

In [9]:
# Get top 3 matches with both IDs
top_k = 3
# torch.topk raises when k exceeds the number of candidates, so clamp k to the
# number of category columns in the similarity matrix.
effective_k = min(top_k, ans_sim.shape[1])

answer_category_matches = []
for i in range(len(answer_texts)):
    # topk returns the scores and their column positions together, already
    # sorted from most to least similar.
    top_scores, top_indices = torch.topk(ans_sim[i], effective_k)
    ranked = zip(top_scores.tolist(), top_indices.tolist())
    for rank, (score, cat_idx) in enumerate(ranked, start=1):
        # Look the category up in the same frame whose rows produced
        # `category_names` (and therefore the embedding column order), so the
        # position always refers to the category that was actually embedded.
        category_row = normalize_exhibitor_categories_df.iloc[cat_idx]
        answer_category_matches.append({
            "answer_id": answer_ids[i],
            "answer_text": answer_texts[i],
            "matched_category": category_row["categoryName"],
            "category_id": str(category_row["categoryId"]),
            "similarity_score": round(score, 4),
            "rank": rank
        })

# Create DataFrame and save for later use
answer_category_matches_df = pd.DataFrame(answer_category_matches)
matches_csv_path = "../results/top_3_category_matches_per_answer.csv"
# Make sure the output directory exists so a fresh checkout can run end to end
os.makedirs(os.path.dirname(matches_csv_path), exist_ok=True)
answer_category_matches_df.to_csv(matches_csv_path, index=False)

Map exhibitors to the answers based on the answer_category_matches

In [11]:
# Load the exhibitor table and re-read the answer -> category matches that were
# written to disk by the previous step.
exhibitors_df = load_exhibitors()
top_3_category_per_answer = pd.read_csv("../results/top_3_category_matches_per_answer.csv")

Create mappings to match exhibitors by answers

In [12]:
# Category ids are compared as strings below (they are split out of a
# pipe-separated text column), so normalise every id column to str first.
top_3_category_per_answer["category_id"] = top_3_category_per_answer["category_id"].astype(str)
exhibitors_df["MainCategories"] = exhibitors_df["MainCategories"].astype(str)
exhibitor_categories_df["categoryId"] = exhibitor_categories_df["categoryId"].astype(str)

# id -> display text lookups used when assembling the result rows
answer_text_map = visitors_answers_df.set_index("id")["answer"].to_dict()
category_name_map = exhibitor_categories_df.set_index("categoryId")["categoryName"].to_dict()
exhibitor_name_map = exhibitors_df.set_index("exhibitorid")["Name"].to_dict()

# "MainCategories" is a pipe-separated list of category ids (already str, see above)
exhibitors_df["category_list"] = exhibitors_df["MainCategories"].apply(lambda x: x.split("|"))
exhibitor_cat_map  = exhibitors_df.set_index("exhibitorid")["category_list"].to_dict()

# answer_id -> {category_id: similarity_score}
# Built from the frame loaded from the CSV (the one whose ids were normalised
# above), so this cell does not depend on the in-memory result of the embedding
# step. The value columns are selected explicitly so `apply` does not operate on
# the grouping column, which pandas has deprecated.
answer_cat_score_map = (
    top_3_category_per_answer.groupby("answer_id")[["category_id", "similarity_score"]]
    .apply(lambda df: dict(zip(df["category_id"], df["similarity_score"])))
    .to_dict()
)

  .apply(lambda df: dict(zip(df["category_id"], df["similarity_score"])))


Map each exhibitor to the answers based on answer_category_matches (the semantic matches generated earlier). Store the results in a CSV file for the next tasks

In [13]:
# Column order of the output file; passing it explicitly keeps the CSV header
# stable even when no answer/exhibitor pair matches.
mapping_columns = [
    "answer_id",
    "answer_text",
    "exhibitor_id",
    "exhibitor_name",
    "matched_category_ids",
    "matched_category_names",
    "similarity_score_sum",
    "matched_category_count",
]

# Build each exhibitor's category set once instead of once per answer
exhibitor_cat_sets = {
    exhibitor_id: set(exhibitor_cats)
    for exhibitor_id, exhibitor_cats in exhibitor_cat_map.items()
}

rows = []
for answer_id, category_scores in answer_cat_score_map.items():
    answer_cats = set(category_scores.keys())
    answer_text = answer_text_map.get(answer_id, "")

    for exhibitor_id, exhibitor_cats in exhibitor_cat_sets.items():
        # Sort the intersection: set iteration order is not stable between
        # runs, which would otherwise shuffle the joined id/name columns.
        matched = sorted(answer_cats & exhibitor_cats)
        if matched:
            total_score = sum(float(category_scores[cat]) for cat in matched)
            matched_names = [category_name_map.get(cat, "") for cat in matched]
            rows.append({
                "answer_id": answer_id,
                "answer_text": answer_text,
                "exhibitor_id": exhibitor_id,
                "exhibitor_name": exhibitor_name_map.get(exhibitor_id, ""),
                "matched_category_ids": "|".join(matched),
                "matched_category_names": "|".join(matched_names),
                "similarity_score_sum": round(total_score, 4),
                "matched_category_count": len(matched)
            })

# Save result
answer_to_exhibitor_df = pd.DataFrame(rows, columns=mapping_columns)
mapping_csv_path = "../results/answer_to_exhibitor_mapping.csv"
# Make sure the output directory exists so a fresh checkout can run end to end
os.makedirs(os.path.dirname(mapping_csv_path), exist_ok=True)
answer_to_exhibitor_df.to_csv(mapping_csv_path, index=False)


Recommend exhibitors based on answers (this uses csv generated from the above steps)

In [15]:
# Example query: rank exhibitors for a single visitor answer text.
# NOTE(review): per the notebook text this helper reads the CSV files produced
# by the earlier cells, so those cells must have been run first — confirm in
# models.recommend_exhibitors_by_answers.
recommend_exhibitors_for_answer("To source products and services")

Unnamed: 0,exhibitor_id,exhibitor_name,similarity_score_sum,matched_category_count,matched_category_names,category_count,penalty_factor,final_score
0,32278,Exotic Holidays Holidays,0.6877,1,6.3 Services & facilities,3,0.94,0.646438
3,92462,Russian Travel Company,0.6877,1,6.3 Services & facilities,6,0.88,0.605176
1,48028,Prime Adventures Journeys,0.7179,1,9.2 Manufacturer / distributor of consumables,11,0.78,0.559962
2,68142,Global Tours Voyages,0.6953,1,11.3 Other services in the sphere of business ...,10,0.8,0.55624
