In [None]:
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import pandas as pd
import itertools
import numpy as np

# Load the natural-language queries, one per non-blank line.
# "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if present;
# str.strip() does not remove U+FEFF, so with a plain decode the BOM would
# stay glued to the first query and leak into its text and embedding.
with open("queries.txt", "r", encoding="utf-8-sig") as f:
    queries = [line.strip() for line in f if line.strip()]

# One row per query; the positional index doubles as the query id.
query_df = pd.DataFrame({"query": queries})
query_df["query_id"] = query_df.index

restaurant_df = pd.read_csv("resturant_table.csv")

# Cartesian product: every query is paired with every restaurant row.
query_restaurant_df = query_df.merge(restaurant_df, how="cross")

# Sentence-embedding model used to vectorise queries and restaurant text.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def text_to_vec(text, model=embedder):
    """Encode ``text`` with ``model`` and return the NumPy result.

    Args:
        text: Input handed unchanged to ``model.encode``.
        model: Object exposing ``encode``. Defaults to the module-level
            ``embedder``, which is bound once when this function is defined.

    Returns:
        Whatever ``model.encode(text, convert_to_numpy=True)`` returns.
    """
    vector = model.encode(text, convert_to_numpy=True)
    return vector

# Re-read the queries and rebuild the query x restaurant table.
# The context manager closes the file as soon as the lines are consumed, and
# "utf-8-sig" drops a leading byte-order mark (which str.strip() would keep)
# so it cannot end up inside the first query string.
with open("queries.txt", encoding="utf-8-sig") as query_file:
    queries = [line.strip() for line in query_file if line.strip()]
query_df = pd.DataFrame({"query": queries})
query_df["query_id"] = query_df.index

# Cartesian product: every query is paired with every restaurant row.
query_restaurant_df = query_df.merge(restaurant_df, how="cross")

# Encode each distinct query exactly once, then broadcast the vectors back
# onto every row that carries that query.
unique_queries = query_restaurant_df["query"].unique()
query_vec_map = dict(zip(unique_queries, map(text_to_vec, unique_queries)))
query_restaurant_df["query_vector"] = query_restaurant_df["query"].map(query_vec_map)


def _restaurant_blurb(row):
    """Join a restaurant's name, categories and city into one text snippet."""
    return f"{row['name']} {row['categories_title']} {row['city']}"


query_restaurant_df["restaurant_text"] = query_restaurant_df.apply(
    _restaurant_blurb, axis=1
)

# Same de-duplication for the restaurant side: one encode call per distinct
# text, however many queries it is paired with.
unique_texts = query_restaurant_df["restaurant_text"].unique()
rest_vec_map = dict(zip(unique_texts, map(text_to_vec, unique_texts)))
query_restaurant_df["restaurant_vector"] = query_restaurant_df["restaurant_text"].map(rest_vec_map)

def _query_restaurant_similarity(row):
    """Return 1 minus the cosine distance between the row's two vectors."""
    distance = cosine(row["query_vector"], row["restaurant_vector"])
    return 1 - distance


query_restaurant_df["similarity"] = query_restaurant_df.apply(
    _query_restaurant_similarity, axis=1
)

def _city_match(row):
    """Return 1 if the restaurant's city name occurs in the query, else 0.

    A missing city (read by pandas as a non-string such as NaN) yields 0
    instead of raising on ``.lower()``.
    """
    city = row["city"]
    if not isinstance(city, str):
        return 0
    return int(city.lower() in row["query"].lower())


def _open_overnight_score(row):
    """Return 1 if the query mentions "overnight" and the restaurant has it.

    The flag is tested by containment rather than equality so that both a
    scalar value ("true" / True) and a list-like string such as
    "[false, true, ...]" are recognised.
    NOTE(review): for list-like values this means "overnight on at least one
    entry" - confirm that is the intended semantics.
    """
    wants_overnight = "overnight" in row["query"].lower()
    offers_overnight = "true" in str(row["is_overnight"]).lower()
    return int(wants_overnight and offers_overnight)


def _keyword_hits(query, field):
    """Count the whitespace-separated query words that occur in ``field``.

    Matching is a case-insensitive substring test against ``str(field)``.
    NOTE(review): because it is a substring test, very short words (e.g. "a")
    count whenever those letters appear anywhere in the field text.
    """
    haystack = str(field).lower()  # lowercase once, not once per word
    return sum(1 for word in query.lower().split() if word in haystack)


query_restaurant_df["city_match"] = query_restaurant_df.apply(_city_match, axis=1)

query_restaurant_df["open_overnight_score"] = query_restaurant_df.apply(
    _open_overnight_score, axis=1
)

query_restaurant_df["category_match_score"] = query_restaurant_df.apply(
    lambda row: _keyword_hits(row["query"], row["categories_title"]), axis=1
)

# 1 when the price value lies in [1, 3]; anything else (including NaN) is 0.
query_restaurant_df["price_score"] = query_restaurant_df["price"].apply(
    lambda p: 1 if 1 <= p <= 3 else 0
)

query_restaurant_df["transaction_match_score"] = query_restaurant_df.apply(
    lambda row: _keyword_hits(row["query"], row["transactions"]), axis=1
)

def compute_weak_label(row):
    """Turn one row's heuristic features into a weak relevance label.

    Points are accumulated from the embedding similarity, the match features
    and the restaurant's quality signals, then bucketed into a label.

    Args:
        row: Mapping-like object (e.g. a pandas row or a dict) that provides
            ``similarity``, ``rating``, ``review_count`` and ``price_score``
            via ``[]`` and, optionally, ``city_match``,
            ``category_match_score``, ``transaction_match_score`` and
            ``open_overnight_score`` via ``.get`` (missing ones count as 0).

    Returns:
        2 if the total score is at least 8, 1 if it is at least 5, else 0.
    """
    score = 0

    # Semantic similarity: only the highest matching tier contributes.
    similarity = row["similarity"]
    if similarity > 0.8:
        score += 3
    elif similarity > 0.7:
        score += 2
    elif similarity > 0.6:
        score += 1

    if row.get("city_match", 0) == 1:
        score += 5
    if row.get("category_match_score", 0) >= 1:
        score += 4
    # NOTE(review): this requires exactly one matching word; two or more
    # matches earn nothing, unlike the category check above - confirm intent.
    if row.get("transaction_match_score", 0) == 1:
        score += 1
    if row.get("open_overnight_score", 0) == 1:
        score += 1

    if row["rating"] >= 4.0:
        score += 1
    if row["review_count"] >= 300:
        score += 1
    # Chained comparison, not `x < 4 & x > 0`: `&` binds tighter than the
    # comparisons, so that form evaluates `x < (4 & x) > 0` instead.
    if 0 < row["price_score"] < 4:
        score += 1

    if score >= 8:
        return 2
    if score >= 5:
        return 1
    return 0


# Attach the heuristic weak label to every query/restaurant pair.
query_restaurant_df["weak_label"] = query_restaurant_df.apply(
    compute_weak_label, axis=1
)

# Columns written to disk, in output order: identifiers and features first,
# then the label, then the raw restaurant attributes.
_export_columns = [
    "query",
    "query_id",
    "similarity",
    "name",
    "id",
    "city_match",
    "category_match_score",
    "transaction_match_score",
    "open_overnight_score",
    "price_score",
    "weak_label",
    "rating",
    "review_count",
    "price",
    "city",
    "categories_title",
    "transactions",
    'is_overnight',
]
_export_frame = query_restaurant_df[_export_columns]
_export_frame.to_csv("weak_labeled_data(1).csv", index=False)


Unnamed: 0,query,query_id,id,name,rating,review_count,alias,is_overnight,day_of_the_week,end_hour,...,query_vector,restaurant_text,restaurant_vector,similarity,city_match,open_overnight_score,category_match_score,price_score,transaction_match_score,weak_label
0,﻿Find me a highly-rated Italian restaurant nea...,0,PTFxtXS47ZVRCdZIrEWvGw,Golden Boy Pizza,4.3,4624,golden-boy-pizza-san-francisco,"[false, false, false, false, false, false, false]","[0, 1, 2, 3, 4, 5, 6]","[2100, 2100, 2100, 2100, 2300, 2300, 2100]",...,"[0.0042010313, -0.02630279, -0.022063963, 0.07...","Golden Boy Pizza [Pizza, Italian] San Francisco","[-0.038274966, 0.046282046, 0.016366474, 0.027...",0.540757,0,0,2,1,0,0
1,﻿Find me a highly-rated Italian restaurant nea...,0,wEzg_lla5jqAzIXokYnZJA,That's Amore Woodfire Pizza,4.5,271,thats-amore-woodfire-pizza-san-francisco,"[false, false, false, false, false, false, false]","[0, 1, 2, 3, 4, 5, 6]","[2145, 2130, 2145, 2145, 2145, 2145, 2145]",...,"[0.0042010313, -0.02630279, -0.022063963, 0.07...","That's Amore Woodfire Pizza [Pizza, Beer, Wine...","[-0.005504297, 0.046138458, 0.025471289, 0.063...",0.57472,0,0,2,1,0,0
2,﻿Find me a highly-rated Italian restaurant nea...,0,iyo3pjuRb7mTpaDwXnAzuQ,Square Pie Guys,4.3,958,square-pie-guys-san-francisco,"[false, false, false, false, false, false, false]","[0, 1, 2, 3, 4, 5, 6]","[2130, 2130, 2130, 2130, 2230, 2230, 2130]",...,"[0.0042010313, -0.02630279, -0.022063963, 0.07...","Square Pie Guys [Pizza, Chicken Wings, Salad] ...","[0.080457464, 0.022475712, 0.059851225, 0.0425...",0.433345,0,0,1,1,0,0
