In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

# Read the search queries: one per line, surrounding whitespace removed,
# blank lines dropped.
with open("/Users/cibylin/Desktop/ticketmaster_queries.txt", "r") as f:
    queries = [text for text in map(str.strip, f) if text]

# One row per query; query_id is simply the row's default integer index.
query_df = pd.DataFrame({"query": queries})
query_df["query_id"] = query_df.index

# Load the event catalogue and keep a reproducible random sample of 300 events.
event_df = pd.read_csv(
    "/Users/cibylin/Desktop/event_ranking_app/ticketmaster.csv"
).sample(n=300, random_state=42)

# Cross join: every query is paired with every sampled event.
df = pd.merge(query_df, event_df, how="cross")

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Embed each distinct query once and index the vectors by query string.
unique_queries = df["query"].unique()
query_vectors = embedder.encode(unique_queries, convert_to_numpy=True)
query_vec_map = {query: vector for query, vector in zip(unique_queries, query_vectors)}


def _describe_event(row):
    """Build the text that represents an event for embedding purposes."""
    segment, genre = row["segmentname"], row["genrename"]
    name, city = row["eventname"], row["venuecity"]
    # The double space before the city is part of the original text format.
    return f"{segment} {genre} {name}  {city}"


df["ticketmaster_text"] = df.apply(_describe_event, axis=1)

# Embed each distinct event text once and index the vectors by that text.
unique_texts = df["ticketmaster_text"].unique()
text_vectors = embedder.encode(unique_texts, convert_to_numpy=True, batch_size=32)
text_vec_map = {text: vector for text, vector in zip(unique_texts, text_vectors)}

def compute_similarity(row):
    """Return the cosine similarity of a row's query and event-text embeddings.

    The embeddings are looked up in the module-level ``query_vec_map`` (keyed
    by ``row["query"]``) and ``text_vec_map`` (keyed by
    ``row["ticketmaster_text"]``). The result is ``1 - cosine distance``.

    Raises:
        KeyError: if either key is absent from its lookup table.
    """
    query_embedding = query_vec_map[row["query"]]
    event_embedding = text_vec_map[row["ticketmaster_text"]]
    cosine_distance = cosine(query_embedding, event_embedding)
    return 1 - cosine_distance

# Semantic feature: cosine similarity between the query embedding and the
# event-text embedding, stored as float32.
df["similarity"] = df.apply(compute_similarity, axis=1).astype("float32")


def _lowered_or_empty(value):
    """Return ``value`` as lower-case text, or "" when it is missing (NaN/None)."""
    return "" if pd.isna(value) else str(value).lower()


def _city_in_query(row):
    """Return 1 if the event's city occurs in the query (case-insensitive), else 0."""
    city = _lowered_or_empty(row["venuecity"])
    # A missing city must never match: str(NaN) is "nan", which is a substring
    # of ordinary query words (e.g. "finance"), and "" is a substring of
    # every string.
    if not city:
        return 0
    return int(city in str(row["query"]).lower())


def _query_word_hits(row, column):
    """Count whitespace-separated query words that are substrings of ``row[column]``.

    Matching is case-insensitive. A missing field value yields 0 instead of
    being compared as the literal text "nan".
    """
    field_text = _lowered_or_empty(row[column])
    if not field_text:
        return 0
    # NOTE(review): this is substring matching, so short words such as "the"
    # also hit "theatre" -- kept as-is to preserve the existing feature
    # definition; confirm whether whole-word matching is wanted.
    return sum(1 for word in row["query"].lower().split() if word in field_text)


df["city_match"] = df.apply(_city_in_query, axis=1)

df["segment_match_score"] = df.apply(_query_word_hits, axis=1, args=("segmentname",))

df["genre_match_score"] = df.apply(_query_word_hits, axis=1, args=("genrename",))

def format_date_variants(dt_string):
    """Return the textual forms of a date that may appear in a query.

    Every "Z" is removed from ``dt_string``, which is then parsed with
    ``datetime.fromisoformat``; if that fails, ``%Y-%m-%d`` parsing is tried.

    Returns:
        ``[]`` when neither parser accepts the string; otherwise a list of
        three strings: the ISO date (``YYYY-MM-DD``), the lower-cased full
        month name with zero-padded day (e.g. ``"june 05"``), and the
        lower-cased abbreviated month name with zero-padded day
        (e.g. ``"jun 05"``).
    """
    cleaned = dt_string.replace("Z", "")

    parsers = (
        datetime.fromisoformat,
        lambda text: datetime.strptime(text, "%Y-%m-%d"),
    )
    parsed = None
    for parse in parsers:
        try:
            parsed = parse(cleaned)
        except ValueError:
            continue
        break

    if parsed is None:
        return []

    iso_day = parsed.strftime("%Y-%m-%d")
    long_month_day = parsed.strftime("%B %d").lower()
    short_month_day = parsed.strftime("%b %d").lower()
    return [iso_day, long_month_day, short_month_day]

def _query_mentions_event_date(row):
    """Return 1 if any textual form of the event's date occurs in the query, else 0."""
    variants = format_date_variants(str(row["eventdatetime"]))
    if not variants:
        # Unparseable or missing date: nothing to look for.
        return 0
    query_text = row["query"].lower()
    for variant in variants:
        if variant in query_text:
            return 1
    return 0


# Flag pairs whose query text contains the event's date.
df["date_match"] = df.apply(_query_mentions_event_date, axis=1)

def score_price(p):
    """Map a price to a heuristic price score.

    Args:
        p: The price; anything ``float()`` accepts (number or numeric string).

    Returns:
        ``0`` if ``p`` cannot be converted to a float, is NaN, or is above 680;
        ``0.5`` for prices up to 200, ``1`` up to 400, ``2`` up to 550 and
        ``3`` up to 680 (all upper bounds inclusive).
    """
    try:
        price = float(p)
    # Only conversion failures mean "no usable price". A bare ``except`` would
    # also swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return 0

    if price <= 200:
        return 0.5
    if price <= 400:
        return 1
    if price <= 550:
        return 2
    if price <= 680:
        return 3
    # Above the top band, or NaN (every comparison above is False for NaN).
    return 0

# Score each pair's maximum price; the column is cast to float32 before scoring.
_pricemax_float32 = df["pricemax"].astype(np.float32)
df["price_score"] = _pricemax_float32.apply(score_price)

def compute_weak_label(row):
    """Combine the per-pair features into a weak relevance label (0, 1 or 2).

    Points are accumulated as follows:
      * similarity > 0.8: +3, > 0.7: +2, > 0.6: +1
      * city_match truthy: +5
      * segment_match_score >= 1: +3
      * genre_match_score >= 1: +2
      * date_match truthy: +2
      * price_score between 2 and 3 inclusive: +1

    Args:
        row: A mapping (e.g. a DataFrame row) with the keys ``similarity``,
            ``city_match``, ``segment_match_score``, ``genre_match_score``,
            ``date_match`` and ``price_score``.

    Returns:
        ``2`` when the total is at least 8, ``1`` when it is at least 5,
        otherwise ``0``.
    """
    score = 0

    similarity = row["similarity"]
    if similarity > 0.8:
        score += 3
    elif similarity > 0.7:
        score += 2
    elif similarity > 0.6:
        score += 1

    if row["city_match"]:
        score += 5
    # The match scores are word counts: any overlap earns the bonus. Testing
    # for exactly 1 would drop the bonus for stronger, multi-word overlaps.
    if row["segment_match_score"] >= 1:
        score += 3
    if row["genre_match_score"] >= 1:
        score += 2
    if row["date_match"]:
        score += 2
    if 2 <= row["price_score"] <= 3:
        score += 1

    if score >= 8:
        return 2
    if score >= 5:
        return 1
    return 0

# Derive the weak relevance label for every (query, event) pair.
df["weak_label"] = df.apply(compute_weak_label, axis=1)

# Columns written to the training file, in output order: identifiers and
# label first, then the raw event fields, then the engineered features.
_export_columns = [
    "query",
    "query_id",
    "similarity",
    "weak_label",
    "venuecity",
    "segmentname",
    "genrename",
    "eventname",
    "eventdatetime",
    "pricemax",
    "city_match",
    "segment_match_score",
    "genre_match_score",
    "date_match",
    "price_score",
]
_labeled_subset = df[_export_columns]
_labeled_subset.to_csv("tm_weak_labeled_data.csv", index=False)


  event_df = pd.read_csv("/Users/cibylin/Desktop/event_ranking_app/ticketmaster.csv")


(24000, 58)