In [1]:
import pandas as pd
import numpy as np

In [2]:
import json
from pathlib import Path

def load_json(file_path, encoding='utf-8', verbose=True, extract_key=None):
    """Load a JSON file and optionally return only one of its top-level keys.

    This function never raises: every failure is reported on stdout and
    signalled to the caller by a ``None`` return value.

    Args:
        file_path: Path (``str`` or ``Path``) of the JSON file to read.
        encoding: Text encoding used to open the file.
        verbose: When true, print progress and a short summary of what was
            loaded. Failure messages are printed regardless of this flag so
            that a ``None`` result is never unexplained.
        extract_key: Optional top-level key. When the parsed document is a
            dict containing this key, only the value stored under it is
            returned. When the key is absent a warning is printed and the
            whole document is returned.

    Returns:
        The parsed JSON data (or the extracted value), or ``None`` when the
        file does not exist, is not valid JSON, or cannot be read.
    """
    try:
        file_path = Path(file_path)

        if verbose:
            print(f"üìÇ Loading: {file_path}")

        if not file_path.exists():
            # Say why we are returning None instead of failing silently.
            print(f"[load_json] File not found: {file_path}")
            return None

        with open(file_path, 'r', encoding=encoding) as f:
            data = json.load(f)

        if extract_key:
            if isinstance(data, dict) and extract_key in data:
                if verbose:
                    print(f"üîë Extracting key: '{extract_key}'")
                data = data[extract_key]
            else:
                print(f"‚ö†Ô∏è Key '{extract_key}' not found in JSON")

        if verbose:
            if isinstance(data, list):
                print(f"‚úÖ Loaded {len(data)} items (list)")
            elif isinstance(data, dict):
                print(f"‚úÖ Loaded dictionary with keys: {list(data.keys())}")
                # If the dict holds nested lists, also show their sizes.
                for key, value in data.items():
                    if isinstance(value, list):
                        print(f"   - {key}: {len(value)} items")
            else:
                print(f"‚úÖ Loaded {type(data).__name__}")

        return data

    except json.JSONDecodeError as e:
        # Malformed JSON: report where parsing failed.
        print(f"[load_json] Invalid JSON in {file_path}: {e}")
        return None
    except Exception as e:
        # Best-effort loader: report any other failure (bad path type,
        # permission or decoding errors, ...) and keep the None contract.
        print(f"[load_json] Failed to load {file_path!r}: {type(e).__name__}: {e}")
        return None


In [3]:
def load_historical_queries(json_path):
    """Read a JSON file of historical queries and return it as a DataFrame.

    The parsed document may be:
      * a list, which is used as the record list directly;
      * a dict, in which case the first of the keys ``queries``, ``query``,
        ``data``, ``results`` that is present supplies the records (a
        non-list value is wrapped in a one-element list); a dict with none
        of these keys is treated as a single record.

    Args:
        json_path: Path of the UTF-8 encoded JSON file.

    Returns:
        ``pd.DataFrame`` built from the record list.

    Raises:
        ValueError: If the parsed document is neither a list nor a dict.
    """
    print(f"üìÇ Loading from: {json_path}")

    with open(json_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    print(f"üìä Raw data type: {type(payload)}")

    if isinstance(payload, list):
        # The document is already the record list.
        records = payload
    elif isinstance(payload, dict):
        # Pick the first well-known container key that is present, if any.
        matched_key = next(
            (name for name in ('queries', 'query', 'data', 'results') if name in payload),
            None,
        )
        if matched_key is None:
            # No known key: the dict itself is the one and only record.
            records = [payload]
        else:
            found = payload[matched_key]
            records = found if isinstance(found, list) else [found]
    else:
        raise ValueError(f"Unsupported data type: {type(payload)}")

    print(f"‚úÖ Loaded {len(records)} queries")

    frame = pd.DataFrame(records)

    print(f"üìä DataFrame shape: {frame.shape}")
    print(f"üìã Columns: {list(frame.columns)}")

    return frame


In [39]:
# Load the processed venue records and turn them into the venue DataFrame
# (`df`) that later cells pass to create_training_data.
venue_records = load_json('../data/processed/data.json')
if venue_records is None:
    # load_json returns None on any failure, and pd.DataFrame(None) would
    # silently produce an empty frame; stop here instead of training on nothing.
    raise RuntimeError("Could not load venue data from '../data/processed/data.json'")
df = pd.DataFrame(venue_records)
print(df)

üìÇ Loading: ..\data\processed\data.json
‚úÖ Loaded 1018 items (list)
     venueId                        subAddress            phuong  \
0      v-001   2-4 ƒê. s·ªë 9, Khu ƒë√¥ th·ªã Him Lam   Ph∆∞·ªùng T√¢n H∆∞ng   
1      v-001   2-4 ƒê. s·ªë 9, Khu ƒë√¥ th·ªã Him Lam   Ph∆∞·ªùng T√¢n H∆∞ng   
2      v-001   2-4 ƒê. s·ªë 9, Khu ƒë√¥ th·ªã Him Lam   Ph∆∞·ªùng T√¢n H∆∞ng   
3      v-002  ƒê∆∞·ªùng s·ªë 7, Khu d√¢n c∆∞ L·∫≠p Ph√∫c,     X√£ Ph∆∞·ªõc Ki·ªÉn   
4      v-002  ƒê∆∞·ªùng s·ªë 7, Khu d√¢n c∆∞ L·∫≠p Ph√∫c,     X√£ Ph∆∞·ªõc Ki·ªÉn   
...      ...                               ...               ...   
1013   v-248                  35 Tr·∫ßn Qu√Ω Ki√™n  Ph∆∞·ªùng D·ªãch V·ªçng   
1014   v-248                  35 Tr·∫ßn Qu√Ω Ki√™n  Ph∆∞·ªùng D·ªãch V·ªçng   
1015   v-248                  35 Tr·∫ßn Qu√Ω Ki√™n  Ph∆∞·ªùng D·ªãch V·ªçng   
1016   v-248                  35 Tr·∫ßn Qu√Ω Ki√™n  Ph∆∞·ªùng D·ªãch V·ªçng   
1017   v-248                  35 Tr·∫ßn Qu√Ω Ki√™n  Ph∆∞·ªùng D·ªãc

In [None]:
# import re
# import time

# def parse_time(time_str):
#     try:
#         hour, minute = time_str.strip().split(':')
#         return time(int(hour), int(minute))
#     except:
#         return None

In [None]:
# def has_time_overlap(venue_open, venue_close, user_start, user_end):
#     return venue_open <= user_end and venue_close >= user_start

In [None]:

# def check_time_available(start_time, end_time, time_preference):
#     if not start_time or not end_time or not time_preference:
#         return True  # If info is missing, assume available
    
#     # Parse time strings: "05:00" ‚Üí time(5, 0)
#     venue_open = parse_time(start_time)
#     venue_close = parse_time(end_time)
    
#     if not venue_open or not venue_close:
#         return True  
    
#     time_ranges = {
#         'morning': (time(5, 0), time(11, 0)),    # 05:00 - 11:00
#         'noon': (time(11, 0), time(14, 0)),      # 11:00 - 14:00
#         'afternoon': (time(14, 0), time(18, 0)), # 14:00 - 18:00
#         'evening': (time(18, 0), time(23, 0)),   # 18:00 - 23:00
#         'night': (time(18, 0), time(23, 59)),    # 18:00 - 23:59
#         'tonight': (time(18, 0), time(23, 59))   # Same as night
#     }
    
#     if time_preference not in time_ranges:
#         return True  
    
#     user_start, user_end = time_ranges[time_preference]
    
#     return has_time_overlap(venue_open, venue_close, user_start, user_end)


In [None]:
# def pair_feature(parsed_query, venue):
#     features = []

#     # -------------------------------- Sport match --------------------------------
#     sport_match = 1 if parsed_query["sport"] == venue.get("sport", "") else 0
#     features.append(sport_match)

#     # -------------------------------- District match -------------------------------
#     district_match = 1 if parsed_query["district"] == venue.get("district", "") else 0
#     features.append(district_match)

#     # -------------------------------- Time match ----------------------------------
#     start_time = venue.get("start_time", "")
#     end_time = venue.get("end_time", "")
#     time_pref = parsed_query["time"]

#     is_available = check_time_available(start_time, end_time, time_pref)
#     time_match = 1 if is_available else 0
#     features.append(time_match)

#     return features


In [None]:
# def calculate_text_similarity(keywords, text):
#     if not keywords:
#         return 0.0
#     text_lower = text.lower()
#     matches = sum(1 for kw in keywords if kw in text_lower)
#     return matches / len(keywords)

In [None]:
# import re
# def parse_user_query(query_text):
#     if not isinstance(query_text, str):
#         query_text = str(query_text)
    
#     query_lower = query_text.lower()
    
#     parsed = {
#         'time': None,
#         'raw_text': query_text,
#         'sport': None,
#         'district': None,
#         'max_price': None,
#         'min_rating': None,
#         'keywords': []
#     }
    
#     # Detect sport
#     sport_keywords = {
#         'badminton': ['c·∫ßu l√¥ng', 'badminton'],
#         'pickleball': ['pickleball', 'pickle'],
#         'football': ['b√≥ng ƒë√°', 'football', 'b√≥ng', 'futsal'],
#     }
    
#     for sport, keywords in sport_keywords.items():
#         if any(kw in query_lower for kw in keywords):
#             parsed['sport'] = sport
#             break
    
#     # Detect district
#     district_pattern = r'qu·∫≠n\s*(\d+)'
#     district_match = re.search(district_pattern, query_lower)
#     if district_match:
#         parsed['district'] = f'Qu·∫≠n {district_match.group(1)}'
    
#     # Detect price
#     price_patterns = [
#         r'(\d+)k',
#         r'd∆∞·ªõi\s*(\d+)',
#         r'(\d+)\s*(?:ngh√¨n|ng√†n)'
#     ]
#     for pattern in price_patterns:
#         price_match = re.search(pattern, query_lower)
#         if price_match:
#             parsed['max_price'] = int(price_match.group(1)) * 1000
#             break
    
#     # Detect quality requirements
#     if 't·ªët' in query_lower or 'ch·∫•t l∆∞·ª£ng' in query_lower:
#         parsed['min_rating'] = 4.0
    
#     # Extract keywords
#     keywords = ['g·∫ßn', 'r·∫ª', 'ch·∫•t l∆∞·ª£ng', 'ƒë·∫πp', 's·∫°ch']
#     parsed['keywords'] = [kw for kw in keywords if kw in query_lower]
    
#     return parsed


In [40]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))

# Import ParseUtils from services
from services.parse_user_query import ParseUtils
# Column order of the frame returned by create_training_data.
_TRAINING_COLUMNS = [
    "sport_match",
    "district_match",
    "time_match",
    "label",
    "query",
    "venue_id",
]


def _is_missing_id(value):
    """Return True when ``value`` cannot serve as an identifier (None, NaN/NA or '')."""
    if value is None:
        return True
    if isinstance(value, str):
        return value == ""
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar values (pd.isna returns an array) are not treated as missing.
        return False


def _first_present(row, keys):
    """Return the first non-missing value stored in ``row`` under ``keys``, else None."""
    for key in keys:
        value = row.get(key)
        if not _is_missing_id(value):
            return value
    return None


def create_training_data(venues_df, historical_queries):
    """Build one labelled feature row per (historical query, venue) pair.

    Every query is parsed with ``ParseUtils.parse_user_query`` and paired with
    every row of ``venues_df``; ``ParseUtils.pair_feature`` supplies the
    feature list, whose first three entries are stored as ``sport_match``,
    ``district_match`` and ``time_match``.

    Args:
        venues_df: DataFrame of venues. The venue id is read from the first
            non-missing of the columns ``venueId``, ``venueid``, ``id``.
        historical_queries: DataFrame, or anything ``pd.DataFrame`` accepts,
            with a ``query`` column. The clicked venue id is read from the
            first non-missing of ``clicked_venue_id``, ``venueId``.

    Returns:
        DataFrame with columns ``sport_match``, ``district_match``,
        ``time_match``, ``label``, ``query``, ``venue_id``. ``label`` is 1
        when the venue id equals the clicked venue id (compared as strings)
        and 0 otherwise. The columns are present even when no pair exists.

    Raises:
        KeyError: If a query record has no ``query`` field.
    """
    if not isinstance(historical_queries, pd.DataFrame):
        historical_queries = pd.DataFrame(historical_queries)

    training_data = []

    for _, query_record in historical_queries.iterrows():
        query = query_record["query"]
        clicked_venue_id = _first_present(query_record, ("clicked_venue_id", "venueId"))
        # Loop-invariant: stringify the clicked id once per query. None means
        # "nothing was clicked", which must never match any venue.
        clicked_key = None if clicked_venue_id is None else str(clicked_venue_id)
        parsed_query = ParseUtils.parse_user_query(query)

        for _, venue_row in venues_df.iterrows():
            features = ParseUtils.pair_feature(parsed_query, venue_row)
            venue_id = _first_present(venue_row, ("venueId", "venueid", "id"))
            # Two missing ids must not count as a match: str(None) == str(None)
            # would otherwise label the pair as positive.
            is_clicked = (
                clicked_key is not None
                and venue_id is not None
                and str(venue_id) == clicked_key
            )
            training_data.append(
                {
                    "sport_match": features[0],
                    "district_match": features[1],
                    "time_match": features[2],
                    "label": 1 if is_clicked else 0,
                    "query": query,
                    "venue_id": venue_id,
                }
            )

    # Passing the columns keeps the schema stable when there are no pairs.
    df_training = pd.DataFrame(training_data, columns=_TRAINING_COLUMNS)
    return df_training

In [41]:
import json
import pandas as pd

# Read the raw historical-query log.
with open('../data/raw/query.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build the query frame: a bare list is used directly, a dict with a 'query'
# key supplies its value, and anything else is flattened by json_normalize.
if isinstance(data, list):
    historical_queries = pd.DataFrame(data)
elif isinstance(data, dict) and 'query' in data:
    historical_queries = pd.DataFrame(data['query'])
else:
    historical_queries = pd.json_normalize(data)

print(historical_queries)

                                        query  \
0       s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng   
1        s√¢n C·∫ßu l√¥ng Huy·ªán Nh√† B√® bu·ªïi chi·ªÅu   
2           s√¢n Tennis ·ªü Qu·∫≠n 8 v√†o bu·ªïi s√°ng   
3    t√¨m s√¢n B√≥ng R·ªï Qu·∫≠n T√¢n B√¨nh bu·ªïi chi·ªÅu   
4       s√¢n Pickleball Huy·ªán H√≥c M√¥n s√°ng nay   
5      s√¢n B√≥ng chuy·ªÅn Qu·∫≠n G√≤ V·∫•p bu·ªïi chi·ªÅu   
6     s√¢n C·∫ßu l√¥ng Qu·∫≠n T√¢n Ph√∫ v√†o bu·ªïi s√°ng   
7   t√¨m s√¢n Pickleball Qu·∫≠n 12 v√†o bu·ªïi chi·ªÅu   
8                     s√¢n Tennis Qu·∫≠n 11 s√°ng   
9        s√¢n Pickleball Qu·∫≠n 1 v√†o bu·ªïi chi·ªÅu   
10             s√¢n C·∫ßu l√¥ng ƒê·ªëng ƒêa bu·ªïi s√°ng   
11             s√¢n B√≥ng R·ªï H√† ƒê√¥ng bu·ªïi chi·ªÅu   
12                s√¢n B√≥ng chuy·ªÅn T√¢y H·ªì s√°ng   
13    s√¢n Pickleball Qu·∫≠n Ho√†ng Mai bu·ªïi s√°ng   
14             s√¢n Tennis Long Bi√™n chi·ªÅu nay   
15             s√¢n C·∫ßu l√¥ng Qu·∫≠n 9 bu·ªïi chi·ªÅu   
16        t√¨m s√¢n B

In [42]:
# Pair every historical query with every venue to get the labelled training frame.
df_training = create_training_data(
    venues_df=df,
    historical_queries=historical_queries,
)
print(df_training)

       sport_match  district_match  time_match  label  \
0                0               1           1      1   
1                0               1           1      1   
2                0               1           1      1   
3                0               0           1      0   
4                0               0           1      0   
...            ...             ...         ...    ...   
60057            0               0           1      0   
60058            0               0           1      0   
60059            0               0           1      0   
60060            0               0           1      0   
60061            0               0           1      0   

                                       query venue_id  
0      s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng    v-001  
1      s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng    v-001  
2      s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng    v-001  
3      s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng    v-002  

In [27]:
# Display the training frame again.
print(df_training)

       sport_match  district_match  time_match  label  \
0                0               1           1      1   
1                0               1           1      1   
2                0               1           1      1   
3                0               0           1      0   
4                0               0           1      0   
...            ...             ...         ...    ...   
60057            0               0           1      0   
60058            0               0           1      0   
60059            0               0           1      0   
60060            0               0           1      0   
60061            0               0           1      0   

                                       query venue_id  
0      s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng    v-001  
1      s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng    v-001  
2      s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng    v-001  
3      s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng    v-002  

In [43]:
# Persist the training frame; index=True (the pandas default) also writes the
# row index as the first, unnamed CSV column.
df_training.to_csv('../datasets/ground.csv', index=True)