In [2]:
import pandas as pd
import numpy as np

In [3]:
import json
from pathlib import Path

def load_json(file_path, encoding='utf-8', verbose=True, extract_key=None):
    """Load a JSON file and optionally return a single top-level key of it.

    Args:
        file_path: Path (str or ``Path``) of the JSON file to read.
        encoding: Text encoding used to open the file.
        verbose: When True, print progress and a short summary of what was
            loaded.
        extract_key: If given and the parsed document is a dict containing
            this key, the value under that key is returned instead of the
            whole document. If the key is absent a warning is printed and the
            whole document is returned.

    Returns:
        The parsed JSON data (or the extracted value), or ``None`` when the
        file is missing, is not valid JSON, or any other error occurs.
        The reason for a ``None`` result is always printed, regardless of
        ``verbose``, so a failed load is never silent.
    """
    try:
        file_path = Path(file_path)

        if verbose:
            print(f"üìÇ Loading: {file_path}")

        if not file_path.exists():
            print(f"[load_json] File not found: {file_path}")
            return None

        with open(file_path, 'r', encoding=encoding) as f:
            data = json.load(f)

        if extract_key:
            if isinstance(data, dict) and extract_key in data:
                if verbose:
                    print(f"üîë Extracting key: '{extract_key}'")
                data = data[extract_key]
            else:
                # Printed even when verbose is False: the caller asked for a
                # key and is getting the whole document back instead.
                print(f"‚ö†Ô∏è Key '{extract_key}' not found in JSON")

        if verbose:
            if isinstance(data, list):
                print(f"‚úÖ Loaded {len(data)} items (list)")
            elif isinstance(data, dict):
                print(f"‚úÖ Loaded dictionary with keys: {list(data.keys())}")
                # If the dict holds nested lists, also report their sizes.
                for key, value in data.items():
                    if isinstance(value, list):
                        print(f"   - {key}: {len(value)} items")
            else:
                print(f"‚úÖ Loaded {type(data).__name__}")

        return data

    except json.JSONDecodeError as e:
        print(f"[load_json] Invalid JSON in {file_path}: {e}")
        return None
    except Exception as e:
        # Best-effort loader: keep the "never raises" contract, but say why.
        print(f"[load_json] Failed to load {file_path}: {type(e).__name__}: {e}")
        return None


In [4]:
def load_historical_queries(json_path):
    """Read a JSON file of historical queries into a DataFrame.

    Accepted document shapes:
      * a list: used as the list of query records;
      * a dict: the first of the keys ``queries``, ``query``, ``data``,
        ``results`` that is present supplies the records (a non-list value is
        wrapped in a one-element list); if none is present the whole dict is
        treated as a single record.

    Args:
        json_path: Path of the UTF-8 encoded JSON file.

    Returns:
        A ``pandas.DataFrame`` built from the records.

    Raises:
        ValueError: If the top-level JSON value is neither a list nor a dict.
    """
    print(f"üìÇ Loading from: {json_path}")

    with open(json_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    print(f"üìä Raw data type: {type(payload)}")

    # Reject scalars up front; only lists and dicts can be turned into records.
    if not isinstance(payload, (list, dict)):
        raise ValueError(f"Unsupported data type: {type(payload)}")

    if isinstance(payload, list):
        records = payload
    else:
        # Key order matters: the first known key that exists wins.
        known_key = next(
            (name for name in ('queries', 'query', 'data', 'results') if name in payload),
            None,
        )
        if known_key is None:
            records = [payload]
        else:
            candidate = payload[known_key]
            records = candidate if isinstance(candidate, list) else [candidate]

    print(f"‚úÖ Loaded {len(records)} queries")

    frame = pd.DataFrame(records)

    print(f"üìä DataFrame shape: {frame.shape}")
    print(f"üìã Columns: {list(frame.columns)}")

    return frame


In [36]:
# Load the processed venue records. load_json returns None on any failure, and
# pd.DataFrame(None) would silently produce an empty frame whose only symptom
# is a KeyError from drop() below, so fail here with a clear message instead.
venues_raw = load_json('../data/processed/data.json')
if venues_raw is None:
    raise RuntimeError(
        "Could not load '../data/processed/data.json' (file missing or not valid JSON)"
    )

# `df` is the venue table used by the later cells; 'subAddress' is dropped here.
df = pd.DataFrame(venues_raw)
df = df.drop(columns='subAddress')
print(df)

üìÇ Loading: ..\data\processed\data.json
‚úÖ Loaded 1018 items (list)
     venueId            phuong       district                   city  \
0      v-001   Ph∆∞·ªùng T√¢n H∆∞ng         Qu·∫≠n 7  Th√†nh ph·ªë H·ªì Ch√≠ Minh   
1      v-001   Ph∆∞·ªùng T√¢n H∆∞ng         Qu·∫≠n 7  Th√†nh ph·ªë H·ªì Ch√≠ Minh   
2      v-001   Ph∆∞·ªùng T√¢n H∆∞ng         Qu·∫≠n 7  Th√†nh ph·ªë H·ªì Ch√≠ Minh   
3      v-002     X√£ Ph∆∞·ªõc Ki·ªÉn   Huy·ªán Nh√† B√®  Th√†nh ph·ªë H·ªì Ch√≠ Minh   
4      v-002     X√£ Ph∆∞·ªõc Ki·ªÉn   Huy·ªán Nh√† B√®  Th√†nh ph·ªë H·ªì Ch√≠ Minh   
...      ...               ...            ...                    ...   
1013   v-248  Ph∆∞·ªùng D·ªãch V·ªçng  Qu·∫≠n C·∫ßu Gi·∫•y       Th√†nh ph·ªë H√† N·ªôi   
1014   v-248  Ph∆∞·ªùng D·ªãch V·ªçng  Qu·∫≠n C·∫ßu Gi·∫•y       Th√†nh ph·ªë H√† N·ªôi   
1015   v-248  Ph∆∞·ªùng D·ªãch V·ªçng  Qu·∫≠n C·∫ßu Gi·∫•y       Th√†nh ph·ªë H√† N·ªôi   
1016   v-248  Ph∆∞·ªùng D·ªãch V·ªçng  Qu·∫≠n C·∫ßu Gi·∫•y       Th√†nh ph·ªë H√†

In [38]:
import sys
import os
import importlib

# Put the project root (the directory that contains "services") on sys.path
# BEFORE importing anything from `services`; on a fresh kernel the package is
# not importable until this has run. Guarded so re-running the cell does not
# keep appending duplicate entries.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from services.data_services.extract_category import ExtractCategory
from services.data_services.detect_district import DetectDistrict

# Reload the query-service modules so the latest source on disk is used, then
# re-bind the classes from the freshly reloaded modules.
import services.query_service.extract_category as extractSport
import services.query_service.extract_location as extract_location
import services.query_service.extract_time as extract_time
importlib.reload(extract_location)
importlib.reload(extractSport)
importlib.reload(extract_time)
from services.query_service.extract_location import ExtractLocation
from services.query_service.extract_category import ExtractSport
from services.query_service.extract_time import ExtractTime

# Recompute the ward / district / city columns row by row with DetectDistrict.
# NOTE(review): these assignments overwrite columns of the `df` built by the
# loading cell, so re-running this cell feeds already-processed values back
# into DetectDistrict — confirm that is harmless or re-run the loading cell first.
df['phuong'] = df.apply(DetectDistrict.get_ward_info, axis=1)
df['district'] = df.apply(DetectDistrict.get_district_info, axis=1)
df['city'] = df.apply(DetectDistrict.get_city_info, axis=1)
# errors='ignore' keeps the cell re-runnable once 'category' has been dropped.
df = df.drop(columns='category', errors='ignore')
print (df)

     venueId      phuong  district         city  \
0      v-001    T√¢n H∆∞ng         7  H·ªì Ch√≠ Minh   
1      v-001    T√¢n H∆∞ng         7  H·ªì Ch√≠ Minh   
2      v-001    T√¢n H∆∞ng         7  H·ªì Ch√≠ Minh   
3      v-002  Ph∆∞·ªõc Ki·ªÉn    Nh√† B√®  H·ªì Ch√≠ Minh   
4      v-002  Ph∆∞·ªõc Ki·ªÉn    Nh√† B√®  H·ªì Ch√≠ Minh   
...      ...         ...       ...          ...   
1013   v-248   D·ªãch V·ªçng  C·∫ßu Gi·∫•y       H√† N·ªôi   
1014   v-248   D·ªãch V·ªçng  C·∫ßu Gi·∫•y       H√† N·ªôi   
1015   v-248   D·ªãch V·ªçng  C·∫ßu Gi·∫•y       H√† N·ªôi   
1016   v-248   D·ªãch V·ªçng  C·∫ßu Gi·∫•y       H√† N·ªôi   
1017   v-248   D·ªãch V·ªçng  C·∫ßu Gi·∫•y       H√† N·ªôi   

                                categoryId start_time end_time  
0     a1f3b6e4-2c9a-4c1b-b8a2-9c0d4fe71201      05:00    22:00  
1     a1f3b6e4-2c9a-4c1b-b8a2-9c0d4fe71201      05:00    22:00  
2     a1f3b6e4-2c9a-4c1b-b8a2-9c0d4fe71201      05:00    22:00  
3     a1f3b6e4-2c9a-4c1b-b8a2-9c0d4fe

In [41]:
import sys
import importlib

# Drop the cached module (if it is cached) so the next import re-executes it.
if 'services.parse_user_query' in sys.modules:
    del sys.modules['services.parse_user_query']

In [42]:
# Import the parser module and reload it so the current source on disk is used.
# NOTE(review): this cell does not import `importlib` itself; it relies on an
# earlier cell having done so. It presumably also needs the project root on
# sys.path for `services` to be importable — confirm the cell order.
import services.parse_user_query as parse_module
importlib.reload(parse_module)

# Bind ParseUtils only after the reload so the name refers to the reloaded class.
from services.parse_user_query import ParseUtils

In [43]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np

def _first_present(record, keys):
    """Return the first value among ``record.get(key)`` that is not missing.

    ``None``, scalar NaN/NA values and empty strings count as missing. A plain
    ``a or b`` chain is not enough here: NaN is truthy, so a NaN id would never
    fall through to the next candidate key.
    """
    for key in keys:
        value = record.get(key)
        if value is None:
            continue
        if isinstance(value, str) and value == "":
            continue
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        return value
    return None


def create_training_data(venues_df, historical_queries):
    """Build one labelled training row per (query, venue) pair.

    Every query record is paired with every row of ``venues_df``. The pair's
    features come from ``ParseUtils.pair_feature`` and the label is 1 when the
    venue is the one recorded for the query, else 0.

    Args:
        venues_df: DataFrame of venues. The venue id is read from the first
            non-missing of the ``venueId``, ``venueid`` and ``id`` columns.
        historical_queries: DataFrame (or anything ``pd.DataFrame`` accepts)
            of query records. Must have a ``query`` column when non-empty; the
            target venue id is read from ``clicked_venue_id``, falling back to
            ``venueId``.

    Returns:
        DataFrame with columns ``sport_match``, ``location_match``,
        ``time_match``, ``label``, ``query`` and ``venue_id``. The columns are
        present even when there are no pairs.

    Raises:
        KeyError: If a query record has no ``query`` field, or the feature
            mapping lacks one of the three expected keys.
    """
    output_columns = [
        "sport_match", "location_match", "time_match",
        "label", "query", "venue_id",
    ]

    if not isinstance(historical_queries, pd.DataFrame):
        historical_queries = pd.DataFrame(historical_queries)

    training_data = []
    for _, query_record in historical_queries.iterrows():
        query = query_record["query"]
        clicked_venue_id = _first_present(query_record, ("clicked_venue_id", "venueId"))

        for _, venue_row in venues_df.iterrows():
            features = ParseUtils.pair_feature(parsed_query=query_record, venue_row=venue_row)
            venue_id = _first_present(venue_row, ("venueId", "venueid", "id"))

            # Two missing ids must not count as a match ('None' == 'None').
            is_target = (
                clicked_venue_id is not None
                and venue_id is not None
                and str(venue_id) == str(clicked_venue_id)
            )

            training_data.append({
                "sport_match": features['sport_match'],
                "location_match": features['location_match'],
                "time_match": features['time_match'],
                "label": 1 if is_target else 0,
                "query": query,
                "venue_id": venue_id,
            })

    # Explicit columns keep the schema stable when no pairs were produced.
    return pd.DataFrame(training_data, columns=output_columns)

In [33]:
import json
import pandas as pd

# Parse the raw query JSON file.
with open('../data/raw/query.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build the frame according to the shape of the parsed document:
# a bare list of records, a dict holding them under 'query', or anything else
# flattened with json_normalize.
if isinstance(data, list):
    historical_queries = pd.DataFrame(data)
elif isinstance(data, dict) and 'query' in data:
    historical_queries = pd.DataFrame(data['query'])
else:
    historical_queries = pd.json_normalize(data)

print(historical_queries)

                                        query  \
0       s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng   
1        s√¢n C·∫ßu l√¥ng Huy·ªán Nh√† B√® bu·ªïi chi·ªÅu   
2           s√¢n Tennis ·ªü Qu·∫≠n 8 v√†o bu·ªïi s√°ng   
3    t√¨m s√¢n B√≥ng R·ªï Qu·∫≠n T√¢n B√¨nh bu·ªïi chi·ªÅu   
4       s√¢n Pickleball Huy·ªán H√≥c M√¥n s√°ng nay   
5      s√¢n B√≥ng chuy·ªÅn Qu·∫≠n G√≤ V·∫•p bu·ªïi chi·ªÅu   
6     s√¢n C·∫ßu l√¥ng Qu·∫≠n T√¢n Ph√∫ v√†o bu·ªïi s√°ng   
7   t√¨m s√¢n Pickleball Qu·∫≠n 12 v√†o bu·ªïi chi·ªÅu   
8                     s√¢n Tennis Qu·∫≠n 11 s√°ng   
9        s√¢n Pickleball Qu·∫≠n 1 v√†o bu·ªïi chi·ªÅu   
10             s√¢n C·∫ßu l√¥ng ƒê·ªëng ƒêa bu·ªïi s√°ng   
11             s√¢n B√≥ng R·ªï H√† ƒê√¥ng bu·ªïi chi·ªÅu   
12                s√¢n B√≥ng chuy·ªÅn T√¢y H·ªì s√°ng   
13    s√¢n Pickleball Qu·∫≠n Ho√†ng Mai bu·ªïi s√°ng   
14             s√¢n Tennis Long Bi√™n chi·ªÅu nay   
15             s√¢n C·∫ßu l√¥ng Qu·∫≠n 9 bu·ªïi chi·ªÅu   
16        t√¨m s√¢n B

In [40]:
import sys
import os
import importlib

# Make the project root (the directory that contains "services") importable.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(PROJECT_ROOT)

# Import the query-service modules and reload each one so the latest source on
# disk is used; the classes are bound only after the reloads.
import services.query_service.extract_category as extractSport
import services.query_service.extract_location as extract_location
import services.query_service.extract_time as extract_time
for _stale_module in (extract_location, extractSport, extract_time):
    importlib.reload(_stale_module)
from services.query_service.extract_location import ExtractLocation
from services.query_service.extract_category import ExtractSport
from services.query_service.extract_time import ExtractTime

# Run the three extraction steps in order (location, then category, then
# time), each one receiving the frame returned by the previous step.
for _enrich in (
    ExtractLocation.merge_query_with_locations,
    ExtractSport.extract_category_from_df,
    ExtractTime.extract_time_from_df,
):
    historical_queries = _enrich(historical_queries)
historical_queries.head()




Unnamed: 0,query,viewed_venues,clicked_venue_id,booked,phuong,district,city,category,time
0,s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng,"[v-001, v-081, v-029, v-217]",v-001,True,,7,H·ªì Ch√≠ Minh,e3a7b9d9-5c2f-4d33-b4c4-33f1d765cd02,morning
1,s√¢n C·∫ßu l√¥ng Huy·ªán Nh√† B√® bu·ªïi chi·ªÅu,"[v-002, v-081, v-014]",v-002,False,,Nh√† B√®,H·ªì Ch√≠ Minh,a1f3b6e4-2c9a-4c1b-b8a2-9c0d4fe71201,afternoon
2,s√¢n Tennis ·ªü Qu·∫≠n 8 v√†o bu·ªïi s√°ng,"[v-003, v-024, v-079, v-080]",v-003,True,,8,H·ªì Ch√≠ Minh,e3a7b9d9-5c2f-4d33-b4c4-33f1d765cd04,morning
3,t√¨m s√¢n B√≥ng R·ªï Qu·∫≠n T√¢n B√¨nh bu·ªïi chi·ªÅu,"[v-004, v-008, v-009, v-055]",v-004,True,,T√¢n B√¨nh,H·ªì Ch√≠ Minh,e3a7b9d9-5c2f-4d33-b4c4-33f1d765cd03,afternoon
4,s√¢n Pickleball Huy·ªán H√≥c M√¥n s√°ng nay,"[v-005, v-006, v-007]",v-005,False,,H√≥c M√¥n,H·ªì Ch√≠ Minh,e3a7b9d9-5c2f-4d33-b4c4-33f1d765cd02,morning


In [44]:
# Pair every historical query with every venue row and label the pairs.
df_training = create_training_data(
    venues_df=df,
    historical_queries=historical_queries,
)


In [45]:
# Text preview of the labelled pair table.
print(df_training)

       sport_match  location_match  time_match  label  \
0                0               1           1      1   
1                0               1           1      1   
2                0               1           1      1   
3                0               1           1      0   
4                0               1           1      0   
...            ...             ...         ...    ...   
60057            0               0           1      0   
60058            0               0           1      0   
60059            0               0           1      0   
60060            0               0           1      0   
60061            0               0           1      0   

                                       query venue_id  
0      s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng    v-001  
1      s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng    v-001  
2      s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng    v-001  
3      s√¢n Pickleball ·ªü Qu·∫≠n 7 v√†o bu·ªïi s√°ng    v-002  

In [46]:
from pathlib import Path

# to_csv does not create missing directories, so make sure the target exists.
Path('../datasets').mkdir(parents=True, exist_ok=True)

# Persist the labelled pairs, the enriched queries and the venue table.
# The index is written as the first column, as before.
df_training.to_csv('../datasets/ground.csv')
historical_queries.to_csv('../datasets/historical_queries.csv')
df.to_csv('../datasets/df.csv')