In [2]:
import os
import ast
import json
import numpy as np
import pandas as pd
import pickle
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# --- 1. Data Cleaning & Feature Engineering Functions ---

def clean_dataset(df):
    """Return a cleaned copy of the merged review/business frame.

    The input frame is not modified. The copy gets:

    * the numeric columns listed below coerced with ``pd.to_numeric``
      (unparseable values become NaN),
    * ``date`` coerced to datetime (unparseable values become NaT),
    * ``attributes`` parsed from its string form into a dict
      (``{}`` whenever the cell is missing, unparseable, or not a dict),
    * ``price_range``: numeric ``RestaurantsPriceRange2`` attribute, 0 if absent,
    * ``categories`` with missing values replaced by ``''`` and
      ``category_list`` holding the result of splitting it on ``', '``
      (an empty ``categories`` string therefore yields ``['']``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numeric columns listed below plus ``date``,
        ``attributes`` and ``categories``.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    cleaned_df = df.copy()

    numeric_columns = [
        'stars_review',
        'useful',
        'funny',
        'cool',
        'latitude',
        'longitude',
        'stars_business',
        'review_count',
        'is_open',
    ]
    for col in numeric_columns:
        cleaned_df[col] = pd.to_numeric(cleaned_df[col], errors='coerce')

    cleaned_df['date'] = pd.to_datetime(cleaned_df['date'], errors='coerce')

    def parse_attributes(attr_str):
        """Parse one raw ``attributes`` cell into a dict; ``{}`` if unusable."""
        if pd.isna(attr_str):
            return {}
        raw = str(attr_str)
        try:
            parsed = ast.literal_eval(raw)
        # These are the exceptions literal_eval is documented to raise on bad
        # input; anything else (e.g. KeyboardInterrupt) must propagate.
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            try:
                # Fallback for JSON-ish text (true/false/null) written with
                # single quotes.
                parsed = json.loads(raw.replace("'", '"'))
            except (ValueError, RecursionError):
                return {}
        # A literal such as "None" parses successfully but is not a mapping;
        # keep the column uniformly dict-typed.
        return parsed if isinstance(parsed, dict) else {}

    cleaned_df['attributes'] = cleaned_df['attributes'].apply(parse_attributes)

    cleaned_df['price_range'] = cleaned_df['attributes'].apply(
        lambda x: x.get('RestaurantsPriceRange2') if isinstance(x, dict) else None
    )
    cleaned_df['price_range'] = pd.to_numeric(cleaned_df['price_range'], errors='coerce').fillna(0)

    cleaned_df['categories'] = cleaned_df['categories'].fillna('')
    cleaned_df['category_list'] = cleaned_df['categories'].str.split(', ')

    return cleaned_df

def create_advanced_features(df, n_clusters=50, random_state=42):
    """Build the numeric feature frame from a cleaned dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned frame providing ``stars_business``, ``review_count``,
        ``price_range``, ``is_open``, ``stars_review``, ``useful``, ``funny``,
        ``cool``, ``latitude`` and ``longitude``.
    n_clusters : int, default 50
        Upper bound on the number of KMeans location clusters; capped at the
        number of rows that have usable coordinates.
    random_state : int, default 42
        Seed passed to KMeans.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``. ``location_cluster`` holds the KMeans label of
        each row's (latitude, longitude), or ``-1`` for rows whose
        coordinates are missing or non-finite.
    """
    features = pd.DataFrame(index=df.index)
    features['business_rating'] = df['stars_business']  # (will be dropped later for training)
    features['review_count'] = df['review_count']
    features['price_range'] = df['price_range']
    features['is_open'] = df['is_open']

    features['review_rating'] = df['stars_review']  # (will be dropped later)
    features['review_useful'] = df['useful']
    features['review_funny'] = df['funny']
    features['review_cool'] = df['cool']

    features['review_engagement'] = (features['review_useful'] +
                                     features['review_funny'] +
                                     features['review_cool'])

    # Rating per price level; rows without a positive price level keep the
    # plain rating.
    features['value_score'] = np.where(
        features['price_range'] > 0,
        features['business_rating'] / features['price_range'],
        features['business_rating']
    )

    features['popularity_score'] = np.log1p(features['review_count'])

    features['rating_deviation'] = (features['review_rating'] - features['business_rating']).abs()

    features['latitude'] = df['latitude']
    features['longitude'] = df['longitude']

    # KMeans rejects NaN/inf input and n_clusters=0, so cluster only the rows
    # with finite coordinates and mark the remaining rows with -1.
    coords = df[['latitude', 'longitude']].to_numpy(dtype=float)
    has_coords = np.isfinite(coords).all(axis=1)
    n_valid = int(has_coords.sum())
    cluster_labels = np.full(len(df), -1, dtype=np.int32)
    if n_valid > 0:
        kmeans = KMeans(n_clusters=min(n_clusters, n_valid), random_state=random_state)
        # Skip the masked copy when every row is usable.
        fit_coords = coords if n_valid == len(coords) else coords[has_coords]
        cluster_labels[has_coords] = kmeans.fit_predict(fit_coords)
    features['location_cluster'] = cluster_labels

    return features

def create_text_features(df, max_features=100):
    """Fit a TF-IDF vectorizer on review text plus categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``text`` and ``categories``; missing values are treated
        as empty strings.
    max_features : int or None, default 100
        Passed to ``TfidfVectorizer`` as the vocabulary size cap.

    Returns
    -------
    (pandas.DataFrame, TfidfVectorizer)
        Dense TF-IDF matrix with columns ``text_feature_0 .. text_feature_{k-1}``
        indexed like ``df``, and the fitted vectorizer. ``k`` is the size of
        the fitted vocabulary, which can be smaller than ``max_features`` on
        a small corpus.
    """
    tfidf = TfidfVectorizer(
        max_features=max_features,
        stop_words='english',
        ngram_range=(1, 2)
    )
    text_data = df['text'].fillna('') + ' ' + df['categories'].fillna('')
    text_features = tfidf.fit_transform(text_data)

    # Name columns after the matrix actually produced: max_features is only an
    # upper bound, so range(max_features) can mismatch the matrix width.
    n_columns = text_features.shape[1]
    text_features_df = pd.DataFrame(
        text_features.toarray(),  # NOTE: densifies the whole matrix in memory
        columns=[f'text_feature_{i}' for i in range(n_columns)],
        # Keep the caller's row labels so a later axis=1 concat lines up.
        index=df.index,
    )
    return text_features_df, tfidf

def prepare_final_dataset(merged_df):
    """Clean the merged frame and assemble the full feature matrix.

    Runs ``clean_dataset``, ``create_advanced_features`` and
    ``create_text_features`` in turn, printing a progress line before each
    stage, then joins the two feature frames column-wise.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Raw merged review/business data.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(final_features, cleaned_df)``. The fitted TF-IDF vectorizer is
        discarded here.
    """
    print("Cleaning dataset...")
    cleaned_df = clean_dataset(merged_df)

    print("Creating advanced features...")
    features_df = create_advanced_features(cleaned_df)

    print("Creating text features...")
    # Unpack the tuple: use only the text features DataFrame
    text_features_df, _ = create_text_features(cleaned_df)

    print("Combining features...")
    # Both frames have one row per input row in the same order, but concat
    # aligns on index labels. Give the text features the same labels so a
    # non-default input index cannot produce misaligned, NaN-padded rows.
    text_features_df.index = features_df.index
    final_features = pd.concat([features_df, text_features_df], axis=1)

    return final_features, cleaned_df

def prepare_training_data(merged_df):
    """Produce the model inputs ``(X, y)`` from the raw merged frame.

    ``y`` is ``stars_business * log1p(review_count)``, named
    ``ranking_score``. ``X`` is the full feature matrix minus the
    rating-derived columns listed in ``leaky_columns`` (any that are absent
    are simply skipped).
    """
    feature_matrix, cleaned = prepare_final_dataset(merged_df)

    # Target: business rating weighted by log-scaled review volume.
    popularity = np.log1p(cleaned['review_count'])
    y = (cleaned['stars_business'] * popularity).rename('ranking_score')

    # Columns built from the ratings that also feed the target.
    leaky_columns = ['business_rating', 'review_rating', 'value_score', 'rating_deviation']
    X = feature_matrix.drop(columns=leaky_columns, errors='ignore')

    return X, y

# --- 2. Create and Save the Training Dataset and TF-IDF Vectorizer ---

# Data folder. Defaults to the original project location; set the
# FOOD_RECOMMENDER_DATA_DIR environment variable to run on another machine.
data_folder = os.environ.get(
    "FOOD_RECOMMENDER_DATA_DIR",
    "/Users/benben/workspace/NUS_Projects/food-recommender/data",
)
os.makedirs(data_folder, exist_ok=True)

# Read the merged CSV file (kept inside the data folder so there is a single
# path to configure)
merged_file = os.path.join(data_folder, "merged_food_reviews.csv")
merged_file_df = pd.read_csv(merged_file)
print("Original dataset shape:", merged_file_df.shape)

print("\nPreparing training data...")
X, y = prepare_training_data(merged_file_df)
print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)

# Save the training data to the data folder
data_filename = os.path.join(data_folder, "training_data.pkl")
with open(data_filename, 'wb') as f:
    pickle.dump((X, y), f)
print(f"\nTraining data saved to: {data_filename}")

# Re-create cleaned_df from merged_file_df for saving the TF-IDF vectorizer.
# NOTE(review): this repeats the cleaning and TF-IDF fit already performed
# inside prepare_training_data, because that call returns only (X, y) and not
# the fitted vectorizer. Consider exposing the vectorizer from the pipeline to
# avoid the second pass.
cleaned_df = clean_dataset(merged_file_df)
_, fitted_tfidf = create_text_features(cleaned_df)
tfidf_filename = os.path.join(data_folder, "tfidf_vectorizer.pkl")
with open(tfidf_filename, "wb") as f:
    pickle.dump(fitted_tfidf, f)
print(f"TF-IDF vectorizer saved to: {tfidf_filename}")

Original dataset shape: (5222904, 22)

Preparing training data...
Cleaning dataset...
Creating advanced features...
Creating text features...
Combining features...
Feature matrix shape: (5222904, 111)
Target vector shape: (5222904,)

Training data saved to: /Users/benben/workspace/NUS_Projects/food-recommender/data/training_data.pkl
TF-IDF vectorizer saved to: /Users/benben/workspace/NUS_Projects/food-recommender/data/tfidf_vectorizer.pkl


In [3]:
import pickle

# Pickle to inspect (presumably the training data saved by the preparation
# cell -- confirm the relative path resolves to the same data folder).
file_path = "../data/training_data.pkl"

# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(file_path, "rb") as f:
    data = pickle.load(f)

# What kind of object came back, and how many items does it hold?
print(type(data))
if hasattr(data, '__len__'):
    print(len(data))
else:
    print("No length attribute")

# Small preview for the two container types handled here:
# the first 10 keys of a dict, or the first 5 elements of a list.
if isinstance(data, dict):
    print([*data.keys()][:10])
elif isinstance(data, list):
    print(data[:5])

<class 'tuple'>
2


In [4]:
# Report the type of the first and then the second element of the loaded object.
for position in (0, 1):
    print(type(data[position]))


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [5]:
df = data[0]  # Extract DataFrame (first element of the loaded object)

# Check basic info. DataFrame.info() prints its report itself and returns
# None, so it is called directly; wrapping it in print() emitted a stray "None".
df.info()

# Show first few rows
display(df.head())

# Check for missing values (count of nulls per column)
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5222904 entries, 0 to 5222903
Columns: 111 entries, review_count to text_feature_99
dtypes: float64(104), int32(1), int64(6)
memory usage: 4.3 GB
None


Unnamed: 0,review_count,price_range,is_open,review_useful,review_funny,review_cool,review_engagement,popularity_score,latitude,longitude,...,text_feature_90,text_feature_91,text_feature_92,text_feature_93,text_feature_94,text_feature_95,text_feature_96,text_feature_97,text_feature_98,text_feature_99
0,169,2.0,1,0,0,0,0,5.135798,40.210196,-75.223639,...,0.163739,0.23642,0.0,0.0,0.0,0.0,0.233233,0.225804,0.0,0.0
1,47,1.0,1,0,0,0,0,3.871201,32.207233,-110.980864,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,181,2.0,1,1,0,1,2,5.204007,40.079848,-75.02508,...,0.0,0.0,0.0,0.398945,0.0,0.0,0.0,0.0,0.0,0.0
3,32,2.0,0,1,0,1,2,3.496508,29.962102,-90.087958,...,0.136585,0.0,0.141275,0.163491,0.0,0.0,0.0,0.0,0.0,0.358466
4,273,2.0,0,1,2,1,4,5.613128,39.938013,-75.148131,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.338898,0.0


review_count       0
price_range        0
is_open            0
review_useful      0
review_funny       0
                  ..
text_feature_95    0
text_feature_96    0
text_feature_97    0
text_feature_98    0
text_feature_99    0
Length: 111, dtype: int64
