In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#!pip install implicit
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler
from implicit.als import AlternatingLeastSquares


In [11]:
!pip list >> requirements.txt

In [5]:
# Load the order-level table: one row per order, comma-separated UTF-8 text
# (despite the .txt extension).
full_data_path = "~/code/Alanoudis/food-delivery-rec/data/updated_data/full_data.txt"
full_data = pd.read_csv(
    full_data_path,
    sep=',',
    encoding='utf-8',
)
# Preview the first rows as the cell output.
full_data.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,day_of_week,order_time,order_day,chain_id,vendor_geohash,cuisine_origin,avg_vendor_rating,num_products,total_order_value,products_ordered
0,008ab40ac0,w21z7,9390,8ace9ccb,2,01:52:03,3 days,aece2f12,w21z7,chinese,3.5,1,0.4,Spicy中辣
1,008ce71183,w21zb,7057,f0d84faa,5,18:11:48,34 days,fc3b6153,w21zc,chinese,3.5,2,12.8,"Kway Teow Goreng with Petai & Prawns 虾仁臭豆炒河粉, ..."
2,008ce71183,w21zb,7058,a23e4559,2,19:22:26,31 days,788f82f6,w21zb,american,3.8,4,14.8,"Tuna D'Licious, Spicy Chicken Pizza Baguette, ..."
3,008ce71183,w21zb,7059,a23e4559,3,18:54:16,18 days,788f82f6,w21zb,american,3.6,4,11.6,"Tuna D'Licious, Chocolate Eclair, Tuna D'Licio..."
4,00ba08bab4,w21zt,347,78ce75cb,3,20:39:37,39 days,24975bf7,w21zt,american,4.5,1,3.6,McGriddles Feast


In [6]:
# ---------- Utils: parsing & safe lookups ----------
def _parse_order_day(col: pd.Series) -> pd.Series:
    """
    Pull the first unsigned number out of each value and return it as numeric.

    Every value is stringified first, so text such as '34 days' yields 34 and
    already-numeric values are parsed from their string form. Entries that
    contain no digits (including missing values) come back as NaN.
    """
    as_text = col.astype(str)
    first_number = as_text.str.extract(r'(\d+\.?\d*)', expand=False)
    return pd.to_numeric(first_number, errors='coerce')

def _vendor_to_cuisine(vendor_ids, vendor_features):
    """
    Map each vendor_id to its cuisine_origin, using 'unknown' where no match exists.

    Parameters
    ----------
    vendor_ids : pd.Series
        Vendor ids to look up.
    vendor_features : pd.DataFrame
        Indexed by vendor_id, with a 'cuisine_origin' column.

    Returns
    -------
    pd.Series
        One cuisine label per input vendor id, aligned to `vendor_ids`' index.
    """
    lookup = vendor_features['cuisine_origin']
    # Series.map with a Series mapper raises when the mapper's index contains
    # duplicates (e.g. a vendor listed under several cuisines), so keep only
    # the first cuisine recorded for each vendor.
    if not lookup.index.is_unique:
        lookup = lookup[~lookup.index.duplicated(keep='first')]
    return vendor_ids.map(lookup).fillna('unknown')

# ---------- 1) User history ----------
def get_user_history(df, user_id, n=20, sort_by_recency=True):
    """
    Return up to `n` of a user's orders, most recent first, with key columns.

    `order_day` values such as '34 days' are parsed to numbers and treated as
    "days ago", i.e. a smaller number is more recent.
    NOTE(review): confirm that reading of `order_day` against the data source.
    Orders on the same day are ordered by `order_time` descending so the later
    order comes first. If `order_day` is absent or cannot be parsed for any row
    (or `sort_by_recency` is False), the fallback is `order_id` descending.

    Parameters
    ----------
    df : pd.DataFrame
        Order-level data with at least a 'customer_id' column.
    user_id : hashable
        Value matched against `df['customer_id']`.
    n : int, default 20
        Maximum number of rows returned.
    sort_by_recency : bool, default True
        Sort by parsed `order_day` when possible; otherwise use the fallback.

    Returns
    -------
    pd.DataFrame
        The user's orders restricted to the display columns that exist in
        `df`. When the user has no rows, the empty filtered frame is returned
        with all of `df`'s columns.
    """
    sub = df[df['customer_id'] == user_id].copy()
    if sub.empty:
        return sub  # empty DF

    # Parse 'order_day' when present; otherwise leave the helper column all-NaN
    # so the order_id fallback below is used.
    if 'order_day' in sub.columns:
        sub['order_day_num'] = _parse_order_day(sub['order_day'])
    else:
        sub['order_day_num'] = np.nan

    if sort_by_recency and sub['order_day_num'].notna().any():
        sort_cols, ascending = ['order_day_num'], [True]
        if 'order_time' in sub.columns:
            # Within one day the later time is the more recent order.
            sort_cols.append('order_time')
            ascending.append(False)
        sub = sub.sort_values(by=sort_cols, ascending=ascending)
    elif 'order_id' in sub.columns:
        # fallback ordering
        sub = sub.sort_values(by='order_id', ascending=False)

    keep_cols = [
        'order_id','order_time','order_day','vendor_id','cuisine_origin',
        'num_products','total_order_value','products_ordered'
    ]
    keep_cols = [c for c in keep_cols if c in sub.columns]
    return sub[keep_cols].head(n)

# ---------- 2) User taste profile (cuisines & vendors) ----------
def _rank_by_weighted_orders(orders, key, top_k):
    """Aggregate `orders` per `key` and return the `top_k` groups by score."""
    stats = (orders.groupby(key)
                   .agg(orders=('order_id', 'count'),
                        avg_rating=('avg_vendor_rating', 'mean'),
                        total_value=('total_order_value', 'sum'))
                   .reset_index())
    # score = number of orders, down-weighted by the mean rating on a 0-5 scale
    stats['score'] = stats['orders'] * (stats['avg_rating'] / 5.0)
    return stats.sort_values('score', ascending=False).head(top_k)


def get_user_taste(df, user_id, top_k=5):
    """
    Summarize a user's cuisine and vendor preferences with a simple weighted score:
    score = orders * (avg_vendor_rating / 5).

    Parameters
    ----------
    df : pd.DataFrame
        Order-level data with 'customer_id', 'order_id', 'cuisine_origin',
        'vendor_id', 'avg_vendor_rating' and 'total_order_value' columns.
    user_id : hashable
        Value matched against `df['customer_id']`.
    top_k : int, default 5
        Number of cuisines / vendors to keep.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        `(top_cuisines, top_vendors)`, each with the grouping column plus
        'orders', 'avg_rating', 'total_value' and 'score', sorted by score
        descending. For a user with no orders both frames are empty but still
        carry those columns, so callers can select them without a KeyError.
    """
    sub = df[df['customer_id'] == user_id]
    if sub.empty:
        stat_cols = ['orders', 'avg_rating', 'total_value', 'score']
        return (pd.DataFrame(columns=['cuisine_origin'] + stat_cols),
                pd.DataFrame(columns=['vendor_id'] + stat_cols))

    top_cuisines = _rank_by_weighted_orders(sub, 'cuisine_origin', top_k)
    top_vendors = _rank_by_weighted_orders(sub, 'vendor_id', top_k)
    return top_cuisines, top_vendors


In [7]:

# ---------- 3) Show recommendations with cuisines ----------
def enrich_recommendations(recs_df, vendor_features):
    """
    Add the cuisine of each recommended vendor and tidy the columns.

    Parameters
    ----------
    recs_df : pd.DataFrame
        Recommendations with vendor ids either in a 'vendor_id' column or in
        the index.
    vendor_features : pd.DataFrame
        Indexed by vendor_id, with a 'cuisine_origin' column.

    Returns
    -------
    pd.DataFrame
        A copy of the recommendations limited to whichever of 'vendor_id',
        'cuisine_origin', 'hybrid_score', 'cf_score', 'cb_score' are present.
        `recs_df` itself is not modified.
    """
    recs = recs_df.copy()
    if 'vendor_id' not in recs.columns:
        # vendor_id is the index: name it before resetting so the new column is
        # called 'vendor_id' whatever the index was named (or if it was unnamed).
        recs = recs.rename_axis('vendor_id').reset_index()
    recs['cuisine_origin'] = _vendor_to_cuisine(recs['vendor_id'], vendor_features)
    cols = [c for c in ['vendor_id','cuisine_origin','hybrid_score','cf_score','cb_score'] if c in recs.columns]
    return recs[cols]

# ---------- 4) Quick alignment checks ----------
def evaluate_alignment(user_id, df, recs_df, vendor_features, top_k_cuisines=3):
    """
    Sanity-check a recommendation list against one user's order history.

    Returns `(recs_en, summary)` where `recs_en` is the cuisine-enriched
    recommendation frame and `summary` is a Series named 'alignment' holding:
    - hit_rate_top_cuisines: share of recs whose cuisine is among the user's
      top-K cuisines (NaN when there are no recs)
    - familiarity_rate_vendors_seen_before: share of distinct rec vendors the
      user has already ordered from
    - jaccard_cuisine_overlap: Jaccard index between the rec cuisines
      (ignoring 'unknown') and every cuisine in the user's history
    """
    # The user's strongest cuisines according to get_user_taste
    taste_cuisines, _ = get_user_taste(df, user_id, top_k=top_k_cuisines)
    if taste_cuisines.empty:
        user_top_cuisines = set()
    else:
        user_top_cuisines = set(taste_cuisines['cuisine_origin'])

    # Cuisines of the recommended vendors
    recs_en = enrich_recommendations(recs_df, vendor_features)
    rec_cuisines = recs_en['cuisine_origin'].tolist()
    rec_cuisine_set = set(rec_cuisines).difference({'unknown'})

    # Hit rate over the full rec list (duplicates and 'unknown' count in the denominator)
    n_recs = len(rec_cuisines)
    hits = len([cuisine for cuisine in rec_cuisines if cuisine in user_top_cuisines])
    hit_rate = hits / n_recs if n_recs else np.nan

    # Everything the user has ordered so far
    history = df.loc[df['customer_id'] == user_id]

    # Familiarity: fraction of distinct recommended vendors already in the history
    seen_vendors = set(history['vendor_id'])
    rec_vendors = set(recs_en['vendor_id'])
    familiarity_rate = len(rec_vendors & seen_vendors) / max(len(rec_vendors), 1)

    # Jaccard overlap between history cuisines and recommended cuisines
    user_all_cuisines = set(history['cuisine_origin'].unique())
    shared = user_all_cuisines & rec_cuisine_set
    combined = user_all_cuisines | rec_cuisine_set
    jaccard = len(shared) / max(len(combined), 1)

    summary = {
        'user_top_cuisines': list(user_top_cuisines),
        'rec_cuisines': rec_cuisines,
        'hit_rate_top_cuisines': round(hit_rate, 3),
        'familiarity_rate_vendors_seen_before': round(familiarity_rate, 3),
        'jaccard_cuisine_overlap': round(jaccard, 3)
    }
    return recs_en, pd.Series(summary, name='alignment')


In [8]:
# --- Build a simple recommendations DataFrame (no CF yet) ---
# Basis: top vendors by order count & rating, excluding vendors the user already used.
# The list is user-specific (seen vendors are removed), so `user_id` must be the
# same customer that the evaluation cell passes to evaluate_alignment() together
# with `recommendations`. Keep the two ids in sync.
user_id = '2e7276ad3a'
seen_vendors = set(full_data.loc[full_data['customer_id'] == user_id, 'vendor_id'])

vendor_stats = (full_data.groupby('vendor_id')
                .agg(order_count=('order_id', 'count'),
                     avg_rating=('avg_vendor_rating', 'mean'))
                .reset_index())

# exclude seen vendors; keep non-null ratings
vendor_stats = vendor_stats[~vendor_stats['vendor_id'].isin(seen_vendors)]
# .copy() yields an independent frame, so the column assignments below do not
# write into a filtered slice (avoids SettingWithCopyWarning).
vendor_stats = vendor_stats[vendor_stats['avg_rating'].notna()].copy()

# simple hybrid score = normalized count * 0.6 + normalized rating * 0.4
# Both inputs are min-max scaled so the two weights are comparable.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
vendor_stats[['order_count_n','avg_rating_n']] = scaler.fit_transform(
    vendor_stats[['order_count','avg_rating']]
)
vendor_stats['hybrid_score'] = 0.6*vendor_stats['order_count_n'] + 0.4*vendor_stats['avg_rating_n']

# keep top-N
recommendations = vendor_stats.sort_values('hybrid_score', ascending=False).head(20)[
    ['vendor_id','hybrid_score']
].copy()
# Placeholder columns: no collaborative-filtering or content-based model exists
# yet, so both simply mirror the hybrid score.
recommendations['cf_score'] = recommendations['hybrid_score']
recommendations['cb_score'] = recommendations['hybrid_score']

# --- Ensure vendor_features is a DataFrame indexed by vendor_id with 'cuisine_origin' ---
# If vendor_features already exists in the kernel, normalize it. Otherwise derive it from full_data.
# Either way the index ends up unique (one row per vendor): mapping vendor ids
# through a Series with a duplicated index raises, and dropping duplicate
# (vendor_id, cuisine_origin) pairs alone still leaves a vendor that appears
# under two cuisines with two rows.
if 'vendor_features' not in globals():
    vendor_features = (full_data[['vendor_id','cuisine_origin']]
                       .drop_duplicates(subset='vendor_id')
                       .set_index('vendor_id'))
else:
    if 'vendor_id' in vendor_features.columns:
        vendor_features = vendor_features.set_index('vendor_id')
    # guarantee the column name matches what _vendor_to_cuisine expects
    if 'cuisine_origin' not in vendor_features.columns and 'primary_cuisine' in vendor_features.columns:
        vendor_features = vendor_features.rename(columns={'primary_cuisine': 'cuisine_origin'})
    # keep the first row recorded for each vendor
    vendor_features = vendor_features[~vendor_features.index.duplicated(keep='first')]

# --- User cuisine distribution (all-time) ---
# Share of the user's orders per cuisine, computed from full_data.
# Result columns: 'cuisine_origin', 'share'.
user_cuisine_counts = (
    full_data.loc[full_data['customer_id']==user_id, 'cuisine_origin']
    .value_counts(normalize=True)
    .rename('share')
    .reset_index()
    # only has an effect when reset_index() named the label column 'index'
    .rename(columns={'index':'cuisine_origin'})
)

# The distribution of recommended cuisines can only be computed once the
# enriched recommendations exist, i.e. after evaluate_alignment has been run.


In [9]:
# Sample customer to inspect. `recommendations` was built in the previous cell
# with that cell's `user_id` (its seen vendors were excluded), so if the id is
# changed here, rebuild `recommendations` for the same customer first.
user_id = '2e7276ad3a'  # or any id, e.g., '008ab40ac0'

# Most recent orders for this customer
print("— User history (latest ~20 orders) —")
display(get_user_history(full_data, user_id, n=20))

# Top cuisines / vendors ranked by orders * (avg rating / 5)
print("— User taste profile —")
uc_top, uv_top = get_user_taste(full_data, user_id, top_k=5)
print("Top cuisines:")
display(uc_top)
print("Top vendors:")
display(uv_top)

# Recommendations with cuisines attached, plus the alignment metrics
# (top-cuisine hit rate, vendor familiarity, cuisine Jaccard overlap)
print("— Recommendations (with cuisines) —")
recs_enriched, align = evaluate_alignment(user_id, full_data, recommendations, vendor_features, top_k_cuisines=3)
display(recs_enriched)
print("\n— Alignment summary —")
display(align)


— User history (latest ~20 orders) —


Unnamed: 0,order_id,order_time,order_day,vendor_id,cuisine_origin,num_products,total_order_value,products_ordered
1717,14,18:28:03,18 days,389d8451,snacks,1,1.2,Coke
1716,12,07:50:41,47 days,921b38c7,snacks,3,11.2,"Latte (Regular), Latte (Large), Le Parisien (H..."
1718,15,18:37:54,54 days,573e52c0,japanese,1,6.8,Original King
1712,8,20:03:46,55 days,3c8b6666,italian,3,24.8,"Fungi Risotto, Caesar, Ginger Ale"
1708,3,19:05:15,59 days,b62d39b7,italian,3,34.8,"Diavola, Cinque Formaggi, Linguine Al Granchio"
1714,10,18:15:00,66 days,31883abc,thai,2,8.4,"Vegetarian Spring Rolls (6pcs), Green Curry (M..."
1711,7,18:15:10,70 days,23c3cbb7,snacks,2,11.2,"Chocolate Pint, Vanilla Pint"
1713,9,19:25:30,75 days,31883abc,thai,2,12.0,"Green Curry (Mild Spicy), Glass Noodle Salad (..."
1709,4,20:13:08,81 days,e33ad7ec,italian,4,38.0,"House Salad, Da Paolo Dolcetto, Valrhona Choc ..."
1710,6,20:40:24,87 days,54a7bf39,american,2,10.4,"Five Guys Shake, Five Guys Shake"


— User taste profile —
Top cuisines:


Unnamed: 0,cuisine_origin,orders,avg_rating,total_value,score
3,snacks,4,4.325,28.4,3.46
1,italian,3,4.233333,97.6,2.54
4,thai,2,4.5,20.4,1.8
2,japanese,1,4.0,6.8,0.8
0,american,1,3.8,10.4,0.76


Top vendors:


Unnamed: 0,vendor_id,orders,avg_rating,total_value,score
2,31883abc,2,4.5,20.4,1.8
1,2a89ea8c,1,4.5,4.8,0.9
3,389d8451,1,4.5,1.2,0.9
8,b62d39b7,1,4.3,34.8,0.86
4,3c8b6666,1,4.3,24.8,0.86


— Recommendations (with cuisines) —


Unnamed: 0,vendor_id,cuisine_origin,hybrid_score,cf_score,cb_score
1206,60257fe4,singaporean,0.92385,0.92385,0.92385
1865,91704ac5,japanese,0.872692,0.872692,0.872692
1784,8bfd740f,snacks,0.742523,0.742523,0.742523
2791,d9c749be,chinese,0.597208,0.597208,0.597208
1214,61081731,chinese,0.570346,0.570346,0.570346
2831,dd15e080,chinese,0.468339,0.468339,0.468339
1861,90cca11f,chinese,0.43474,0.43474,0.43474
675,3613129a,italian,0.414935,0.414935,0.414935
2482,c1698370,thai,0.41039,0.41039,0.41039
1906,94a02467,japanese,0.407143,0.407143,0.407143



— Alignment summary —


user_top_cuisines                                                 [thai, snacks, italian]
rec_cuisines                            [singaporean, japanese, snacks, chinese, chine...
hit_rate_top_cuisines                                                                 0.3
familiarity_rate_vendors_seen_before                                                  0.0
jaccard_cuisine_overlap                                                             0.571
Name: alignment, dtype: object