In [4]:
# ---------- PARAMETERS ----------
RATING_MIN = 3.5            # keep orders with product_rating >= this
ALPHA = 0.50                # CF weight in hybrid (CB weight = 1 - ALPHA)
TOP_K = 10                  # evaluation K
USE_CANDIDATE_FILTER = False  # set True to prune candidates (e.g., geohash / top cuisines)
CANDIDATE_POOL = 400        # CF candidate pool size before blending
DIVERSITY_PENALTY = 0.2     # MMR diversity weight by cuisine (0 = off)
RECENCY_DECAY = 0.0         # e.g., 0.002 -> exp(-0.002 * days_ago); 0.0 disables
SEED = 42


In [5]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix
from sklearn.preprocessing import StandardScaler
#!pip install implicit
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.metrics import precision_score, ndcg_score
import warnings
warnings.filterwarnings('ignore')
np.random.seed(SEED)

In [6]:
full_data_path = "~/code/Alanoudis/food-delivery-rec/notebooks/alanoud/full_data_complete.csv"
full_data = pd.read_csv(full_data_path)
full_data.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,...,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6,order_hour,meal_of_day,order_hour_sin,order_hour_cos
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,1900-01-01 12:03:29,85,japanese garlic karaage don,6.0,...,False,False,False,False,False,False,12,lunch,1.224647e-16,-1.0
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,1900-01-01 12:03:29,85,chicken cutlet don,6.8,...,False,False,False,False,False,False,12,lunch,1.224647e-16,-1.0
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,1900-01-01 12:03:29,85,beef sukiyaki don,6.8,...,False,False,False,False,False,False,12,lunch,1.224647e-16,-1.0
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,1900-01-01 12:03:29,85,japanese beef yakiniku don,6.8,...,False,False,False,False,False,False,12,lunch,1.224647e-16,-1.0
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,1900-01-01 12:03:29,85,teriyaki salmon don,8.0,...,False,False,False,False,False,False,12,lunch,1.224647e-16,-1.0


In [7]:
# old 100k data, im looking for products_rating and order_frequency columns
full_data100k_path = "~/code/Alanoudis/food-delivery-rec/data/test-train/full_data100k.csv"
full_data100k = pd.read_csv(full_data100k_path, index_col=0)
full_data100k.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,vendor_geohash,cuisine_origin,order_frequency,product_rating
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,japanese garlic karaage don,6.0,66c9978d,w21z7,japanese,1,4
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,chicken cutlet don,6.8,66c9978d,w21z7,japanese,1,5
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,beef sukiyaki don,6.8,66c9978d,w21z7,japanese,1,3
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,japanese beef yakiniku don,6.8,66c9978d,w21z7,japanese,1,5
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,teriyaki salmon don,8.0,66c9978d,w21z7,japanese,1,5


In [8]:
# add products rating to full_data
# One-line solution:
# Insert 'order_frequency' before 'product_rating'
full_data.insert(full_data.columns.get_loc('chain_id'), 'order_frequency', full_data100k['order_frequency'])

# Then insert 'product_rating' right after it
full_data.insert(full_data.columns.get_loc('chain_id'), 'product_rating', full_data100k['product_rating'])

print("✅ Done! order_frequency and product_rating added between unit_price and chain_id")
print(f"full_data now has {full_data.shape[1]} columns")

✅ Done! order_frequency and product_rating added between unit_price and chain_id
full_data now has 27 columns


In [9]:
vendor_ratings_path = "~/code/Alanoudis/food-delivery-rec/notebooks/alanoud/feature-engineering/Alshaimas-rating/vendor_ratings.csv"
full_data2 = pd.read_csv(vendor_ratings_path)
full_data2 = full_data2.rename(columns={'avg_vendor_rating': 'vendor_rating'})
full_data2.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,day_of_week,order_time,order_day,chain_id,vendor_geohash,cuisine_origin,vendor_rating,num_products,total_order_value,products_ordered
0,00119c8178,w21zu,39095,e7cb5902,2,16:30:07,10 days,ef3142e8,w21zu,malaysian,3.5,1,5.6,Seafood Fried Rice
1,00198e01e4,w21z6,35939,02acaff6,0,13:57:35,85 days,8c51e46b,w21z6,chinese,3.9,4,8.0,"Bean Curd Skin 凉拌腐竹, Dried Beancurd 凉拌素鸡, Blac..."
2,001a5689fc,w21z3,48288,a2c60e71,0,20:10:35,78 days,dd69fe77,w21z3,singaporean,4.5,1,2.4,Carrot Cake (Melur)
3,001a5689fc,w21z3,48289,a3bc472c,4,20:50:40,40 days,c59edb7d,w21z9,snacks,4.0,1,1.6,Served without Fries
4,001a5689fc,w21z3,48290,de18b671,3,20:05:42,53 days,f88ffd2b,w21z3,american,4.5,1,4.0,WHOPPER® Meal


In [11]:
# --- 1.1 Clean order_day to integer days
def clean_order_day(df):
    out = df.copy()
    out['order_day'] = (
        out['order_day'].astype(str).str.extract(r'(\d+)')[0].astype(int)
    )
    return out

# --- 1.2 Apply rating policy (keep only rows that can teach positive preference)
def apply_rating_policy(df, min_rating=RATING_MIN):
    return df[df['product_rating'] >= min_rating].copy()

# --- 1.3 Leave-last-one per user (next-item eval)
def leave_last_one(df, user_col='customer_id', time_col='order_day'):
    g = df.sort_values([user_col, time_col]).groupby(user_col, group_keys=False)
    test = g.tail(1)
    train = df.drop(test.index)
    return train, test
# --- ensure vendor_rating exists in _base by merging from full_data2 ---
# 1) align dtypes for join keys to avoid silent mismatches
for _df in (_base, full_data2):
    _df['order_id'] = _df['order_id'].astype(str)
    _df['vendor_id'] = _df['vendor_id'].astype(str)

# 2) merge order-level vendor_rating if available
cols_from_ratings = ['order_id', 'vendor_id', 'vendor_rating']
available = [c for c in cols_from_ratings if c in full_data2.columns]
if {'order_id','vendor_id','vendor_rating'}.issubset(available):
    _base = _base.merge(full_data2[cols_from_ratings], on=['order_id','vendor_id'], how='left')

# 3) if vendor_rating still missing for some rows, compute a Bayesian-smoothed vendor rating from product_rating (>= RATING_MIN)
def bayesian_avg(series, m=10, prior=4.0):
    n = series.count()
    mean = series.mean() if n > 0 else prior
    return (n/(n+m))*mean + (m/(n+m))*prior

# vendor-level fallback map (only from filtered rows that passed your rating policy)
vr_map = (_base.groupby('vendor_id')['product_rating']
               .apply(lambda s: bayesian_avg(s.dropna(), m=10, prior=4.0)))
# create the column if missing
if 'vendor_rating' not in _base.columns:
    _base['vendor_rating'] = np.nan

# fill NaNs with vendor-level Bayesian rating
_base['vendor_rating'] = _base['vendor_rating'].fillna(_base['vendor_id'].map(vr_map))

# final safety: if anything is still NaN, fill with global prior
_base['vendor_rating'] = _base['vendor_rating'].fillna(4.0)

# --- now do your order-level aggregation safely ---
order_level = (_base.groupby('order_id').agg(
    customer_id=('customer_id','first'),
    vendor_id=('vendor_id','first'),
    cuisine_origin=('cuisine_origin','first'),
    order_day=('order_day','first'),
    unit_price=('unit_price','sum'),
    product_rating=('product_rating','mean'),
    vendor_rating=('vendor_rating','mean'),
    customer_geohash=('customer_geohash','first'),
    vendor_geohash=('vendor_geohash','first')
).reset_index())

# split as before
train_orders, test_orders = leave_last_one(order_level, 'customer_id', 'order_day')
print("Train orders:", len(train_orders), " Test orders:", len(test_orders))

# ---------- RUN PREP ----------
_base = full_data.copy()
_base = clean_order_day(_base)
_base = apply_rating_policy(_base, RATING_MIN)

# Collapse to order-level if you haven’t already:
# (If your 'order_level_data' already exists from earlier, you can use that instead of this quick version.)
order_level = (_base.groupby('order_id').agg(
    customer_id=('customer_id','first'),
    vendor_id=('vendor_id','first'),
    cuisine_origin=('cuisine_origin','first'),
    order_day=('order_day','first'),
    unit_price=('unit_price','sum'),
    product_rating=('product_rating','mean'),
    vendor_rating=('vendor_rating','mean'),
    customer_geohash=('customer_geohash','first'),
    vendor_geohash=('vendor_geohash','first')
).reset_index())

train_orders, test_orders = leave_last_one(order_level, 'customer_id', 'order_day')
print("Train orders:", len(train_orders), " Test orders:", len(test_orders))


Train orders: 26497  Test orders: 9125


KeyError: "Column(s) ['vendor_rating'] do not exist"

In [None]:
# --- 2.1 Build user×vendor interaction (train-only)
def build_interactions(df):
    # per (user,vendor): frequency + average rating
    grp = df.groupby(['customer_id','vendor_id']).agg(
        cnt=('order_id','count'),
        pr_mean=('product_rating','mean'),
        vr_mean=('vendor_rating','mean'),
        last_day=('order_day','max')
    ).reset_index()

    # optional recency decay on count (helps drift)
    if RECENCY_DECAY > 0.0:
        max_day = df['order_day'].max()
        age = (max_day - grp['last_day']).clip(lower=0)
        rec_weight = np.exp(-RECENCY_DECAY * age)
    else:
        rec_weight = 1.0

    # implicit strength: log(1+count) * avg product rating scaled, * vendor rating scaled, * recency
    score = np.log1p(grp['cnt']) * (grp['pr_mean']/5.0) * (grp['vr_mean'].fillna(grp['vr_mean'].mean())/5.0) * rec_weight
    grp['score'] = score

    # encode
    usr_cats = pd.Categorical(grp['customer_id'])
    vnd_cats = pd.Categorical(grp['vendor_id'])
    grp['u'] = usr_cats.codes
    grp['i'] = vnd_cats.codes

    # users x items (for recommend()) and items x users (for fit)
    X_ui = coo_matrix((grp['score'], (grp['u'], grp['i'])),
                      shape=(len(usr_cats.categories), len(vnd_cats.categories))).tocsr()

    maps = {
        'user_index_to_id': dict(enumerate(usr_cats.categories)),
        'item_index_to_id': dict(enumerate(vnd_cats.categories)),
    }
    maps['user_id_to_index'] = {v:k for k,v in maps['user_index_to_id'].items()}
    maps['item_id_to_index'] = {v:k for k,v in maps['item_index_to_id'].items()}
    return X_ui, maps, grp

X_ui, maps, uv_raw = build_interactions(train_orders)
print("Users x Items matrix:", X_ui.shape)

# --- 2.2 BM25 weighting & train ALS (items x users for .fit)
Xw = bm25_weight(X_ui).tocsr()
als = AlternatingLeastSquares(factors=64, regularization=0.08, iterations=30, random_state=SEED)
als.fit(Xw.T)  # items x users

# --- 2.3 Build vendor content features (train-only)
def bayesian_avg(series, m=10, prior=4.0):
    n = series.count()
    mean = series.mean() if n > 0 else prior
    return (n/(n+m))*mean + (m/(n+m))*prior

vfeat = (train_orders.groupby('vendor_id').agg(
    cuisine_origin=('cuisine_origin','first'),
    vendor_geohash=('vendor_geohash','first'),
    vendor_rating=('vendor_rating', lambda s: bayesian_avg(s.dropna()))
).reset_index())

# normalize rating
vfeat['vendor_rating_norm'] = (vfeat['vendor_rating'] - vfeat['vendor_rating'].min()) / \
                               max(1e-9, (vfeat['vendor_rating'].max() - vfeat['vendor_rating'].min()))

# cuisine one-hot
C = pd.get_dummies(vfeat['cuisine_origin'])
content = pd.concat([C, vfeat[['vendor_rating_norm']]], axis=1)
content.index = vfeat['vendor_id']

# cosine similarity (vendor x vendor)
vsim = pd.DataFrame(cosine_similarity(content), index=content.index, columns=content.index)

# --- 2.4 Train-only helper maps & sets
seen_by_user = train_orders.groupby('customer_id')['vendor_id'].apply(set).to_dict()
popular_vendors = (train_orders.groupby('vendor_id').size().sort_values(ascending=False).index.tolist())
