In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#pip install implicit
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler
from implicit.als import AlternatingLeastSquares



In [2]:
big_data_path = "/home/zahra/code/Alanoudis/food-delivery-rec/data/raw_data/orders_sg25k.txt"
big_data = pd.read_csv(big_data_path, sep=',', encoding='utf-8')
big_data.head()

Unnamed: 0.1,Unnamed: 0,customer_id,geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day
0,0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days
1,1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days
2,2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days
3,3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days
4,4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days


# 1. *Data Cleaning*

In [3]:
big_data.drop(columns=['Unnamed: 0'], inplace=True)

### 1.1 Import products and vendors

In [4]:
file_path2 = "/home/zahra/code/Alanoudis/food-delivery-rec/data/updated_data/products_sg.txt"
products = pd.read_csv(file_path2)
# Drop the unnecessary index column
products.drop(columns=['Unnamed: 0'], inplace=True)
# Multiply price by 1000 to make it more interpretable
unit_price = products['unit_price'] * 1000
products['unit_price'] = unit_price
products.head()

Unnamed: 0,vendor_id,product_id,name,unit_price
0,15bbf316,59099e089514,Meat & Seafood Combo 海鲜套餐,3.6
1,e7b24dc0,52e1017fdbd9,Sausage Egg McMuffin® Meal,2.4
2,e7b24dc0,fcb7110cd932,Scrambled Egg Burger w Chicken Meal,2.4
3,7112a20b,d3e7708c2bc9,Maguro Sushi,1.2
4,6137ef21,bf0d5ac0e03c,Family Meal - Prosp Chic TwLEVMx2 + ChBurgerHM,10.0


In [5]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+\.', '', text)         # Remove leading numbers like "204."
    text = re.sub(r'[^\w\s]', '', text)       # Remove punctuation
    text = re.sub(r'\d+', '', text)           # Remove remaining numbers
    text = text.strip()

In [6]:
# DROP PRODUCT NAME COLUMN
products.drop(columns=['name'], inplace=True)
products.head()

Unnamed: 0,vendor_id,product_id,unit_price
0,15bbf316,59099e089514,3.6
1,e7b24dc0,52e1017fdbd9,2.4
2,e7b24dc0,fcb7110cd932,2.4
3,7112a20b,d3e7708c2bc9,1.2
4,6137ef21,bf0d5ac0e03c,10.0


In [7]:
file_path3 = "~/code/Alanoudis/food-delivery-rec/data/raw_data/vendors_sg.txt"
vendors = pd.read_csv(file_path3)
# Drop the unnecessary index column
vendors.drop(columns=['Unnamed: 0'], inplace=True)
vendors.head()

Unnamed: 0,vendor_id,chain_id,geohash,primary_cuisine
0,b160c319,d2786168,w21z6,mexican
1,9c8f010e,d2786168,w21ze,mexican
2,03eb25e1,5055ab25,w21ze,bak kut teh
3,3613129a,8984acb6,w23b1,italian
4,0946c9e5,1e3a2913,w21z4,bak kut teh


### 1.2 Merge Into One Dataframe

In [8]:
# Step 1: Merge orders with products on vendor_id and product_id
orders_products = big_data.merge(products, on=['vendor_id', 'product_id'], how='left')

# Step 2: Merge the result with vendors on vendor_id
full_data = orders_products.merge(vendors, on='vendor_id', how='left')

# Preview the final merged dataset
full_data.head()

Unnamed: 0,customer_id,geohash_x,order_id,vendor_id,product_id,day_of_week,order_time,order_day,unit_price,chain_id,geohash_y,primary_cuisine
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,6.0,66c9978d,w21z7,ramen
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,6.8,66c9978d,w21z7,ramen
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,6.8,66c9978d,w21z7,ramen
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,6.8,66c9978d,w21z7,ramen
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,8.0,66c9978d,w21z7,ramen


In [9]:
# Rename geohash_x to customer_geohash and geohash_y to vendor_geohash
full_data.rename(columns={'geohash_x': 'customer_geohash', 'geohash_y': 'vendor_geohash'}, inplace=True)
full_data.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,unit_price,chain_id,vendor_geohash,primary_cuisine
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,6.0,66c9978d,w21z7,ramen
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,6.8,66c9978d,w21z7,ramen
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,6.8,66c9978d,w21z7,ramen
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,6.8,66c9978d,w21z7,ramen
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,8.0,66c9978d,w21z7,ramen


### 1.3 NaN values [will be revisited]

In [10]:
# For now, we will drop the null values
full_data.isnull().sum()

customer_id            0
customer_geohash       0
order_id               0
vendor_id              0
product_id             0
day_of_week            0
order_time             0
order_day              0
unit_price             0
chain_id            3487
vendor_geohash         0
primary_cuisine        0
dtype: int64

In [11]:
full_data = full_data.dropna(subset=['chain_id'])
full_data.shape

(21513, 12)

In [12]:
full_data.isna().sum()

customer_id         0
customer_geohash    0
order_id            0
vendor_id           0
product_id          0
day_of_week         0
order_time          0
order_day           0
unit_price          0
chain_id            0
vendor_geohash      0
primary_cuisine     0
dtype: int64

In [13]:
print(full_data['unit_price'].isna().value_counts())

unit_price
False    21513
Name: count, dtype: int64


In [14]:
rows_with_na = full_data[full_data.isna().any(axis=1)]
print(rows_with_na)


Empty DataFrame
Columns: [customer_id, customer_geohash, order_id, vendor_id, product_id, day_of_week, order_time, order_day, unit_price, chain_id, vendor_geohash, primary_cuisine]
Index: []


In [15]:
# Since it's one row, I'll drop it
full_data = full_data.dropna()
full_data.isna().sum()


customer_id         0
customer_geohash    0
order_id            0
vendor_id           0
product_id          0
day_of_week         0
order_time          0
order_day           0
unit_price          0
chain_id            0
vendor_geohash      0
primary_cuisine     0
dtype: int64

# *2. Feature Engineering*

### 2.1 Categorize Cuisines

In [16]:
global_cuisine_category = {
    "japanese": "japanese",
    "ramen": "japanese",
    "sushi": "japanese",

    "chinese": "chinese",
    "dim sum": "chinese",
    "mala xiang guo": "chinese",
    "mala soups": "chinese",
    "hokkien mee": "chinese",
    "noodles": "chinese",
    "tea": "chinese",

    "indian": "indian",
    "curry": "indian",

    "thai": "thai",

    "vietnamese": "vietnamese",
    "pho": "vietnamese",

    "singaporean": "singaporean",
    "chicken rice": "singaporean",
    "ban mian": "singaporean",
    "yong tau foo": "singaporean",

    "indonesian": "indonesian",
    "ayam penyet": "indonesian",

    "malaysian": "malaysian",
    "nasi lemak": "malaysian",
    "roti prata": "malaysian",

    "american": "american",
    "burgers": "american",
    "sandwiches": "american",
    "grill": "american",
    "fries": "american",
    "fast food": "american",

    "italian": "italian",
    "pizza": "italian",
    "pasta": "italian",

    "french": "french",
    "german": "german",
    "spanish": "spanish",
    "mexican": "mexican",
    "turkish": "turkish",
    "lebanese": "lebanese",

    # Reassign rice-based dishes
    "rice dishes": "asian",
    "fried rice": "asian",
    "porridge": "asian",

    # Everything else defaults to snacks
}


In [17]:
full_data['cuisine_origin'] = full_data['primary_cuisine'].str.strip().map(global_cuisine_category)
full_data['cuisine_origin'].fillna('snacks', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  full_data['cuisine_origin'].fillna('snacks', inplace=True)


In [18]:
print(full_data[['primary_cuisine', 'cuisine_origin']].drop_duplicates().sort_values(by='primary_cuisine'))

     primary_cuisine cuisine_origin
3347       acai bowl         snacks
66          american       american
51             asian         snacks
203      ayam penyet     indonesian
2915     bak kut teh         snacks
...              ...            ...
92        vegetarian         snacks
1558      vietnamese     vietnamese
142          western         snacks
5098           wraps         snacks
657     yong tau foo    singaporean

[69 rows x 2 columns]


In [19]:
# DROP PRIMARY CUISINE COLUMN
full_data.drop(columns=['primary_cuisine'], inplace=True)
full_data.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,unit_price,chain_id,vendor_geohash,cuisine_origin
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,6.0,66c9978d,w21z7,japanese
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,6.8,66c9978d,w21z7,japanese
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,6.8,66c9978d,w21z7,japanese
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,6.8,66c9978d,w21z7,japanese
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,8.0,66c9978d,w21z7,japanese


In [20]:
full_data.head(25)

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,unit_price,chain_id,vendor_geohash,cuisine_origin
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,6.0,66c9978d,w21z7,japanese
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,6.8,66c9978d,w21z7,japanese
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,6.8,66c9978d,w21z7,japanese
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,6.8,66c9978d,w21z7,japanese
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,8.0,66c9978d,w21z7,japanese
17,2e7276ad3a,w21z6,3,b62d39b7,60a5baff060c,2,19:05:15,59 days,11.2,67761f45,w21z3,italian
18,2e7276ad3a,w21z6,3,b62d39b7,ad485fc36ebe,2,19:05:15,59 days,11.2,67761f45,w21z3,italian
19,2e7276ad3a,w21z6,3,b62d39b7,d732733afc92,2,19:05:15,59 days,12.4,67761f45,w21z3,italian
20,2e7276ad3a,w21z6,4,e33ad7ec,5bb2e3772724,3,20:13:08,81 days,7.6,bf06469c,w21z6,italian
21,2e7276ad3a,w21z6,4,e33ad7ec,cbe8ace5c352,3,20:13:08,81 days,18.0,bf06469c,w21z6,italian


# *3. Generating Synthetic Data: Products Rating (Out of 5 stars)*

- add rate based on each dish (Product) 

- show order frequency in the app for each customer ,

- added 3 new columns (rating	,order_frequency)

In [21]:
# Set seed for reproducibility (optional)
np.random.seed(42)

# Total number of rows
n_rows =full_data.shape[0]
# Count UNIQUE orders per customer (not rows)
order_counts = full_data.groupby('customer_id')['order_id'].nunique()

# Map order counts back to original dataframe
full_data['order_frequency'] = full_data['customer_id'].map(order_counts)

# Define thresholds based on order frequency quartiles
high_freq_threshold = full_data['order_frequency'].quantile(0.75)
medium_freq_threshold = full_data['order_frequency'].quantile(0.50)


# Initialize rating column
ratings = np.zeros(n_rows)

# Assign ratings based on order frequency
for idx in range(n_rows):
    order_freq = full_data.iloc[idx, full_data.columns.get_loc('order_frequency')]

    if order_freq >= high_freq_threshold:
        # Most frequent orders: mostly 5 stars
        ratings[idx] = np.random.choice([5, 4, 3], p=[0.85, 0.10, 0.05])

    elif order_freq >= medium_freq_threshold:
        # Medium frequency: some lower ratings for 1, 2, 3 stars
        ratings[idx] = np.random.choice([5, 4, 3, 2, 1], p=[0.30, 0.25, 0.25, 0.15, 0.05])

    else:
        # Lower frequency: random distribution
        ratings[idx] = np.random.randint(1, 6)

# Add rating column to dataframe
full_data['product_rating'] = ratings.astype(int)

In [22]:
# Check how many orders each customer has made
order_counts = full_data.groupby('customer_id')['order_id'].nunique()
order_counts

customer_id
008ab40ac0    1
008ce71183    3
00ba08bab4    1
00c41737f8    2
00e9c13b02    1
             ..
ff8baf4dab    2
ffc217c6b1    1
ffd9dd9790    3
ffe455a0e2    1
fff7f1daa4    6
Name: order_id, Length: 2541, dtype: int64

In [23]:
full_data

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,unit_price,chain_id,vendor_geohash,cuisine_origin,order_frequency,product_rating
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,6.0,66c9978d,w21z7,japanese,1,4
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,6.8,66c9978d,w21z7,japanese,1,5
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,6.8,66c9978d,w21z7,japanese,1,3
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,6.8,66c9978d,w21z7,japanese,1,5
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,8.0,66c9978d,w21z7,japanese,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,8c66cd0173,w21z9,12177,334cd438,a93bc91e76ac,4,09:54:23,68 days,3.2,1796fd95,w21z9,singaporean,6,2
24996,8c66cd0173,w21z9,12178,334cd438,a93bc91e76ac,1,17:15:18,16 days,3.2,1796fd95,w21z9,singaporean,6,4
24997,8c66cd0173,w21z9,12179,3777a589,722e9f94c333,5,21:34:23,41 days,0.4,40346c79,w21z9,snacks,6,4
24998,8c66cd0173,w21z9,12180,4343465f,d0fe84ab570e,5,21:41:55,34 days,4.0,f9a1675f,w21z8,snacks,6,3
