In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#!pip install implicit
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler
from implicit.als import AlternatingLeastSquares

import re

In [3]:
big_data_path = "../../data/raw_data/orders_sg100k.txt"
big_data = pd.read_csv(big_data_path, sep=',', encoding='utf-8')
big_data.head()

Unnamed: 0.1,Unnamed: 0,customer_id,geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day
0,0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days
1,1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days
2,2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days
3,3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days
4,4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days


# 1. *Data Cleaning*

In [4]:
big_data.drop(columns=['Unnamed: 0'], inplace=True)

### 1.1 Import products and vendors

In [5]:
file_path2 = "../../data/raw_data/products_sg.txt"
products = pd.read_csv(file_path2)
# Drop the unnecessary index column
products.drop(columns=['Unnamed: 0'], inplace=True)
# Multiply price by 1000 to make it more interpretable
unit_price = products['unit_price'] * 1000
products['unit_price'] = unit_price
products.head()

Unnamed: 0,vendor_id,product_id,name,unit_price
0,15bbf316,59099e089514,Meat & Seafood Combo 海鲜套餐,3.6
1,e7b24dc0,52e1017fdbd9,Sausage Egg McMuffin® Meal,2.4
2,e7b24dc0,fcb7110cd932,Scrambled Egg Burger w Chicken Meal,2.4
3,7112a20b,d3e7708c2bc9,Maguro Sushi,1.2
4,6137ef21,bf0d5ac0e03c,Family Meal - Prosp Chic TwLEVMx2 + ChBurgerHM,10.0


In [6]:
file_path3 = "../../data/raw_data/vendors_sg.txt"
vendors = pd.read_csv(file_path3)
# Drop the unnecessary index column
vendors.drop(columns=['Unnamed: 0'], inplace=True)
vendors.head()

Unnamed: 0,vendor_id,chain_id,geohash,primary_cuisine
0,b160c319,d2786168,w21z6,mexican
1,9c8f010e,d2786168,w21ze,mexican
2,03eb25e1,5055ab25,w21ze,bak kut teh
3,3613129a,8984acb6,w23b1,italian
4,0946c9e5,1e3a2913,w21z4,bak kut teh


### 1.2 Merge Into One Dataframe

In [7]:
# Step 1: Merge orders with products on vendor_id and product_id
orders_products = big_data.merge(products, on=['vendor_id', 'product_id'], how='left')

# Step 2: Merge the result with vendors on vendor_id
full_data = orders_products.merge(vendors, on='vendor_id', how='left')

# Preview the final merged dataset
full_data.head()

Unnamed: 0,customer_id,geohash_x,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,geohash_y,primary_cuisine
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,201. Japanese Garlic Karaage Don,6.0,66c9978d,w21z7,ramen
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,204. Chicken Cutlet Don,6.8,66c9978d,w21z7,ramen
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,206. Beef Sukiyaki Don,6.8,66c9978d,w21z7,ramen
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,209. Japanese Beef Yakiniku Don,6.8,66c9978d,w21z7,ramen
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,210. Teriyaki Salmon Don,8.0,66c9978d,w21z7,ramen


In [8]:
# Rename geohash_x to customer_geohash and geohash_y to vendor_geohash
full_data.rename(columns={'geohash_x': 'customer_geohash', 'geohash_y': 'vendor_geohash'}, inplace=True)
full_data.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,vendor_geohash,primary_cuisine
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,201. Japanese Garlic Karaage Don,6.0,66c9978d,w21z7,ramen
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,204. Chicken Cutlet Don,6.8,66c9978d,w21z7,ramen
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,206. Beef Sukiyaki Don,6.8,66c9978d,w21z7,ramen
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,209. Japanese Beef Yakiniku Don,6.8,66c9978d,w21z7,ramen
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,210. Teriyaki Salmon Don,8.0,66c9978d,w21z7,ramen


### 1.3 NaN values [will be revisited]

In [9]:
# For now, we will drop the null values
full_data.isnull().sum()

customer_id             0
customer_geohash        0
order_id                0
vendor_id               0
product_id              0
day_of_week             0
order_time              0
order_day               0
name                    1
unit_price              1
chain_id            14827
vendor_geohash          0
primary_cuisine         0
dtype: int64

In [10]:
n_nan = full_data['chain_id'].isna().sum()
ratio_nan = n_nan / len(full_data)
print('NaN count:', n_nan, ' | ratio:', f'{ratio_nan:.2%}')

# عيّنة سريعة للمراجعة
display(full_data[full_data['chain_id'].isna()].head(10))


NaN count: 14827  | ratio: 14.83%


Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,vendor_geohash,primary_cuisine
5,f374c8c54c,w21zt,1,21830106,f245bdf79350,4,19:03:03,61 days,Chicken Cutlet with Rice,2.0,,w21zt,western
6,f374c8c54c,w21zt,1,21830106,146127be77d4,4,19:03:03,61 days,Chicken Chop,2.8,,w21zt,western
7,f374c8c54c,w21zt,1,21830106,6c108c0fb2b9,4,19:03:03,61 days,Chicken Wings with Rice,1.6,,w21zt,western
8,f374c8c54c,w21zt,1,21830106,da422c7836e1,4,19:03:03,61 days,Saba Fish with Rice,2.4,,w21zt,western
9,f374c8c54c,w21zt,1,21830106,01110c80a0fa,4,19:03:03,61 days,Chicken Chop with Rice,2.0,,w21zt,western
10,f374c8c54c,w21zt,1,21830106,5289a6f50a7b,4,19:03:03,61 days,Tomato Sauce Spaghetti,2.0,,w21zt,western
11,f374c8c54c,w21zt,1,21830106,64a268be5f2e,4,19:03:03,61 days,Grilled Fish,3.6,,w21zt,western
12,f374c8c54c,w21zt,1,21830106,adf53ab3befe,4,19:03:03,61 days,Fish & Chips,2.8,,w21zt,western
13,2e7276ad3a,w21z6,2,ee4f2ee0,2c349859aa69,0,18:08:42,8 days,V7. Beef Noodle Soup,4.0,,w21z7,vietnamese
14,2e7276ad3a,w21z6,2,ee4f2ee0,a3e4aac0520d,0,18:08:42,8 days,V12. Summer Rolls,4.0,,w21z7,vietnamese


In [12]:
# ننسخ البيانات عشان نشتغل عليها
df = full_data.copy()

# (أ) نبني mapping من vendor_id -> chain_id المعروف (نأخذ الـ mode في حال التكرار)
known = df.dropna(subset=['chain_id'])
vendor_to_chain = (
    known.groupby('vendor_id')['chain_id']
         .agg(lambda s: s.mode().iloc[0])  # الأكثر تكرارًا لضبط أي ضجيج
)

# (ب) نملأ القيم الناقصة من هذا الـ mapping
df['chain_id_filled'] = df['chain_id'].copy()
df['chain_id_filled'] = df['chain_id_filled'].fillna(df['vendor_id'].map(vendor_to_chain))

# (ج) ما بقي NaN بعد الخطوة السابقة نعتبره stand-alone -> chain_id = vendor_id
solo_mask = df['chain_id_filled'].isna()
df.loc[solo_mask, 'chain_id_filled'] = df.loc[solo_mask, 'vendor_id']
df['standalone_assumption'] = solo_mask  # عشان نعرف مين اتعبّى بهذه الفرضية


In [13]:
# يجب أن يكون لكل vendor_id قيمة chain_id واحدة بعد التعبئة
per_vendor_unique = df.groupby('vendor_id')['chain_id_filled'].nunique()
violations = per_vendor_unique[per_vendor_unique > 1]
print('Vendors with >1 chain_id after filling:', len(violations))

# إحصائية بسيطة: كم سلسلة لها أكثر من فرع، وكم stand-alone
branches_per_chain = df.groupby('chain_id_filled')['vendor_id'].nunique()
print('Chains with multiple branches:', (branches_per_chain > 1).sum())
print('Stand-alone vendors (filled as vendor_id):', df['standalone_assumption'].sum())


Vendors with >1 chain_id after filling: 0
Chains with multiple branches: 670
Stand-alone vendors (filled as vendor_id): 14827


In [14]:
full_data['chain_id'] = df['chain_id_filled']

In [15]:
full_data.isna().sum()

customer_id         0
customer_geohash    0
order_id            0
vendor_id           0
product_id          0
day_of_week         0
order_time          0
order_day           0
name                1
unit_price          1
chain_id            0
vendor_geohash      0
primary_cuisine     0
dtype: int64

In [16]:
print(full_data['unit_price'].isna().value_counts())

unit_price
False    100000
True          1
Name: count, dtype: int64


In [17]:
rows_with_na = full_data[full_data.isna().any(axis=1)]
print(rows_with_na)


      customer_id customer_geohash  order_id vendor_id    product_id  \
27540  f869fc7f2d            w21zt     13407  a06f1cbf  1cc4fc0888d2   

       day_of_week order_time order_day name  unit_price  chain_id  \
27540            1   11:56:42   44 days  NaN         NaN  f24465fd   

      vendor_geohash primary_cuisine  
27540          w21zt         chinese  


In [18]:
# Since it's one row, I'll drop it
full_data = full_data.dropna()
full_data.isna().sum()


customer_id         0
customer_geohash    0
order_id            0
vendor_id           0
product_id          0
day_of_week         0
order_time          0
order_day           0
name                0
unit_price          0
chain_id            0
vendor_geohash      0
primary_cuisine     0
dtype: int64

# *2. Feature Engineering*

### 2.1 Categorize Cuisines

In [19]:
global_cuisine_category = {
    "japanese": "japanese",
    "ramen": "japanese",
    "sushi": "japanese",

    "chinese": "chinese",
    "dim sum": "chinese",
    "mala xiang guo": "chinese",
    "mala soups": "chinese",
    "hokkien mee": "chinese",
    "noodles": "chinese",
    "tea": "chinese",

    "indian": "indian",
    "curry": "indian",

    "thai": "thai",

    "vietnamese": "vietnamese",
    "pho": "vietnamese",

    "singaporean": "singaporean",
    "chicken rice": "singaporean",
    "ban mian": "singaporean",
    "yong tau foo": "singaporean",

    "indonesian": "indonesian",
    "ayam penyet": "indonesian",

    "malaysian": "malaysian",
    "nasi lemak": "malaysian",
    "roti prata": "malaysian",

    "american": "american",
    "burgers": "american",
    "sandwiches": "american",
    "grill": "american",
    "fries": "american",
    "fast food": "american",

    "italian": "italian",
    "pizza": "italian",
    "pasta": "italian",

    "french": "french",
    "german": "german",
    "spanish": "spanish",
    "mexican": "mexican",
    "turkish": "turkish",
    "lebanese": "lebanese",

    # Reassign rice-based dishes
    "rice dishes": "asian",
    "fried rice": "asian",
    "porridge": "asian",

    # Everything else defaults to snacks
}


In [20]:
full_data['cuisine_origin'] = full_data['primary_cuisine'].str.strip().map(global_cuisine_category)
full_data['cuisine_origin'].fillna('snacks', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  full_data['cuisine_origin'].fillna('snacks', inplace=True)


In [21]:
print(full_data[['primary_cuisine', 'cuisine_origin']].drop_duplicates().sort_values(by='primary_cuisine'))

     primary_cuisine cuisine_origin
3347       acai bowl         snacks
66          american       american
51             asian         snacks
170      ayam penyet     indonesian
2915     bak kut teh         snacks
...              ...            ...
92        vegetarian         snacks
13        vietnamese     vietnamese
5            western         snacks
5098           wraps         snacks
657     yong tau foo    singaporean

[77 rows x 2 columns]


In [22]:
# DROP PRIMARY CUISINE COLUMN
full_data.drop(columns=['primary_cuisine'], inplace=True)
full_data.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,vendor_geohash,cuisine_origin
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,201. Japanese Garlic Karaage Don,6.0,66c9978d,w21z7,japanese
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,204. Chicken Cutlet Don,6.8,66c9978d,w21z7,japanese
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,206. Beef Sukiyaki Don,6.8,66c9978d,w21z7,japanese
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,209. Japanese Beef Yakiniku Don,6.8,66c9978d,w21z7,japanese
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,210. Teriyaki Salmon Don,8.0,66c9978d,w21z7,japanese


In [23]:
full_data.head(25)

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,vendor_geohash,cuisine_origin
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,201. Japanese Garlic Karaage Don,6.0,66c9978d,w21z7,japanese
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,204. Chicken Cutlet Don,6.8,66c9978d,w21z7,japanese
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,206. Beef Sukiyaki Don,6.8,66c9978d,w21z7,japanese
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,209. Japanese Beef Yakiniku Don,6.8,66c9978d,w21z7,japanese
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,210. Teriyaki Salmon Don,8.0,66c9978d,w21z7,japanese
5,f374c8c54c,w21zt,1,21830106,f245bdf79350,4,19:03:03,61 days,Chicken Cutlet with Rice,2.0,21830106,w21zt,snacks
6,f374c8c54c,w21zt,1,21830106,146127be77d4,4,19:03:03,61 days,Chicken Chop,2.8,21830106,w21zt,snacks
7,f374c8c54c,w21zt,1,21830106,6c108c0fb2b9,4,19:03:03,61 days,Chicken Wings with Rice,1.6,21830106,w21zt,snacks
8,f374c8c54c,w21zt,1,21830106,da422c7836e1,4,19:03:03,61 days,Saba Fish with Rice,2.4,21830106,w21zt,snacks
9,f374c8c54c,w21zt,1,21830106,01110c80a0fa,4,19:03:03,61 days,Chicken Chop with Rice,2.0,21830106,w21zt,snacks


### 2.2 Dish Name Cleaning

In [24]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+\.', '', text)         # Remove leading numbers like "204."
    text = re.sub(r'[^\w\s]', '', text)       # Remove punctuation
    text = re.sub(r'\d+', '', text)           # Remove remaining numbers
    text = text.strip()
    return text

full_data['name_cleaned'] = full_data['name'].fillna('').apply(clean_text)


In [25]:
full_data.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,vendor_geohash,cuisine_origin,name_cleaned
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,201. Japanese Garlic Karaage Don,6.0,66c9978d,w21z7,japanese,japanese garlic karaage don
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,204. Chicken Cutlet Don,6.8,66c9978d,w21z7,japanese,chicken cutlet don
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,206. Beef Sukiyaki Don,6.8,66c9978d,w21z7,japanese,beef sukiyaki don
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,209. Japanese Beef Yakiniku Don,6.8,66c9978d,w21z7,japanese,japanese beef yakiniku don
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,210. Teriyaki Salmon Don,8.0,66c9978d,w21z7,japanese,teriyaki salmon don


In [26]:
full_data['name'] = full_data['name_cleaned']
full_data.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,vendor_geohash,cuisine_origin,name_cleaned
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,japanese garlic karaage don,6.0,66c9978d,w21z7,japanese,japanese garlic karaage don
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,chicken cutlet don,6.8,66c9978d,w21z7,japanese,chicken cutlet don
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,beef sukiyaki don,6.8,66c9978d,w21z7,japanese,beef sukiyaki don
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,japanese beef yakiniku don,6.8,66c9978d,w21z7,japanese,japanese beef yakiniku don
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,teriyaki salmon don,8.0,66c9978d,w21z7,japanese,teriyaki salmon don


In [27]:
# DROP PRODUCT NAME COLUMN
full_data.drop(columns=['name_cleaned'], inplace=True)
full_data.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,vendor_geohash,cuisine_origin
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,japanese garlic karaage don,6.0,66c9978d,w21z7,japanese
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,chicken cutlet don,6.8,66c9978d,w21z7,japanese
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,beef sukiyaki don,6.8,66c9978d,w21z7,japanese
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,japanese beef yakiniku don,6.8,66c9978d,w21z7,japanese
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,teriyaki salmon don,8.0,66c9978d,w21z7,japanese


# *3. Generating Synthetic Data: Products Rating (Out of 5 stars)*

- add rate based on each dish (Product) 

- show order frequency in the app for each customer ,

- added 3 new columns (rating	,order_frequency)

In [28]:
# Set seed for reproducibility (optional)
np.random.seed(42)

# Total number of rows
n_rows =full_data.shape[0]
# Count UNIQUE orders per customer (not rows)
order_counts = full_data.groupby('customer_id')['order_id'].nunique()

# Map order counts back to original dataframe
full_data['order_frequency'] = full_data['customer_id'].map(order_counts)

# Define thresholds based on order frequency quartiles
high_freq_threshold = full_data['order_frequency'].quantile(0.75)
medium_freq_threshold = full_data['order_frequency'].quantile(0.50)


# Initialize rating column
ratings = np.zeros(n_rows)

# Assign ratings based on order frequency
for idx in range(n_rows):
    order_freq = full_data.iloc[idx, full_data.columns.get_loc('order_frequency')]

    if order_freq >= high_freq_threshold:
        # Most frequent orders: mostly 5 stars
        ratings[idx] = np.random.choice([5, 4, 3], p=[0.85, 0.10, 0.05])

    elif order_freq >= medium_freq_threshold:
        # Medium frequency: some lower ratings for 1, 2, 3 stars
        ratings[idx] = np.random.choice([5, 4, 3, 2, 1], p=[0.30, 0.25, 0.25, 0.15, 0.05])

    else:
        # Lower frequency: random distribution
        ratings[idx] = np.random.randint(1, 6)

# Add rating column to dataframe
full_data['product_rating'] = ratings.astype(int)

In [34]:
full_data.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,vendor_geohash,cuisine_origin,order_frequency,product_rating
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,japanese garlic karaage don,6.0,66c9978d,w21z7,japanese,1,4
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,chicken cutlet don,6.8,66c9978d,w21z7,japanese,1,5
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,beef sukiyaki don,6.8,66c9978d,w21z7,japanese,1,3
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,japanese beef yakiniku don,6.8,66c9978d,w21z7,japanese,1,5
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,teriyaki salmon don,8.0,66c9978d,w21z7,japanese,1,5


In [29]:
# Check how many orders each customer has made
order_counts = full_data.groupby('customer_id')['order_id'].nunique()
order_counts.head(25)

customer_id
00119c8178     1
00198e01e4     1
001a5689fc    36
0021dbb4c6     4
002f7ec570     2
002fe54ca1     3
0030b52084    22
003cc13317     4
0045c705e9    32
004e0fd447     4
0051933732     5
005ddefd24     1
006109a61b     1
00708980c0     1
0073ed74f4     2
007b0b2ec2     1
00815dd2fe     5
008ab40ac0     1
008cc4b494     1
008ce71183     5
008da08581     2
0094dd84a6     1
00a412ebfc     1
00a9d8f56e     1
00ba08bab4     1
Name: order_id, dtype: int64

In [30]:
order_counts[order_counts >= 10]

customer_id
001a5689fc    36
0030b52084    22
0045c705e9    32
00eb815b37    10
00ef3009b8    12
              ..
fee62b2c10    16
fef4c15ffa    13
ff097b3d73    11
ff29e59b1b    10
ffa2807677    11
Name: order_id, Length: 1211, dtype: int64

In [31]:
full_data.loc[full_data['customer_id'] == '2e7276ad3a']

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,vendor_geohash,cuisine_origin,order_frequency,product_rating
13,2e7276ad3a,w21z6,2,ee4f2ee0,2c349859aa69,0,18:08:42,8 days,v beef noodle soup,4.0,ee4f2ee0,w21z7,vietnamese,15,1
14,2e7276ad3a,w21z6,2,ee4f2ee0,a3e4aac0520d,0,18:08:42,8 days,v summer rolls,4.0,ee4f2ee0,w21z7,vietnamese,15,2
15,2e7276ad3a,w21z6,2,ee4f2ee0,b47b2a83502a,0,18:08:42,8 days,v spring rolls,4.0,ee4f2ee0,w21z7,vietnamese,15,5
16,2e7276ad3a,w21z6,2,ee4f2ee0,4e1d649b81a5,0,18:08:42,8 days,v vietnamese pork belly,4.4,ee4f2ee0,w21z7,vietnamese,15,5
17,2e7276ad3a,w21z6,3,b62d39b7,60a5baff060c,2,19:05:15,59 days,diavola,11.2,67761f45,w21z3,italian,15,5
18,2e7276ad3a,w21z6,3,b62d39b7,ad485fc36ebe,2,19:05:15,59 days,cinque formaggi,11.2,67761f45,w21z3,italian,15,4
19,2e7276ad3a,w21z6,3,b62d39b7,d732733afc92,2,19:05:15,59 days,linguine al granchio,12.4,67761f45,w21z3,italian,15,4
20,2e7276ad3a,w21z6,4,e33ad7ec,5bb2e3772724,3,20:13:08,81 days,house salad,7.6,bf06469c,w21z6,italian,15,4
21,2e7276ad3a,w21z6,4,e33ad7ec,cbe8ace5c352,3,20:13:08,81 days,da paolo dolcetto,18.0,bf06469c,w21z6,italian,15,5
22,2e7276ad3a,w21z6,4,e33ad7ec,ff6a45fa52e6,3,20:13:08,81 days,valrhona choc cake,5.6,bf06469c,w21z6,italian,15,3


OSError: Cannot save file into a non-existent directory: '../../data/train-data'