In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [2]:
df_food_delivery = pd.read_csv("food_delivery_data.csv")
df_food_delivery.head()

Unnamed: 0,OrderID,CustomerID,OrderDate,DeliveryDistanceKM,OrderAmount,DiscountApplied,DeliveryTimeMin
0,8180,436,27-11-2024,17.37,$35.53,0.7,42
1,2138,265,27-11-2024,4.44,$36.58,9.36,52
2,8737,193,31-10-2024,19.66,$79.93,3.38,59
3,1913,769,14-11-2024,1.99,$23.16,1.44,11
4,1506,982,05-01-2025,7.26,$24.95,11.87,36


In [3]:
df_food_delivery.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   OrderID             500 non-null    int64  
 1   CustomerID          500 non-null    int64  
 2   OrderDate           500 non-null    object 
 3   DeliveryDistanceKM  500 non-null    float64
 4   OrderAmount         500 non-null    object 
 5   DiscountApplied     500 non-null    float64
 6   DeliveryTimeMin     500 non-null    int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 27.5+ KB


In [4]:
df_food_type = pd.read_csv("food_delivery_data_with_food_type.csv")
df_food_type.head()

Unnamed: 0,OrderID,CustomerID,OrderDate,FoodType
0,3521,386,12-12-2024,Seafood
1,7430,259,02-11-2024,Pasta
2,5392,702,30-12-2024,Sushi
3,4477,344,09-01-2025,Dessert
4,1912,140,25-10-2024,Salad


In [5]:
df_food_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   OrderID     500 non-null    int64 
 1   CustomerID  500 non-null    int64 
 2   OrderDate   500 non-null    object
 3   FoodType    500 non-null    object
dtypes: int64(2), object(2)
memory usage: 15.8+ KB


In [6]:
df_demography = pd.read_csv("food_delivery_data_with_demographics.csv")
df_demography.head()

Unnamed: 0,OrderID,CustomerID,OrderDate,Age,Gender,City
0,3521,386,12-12-2024,57,Male,CityB
1,7430,259,02-11-2024,68,Female,CityA
2,4477,344,09-01-2025,38,Female,CityC
3,1912,140,25-10-2024,63,Female,CityA
4,4841,406,07-01-2025,68,Other,CityA


In [7]:
df_demography.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   OrderID     225 non-null    int64 
 1   CustomerID  225 non-null    int64 
 2   OrderDate   225 non-null    object
 3   Age         225 non-null    int64 
 4   Gender      225 non-null    object
 5   City        225 non-null    object
dtypes: int64(3), object(3)
memory usage: 10.7+ KB


In [8]:
print("initial df: ", df_food_delivery.shape)
print("based on food type df: ", df_food_type.shape)
print("based on demography df: ", df_demography.shape)

initial df:  (500, 7)
based on food type df:  (500, 4)
based on demography df:  (225, 6)


Overview and Preprocessing of Data

In [9]:
df_food_delivery['OrderDate'] = pd.to_datetime(df_food_delivery['OrderDate'], format='%d-%m-%Y')
df_food_type['OrderDate'] = pd.to_datetime(df_food_type['OrderDate'], format='%d-%m-%Y')
df_demography['OrderDate'] = pd.to_datetime(df_demography['OrderDate'], format='%d-%m-%Y')

In [10]:
df_food_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   OrderID     500 non-null    int64         
 1   CustomerID  500 non-null    int64         
 2   OrderDate   500 non-null    datetime64[ns]
 3   FoodType    500 non-null    object        
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 15.8+ KB


In [11]:
df_food_type.head()

Unnamed: 0,OrderID,CustomerID,OrderDate,FoodType
0,3521,386,2024-12-12,Seafood
1,7430,259,2024-11-02,Pasta
2,5392,702,2024-12-30,Sushi
3,4477,344,2025-01-09,Dessert
4,1912,140,2024-10-25,Salad


In [14]:
#converting order amount to numeric - 
df_food_delivery['OrderAmount'] = df_food_delivery['OrderAmount'].replace('[\\$,]', '', regex=True).astype(float)

In [15]:
df_food_delivery.head()

Unnamed: 0,OrderID,CustomerID,OrderDate,DeliveryDistanceKM,OrderAmount,DiscountApplied,DeliveryTimeMin
0,8180,436,2024-11-27,17.37,35.53,0.7,42
1,2138,265,2024-11-27,4.44,36.58,9.36,52
2,8737,193,2024-10-31,19.66,79.93,3.38,59
3,1913,769,2024-11-14,1.99,23.16,1.44,11
4,1506,982,2025-01-05,7.26,24.95,11.87,36


In [16]:
df_food_delivery.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   OrderID             500 non-null    int64         
 1   CustomerID          500 non-null    int64         
 2   OrderDate           500 non-null    datetime64[ns]
 3   DeliveryDistanceKM  500 non-null    float64       
 4   OrderAmount         500 non-null    float64       
 5   DiscountApplied     500 non-null    float64       
 6   DeliveryTimeMin     500 non-null    int64         
dtypes: datetime64[ns](1), float64(3), int64(3)
memory usage: 27.5 KB


In [20]:
print(df_food_delivery.duplicated().sum()) # to check for duplicates in the dataset
print(df_food_type.duplicated().sum()) # to check for duplicates in the dataset
print(df_demography.duplicated().sum()) # to check for duplicates in the dataset

0
0
0


In [21]:
#from info() we got to know that there are no null values, but further confirmation - 
df_food_delivery.isnull().sum()
df_food_type.isnull().sum()
df_demography.isnull().sum()

OrderID       0
CustomerID    0
OrderDate     0
Age           0
Gender        0
City          0
dtype: int64

In [22]:
# Merge df_food_delivery and df_food_type first
df_merged = pd.merge(
    df_food_delivery, 
    df_food_type, 
    on=['OrderID', 'CustomerID', 'OrderDate'], 
    how='left'  # left join, so we don't lose records from the main dataset
)

# Then merge demography
df_merged = pd.merge(
    df_merged, 
    df_demography, 
    on=['OrderID', 'CustomerID', 'OrderDate'], 
    how='left'
)

df_merged.head()

Unnamed: 0,OrderID,CustomerID,OrderDate,DeliveryDistanceKM,OrderAmount,DiscountApplied,DeliveryTimeMin,FoodType,Age,Gender,City
0,8180,436,2024-11-27,17.37,35.53,0.7,42,,,,
1,2138,265,2024-11-27,4.44,36.58,9.36,52,,,,
2,8737,193,2024-10-31,19.66,79.93,3.38,59,,,,
3,1913,769,2024-11-14,1.99,23.16,1.44,11,,,,
4,1506,982,2025-01-05,7.26,24.95,11.87,36,,,,


In [23]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   OrderID             500 non-null    int64         
 1   CustomerID          500 non-null    int64         
 2   OrderDate           500 non-null    datetime64[ns]
 3   DeliveryDistanceKM  500 non-null    float64       
 4   OrderAmount         500 non-null    float64       
 5   DiscountApplied     500 non-null    float64       
 6   DeliveryTimeMin     500 non-null    int64         
 7   FoodType            0 non-null      object        
 8   Age                 0 non-null      float64       
 9   Gender              0 non-null      object        
 10  City                0 non-null      object        
dtypes: datetime64[ns](1), float64(4), int64(3), object(3)
memory usage: 43.1+ KB


In [24]:
df_merged["FoodType"].isna().sum()

np.int64(500)

In [35]:
set_food_delivery = set(df_food_delivery['OrderID'])
set_food_type = set(df_food_type['OrderID'])

common_orders = set_food_delivery.intersection(set_food_type)
print("Number of common OrderIDs:", len(common_orders))


Number of common OrderIDs: 21


In [28]:
set_food_demography = set(df_demography['OrderID'])
print(len(common_orders.intersection(set_food_demography)))

10


In [32]:
set_customer_delivery = set(df_food_delivery['CustomerID'])
set_customer_type = set(df_food_type['CustomerID'])

common_customers = set_customer_delivery.intersection(set_customer_type)
print("Number of common CustomerIDs:", len(common_customers))


Number of common CustomerIDs: 177


In [29]:
set_food_demography = set(df_demography['CustomerID'])
print(len(common_orders.intersection(set_food_demography)))

0
