# Extract the raw Food Delivery dataset from Kaggle and store it as a Delta table in the landing schema

In [0]:
%pip install kaggle

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting kaggle
  Downloading kaggle-1.6.17.tar.gz (82 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 82.7/82.7 kB 2.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting certifi>=2023.7.22
  Downloading certifi-2025.1.31-py3-none-any.whl (166 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 166.4/166.4 kB 5.6 MB/s eta 0:00:00
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.5/78.5 kB 4.7 MB/s eta 0:00:00
Collecting python-slugify
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.2/78.2 kB 6.6 MB/s eta 0:00:00
Building wheels for collected packages: kaggle
  Building wheel f

Generate a Kaggle API token from your Kaggle account and upload the resulting JSON file (`kaggle.json`), which contains the authentication credentials, to Databricks at "dbfs:/FileStore/tables/kaggle_token/kaggle.json".

In [0]:
# Make sure the DBFS folder that will hold the raw food-delivery data exists
raw_data_dir = 'dbfs:/food_delivery/raw'
dbutils.fs.mkdirs(raw_data_dir)

True

In [0]:
import os

# Location of the Kaggle API token (kaggle.json) that was uploaded to DBFS
kaggle_token_path = "dbfs:/FileStore/tables/kaggle_token/kaggle.json"

# Load the token file with Spark's JSON reader. The CSV-style 'header' and
# 'inferschema' options are not JSON reader options, so they are omitted.
kaggle_token_df = spark.read.format('json').load(kaggle_token_path)

# Fail early with a clear message if the file does not look like a Kaggle token
missing_fields = {'username', 'key'} - set(kaggle_token_df.columns)
if missing_fields:
    raise ValueError(
        f"{kaggle_token_path} is missing expected field(s): {sorted(missing_fields)}"
    )

# Fetch both values from the first record with a single Spark action
token_row = kaggle_token_df.select('username', 'key').first()
if token_row is None or not token_row['username'] or not token_row['key']:
    raise ValueError(f"No usable Kaggle credentials found in {kaggle_token_path}")

KAGGLE_USERNAME = token_row['username']
KAGGLE_KEY = token_row['key']

# Set the credentials as environment variables
# (alternative: store kaggle.json under '/root/.kaggle/')
os.environ['KAGGLE_USERNAME'] = KAGGLE_USERNAME
os.environ['KAGGLE_KEY'] = KAGGLE_KEY

In [0]:
from kaggle.api.kaggle_api_extended import KaggleApi

# To authenticate kaggle
def authenticate_kaggle(KAGGLE_USERNAME, KAGGLE_KEY):
  """Create a Kaggle API client and authenticate it.

  The supplied credentials are exported as the KAGGLE_USERNAME / KAGGLE_KEY
  environment variables before authenticating, so the arguments passed by the
  caller are the ones actually used. An empty or None argument leaves the
  corresponding environment variable untouched.

  Args:
    KAGGLE_USERNAME: Kaggle account user name.
    KAGGLE_KEY: Kaggle API key belonging to that account.

  Returns:
    The authenticated KaggleApi instance.
  """
  import os  # local import so the function does not rely on another cell

  if KAGGLE_USERNAME:
    os.environ['KAGGLE_USERNAME'] = KAGGLE_USERNAME
  if KAGGLE_KEY:
    os.environ['KAGGLE_KEY'] = KAGGLE_KEY

  api = KaggleApi()
  api.authenticate()
  print("Successfully authenticated Kaggle !!")
  return api

# To download datasets from kaggle api
def download_dataset(KAGGLE_USERNAME, KAGGLE_KEY, dataset_name,
                     source_file='train.csv',
                     target_path='dbfs:/food_delivery/raw/Food_Delivery_dataset.csv',
                     download_dir='/databricks/driver'):
    """Download a Kaggle dataset and move one of its files into DBFS.

    Args:
        KAGGLE_USERNAME: Kaggle account user name.
        KAGGLE_KEY: Kaggle API key.
        dataset_name: Kaggle dataset identifier, e.g. "<owner>/<dataset-slug>".
        source_file: Name of the extracted file to move.
        target_path: Destination path of the file in DBFS.
        download_dir: Local driver directory the dataset is unzipped into.
    """
    # Authenticate kaggle
    api = authenticate_kaggle(KAGGLE_USERNAME, KAGGLE_KEY)

    # Download into an explicit directory instead of the current working
    # directory, so the move below always looks where the files were written.
    api.dataset_download_files(dataset_name, path=download_dir, unzip=True)

    # move dataset file to its DBFS destination
    local_file = f"file:{download_dir.rstrip('/')}/{source_file}"
    dbutils.fs.mv(local_file, target_path)

    # Use the slug after the owner when present; a name without an owner
    # prefix is reported as-is instead of raising IndexError.
    name_parts = dataset_name.split('/')
    dataset_slug = name_parts[1] if len(name_parts) > 1 else name_parts[0]
    print(dataset_slug+" kaggle Dataset downloaded !!")


# Kaggle identifier of the food-delivery dataset
dataset_name = "gauravmalik26/food-delivery-dataset"

# Fetch the dataset through the Kaggle API using the credentials loaded earlier
download_dataset(
    KAGGLE_USERNAME,
    KAGGLE_KEY,
    dataset_name,
)


Successfully authenticated Kaggle !!
Dataset URL: https://www.kaggle.com/datasets/gauravmalik26/food-delivery-dataset
food-delivery-dataset kaggle Dataset downloaded !!


Now let's create a `landing` schema (database) under the default metastore, i.e. `hive_metastore`, in order to store the raw data as a Delta table.

In [0]:
%sql

-- Create the landing schema (database) unless it already exists
CREATE DATABASE IF NOT EXISTS landing;

In [0]:
# DBFS location of the raw CSV file
raw_file_path = "dbfs:/food_delivery/raw/Food_Delivery_dataset.csv"

# Load the CSV into a DataFrame: first line is the header, types are inferred
df_raw = (
    spark.read
    .option("header", "true")
    .csv(raw_file_path, inferSchema=True)
)

# Replace the column name that contains invalid characters
updated_raw_df = df_raw.withColumnRenamed("Time_taken(min)", "Time_taken_min")

# Persist the raw data as a Delta table in the landing layer, overwriting any previous load
(
    updated_raw_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("landing.food_delivery_data")
)

# Preview the first rows of the stored table
preview_df = spark.sql("SELECT * FROM landing.food_delivery_data LIMIT 10")
preview_df.show()

+-------+------------------+-------------------+-----------------------+-------------------+--------------------+--------------------------+---------------------------+----------+-----------+-------------------+--------------------+--------------------+-----------------+-------------+---------------+-------------------+--------+--------------+--------------+
|     ID|Delivery_person_ID|Delivery_person_Age|Delivery_person_Ratings|Restaurant_latitude|Restaurant_longitude|Delivery_location_latitude|Delivery_location_longitude|Order_Date|Time_Orderd|  Time_Order_picked|   Weatherconditions|Road_traffic_density|Vehicle_condition|Type_of_order|Type_of_vehicle|multiple_deliveries|Festival|          City|Time_taken_min|
+-------+------------------+-------------------+-----------------------+-------------------+--------------------+--------------------------+---------------------------+----------+-----------+-------------------+--------------------+--------------------+-----------------+-------