## Loading Data

In [None]:
# Load the 2025-06 trip-data Parquet file into a Spark DataFrame
# (the last run of this cell was for the training data).
trips_path = "Files/uploads/fhvhv_tripdata_2025-06.parquet"
df = spark.read.format("parquet").load(trips_path)
display(df)

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 17, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, b4741335-3aff-4343-9606-79623b4550b7)

In [None]:
# Libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Render floats with two fixed decimals instead of scientific notation
pd.options.display.float_format = '{:.2f}'.format

# Warnings: suppress every warning for the rest of the session
import warnings
warnings.filterwarnings(action='ignore')

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 4, Finished, Available, Finished)

In [None]:
# Confirm which DataFrame API `df` currently uses (Spark vs. pandas)
type(df)

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 18, Finished, Available, Finished)

pyspark.sql.dataframe.DataFrame

In [None]:
# Rebind df to a Spark DataFrame without the columns listed below
columns_to_drop = [
    'dispatching_base_num', 'originating_base_num', 'request_datetime',
    'on_scene_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID',
]
df = df.drop(*columns_to_drop)

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 19, Finished, Available, Finished)

In [None]:
# List the column names currently present in df
df.columns

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 20, Finished, Available, Finished)

['hvfhs_license_num',
 'pickup_datetime',
 'trip_miles',
 'trip_time',
 'base_passenger_fare',
 'tolls',
 'bcf',
 'sales_tax',
 'congestion_surcharge',
 'airport_fee',
 'tips',
 'driver_pay',
 'shared_request_flag',
 'shared_match_flag',
 'access_a_ride_flag',
 'wav_request_flag',
 'wav_match_flag',
 'cbd_congestion_fee']

In [None]:
# Print the Spark schema: column names, data types and nullability
df.printSchema()

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 21, Finished, Available, Finished)

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_ride_flag: string (nullable = true)
 |-- wav_request_flag: string (nullable = true)
 |-- wav_match_flag: string (nullable = true)
 |-- cbd_congestion_fee: double (nullable = true)



In [None]:
# Spark DataFrames have no .shape, so derive it:
# count() gives the rows, the length of the column list gives the columns.
num_rows, num_cols = df.count(), len(df.columns)

print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 22, Finished, Available, Finished)

Number of rows: 19868009
Number of columns: 18


### Partitioning

In [None]:
# Convert the Spark DataFrame to pandas. This rebinds `df`, so every cell below
# works with the pandas API; toPandas() collects all rows into driver memory.
df = df.toPandas()

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 24, Finished, Available, Finished)

In [None]:
# Summary of the pandas DataFrame: index range, columns and dtypes
df.info()

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 25, Finished, Available, Finished)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19868009 entries, 0 to 19868008
Data columns (total 18 columns):
 #   Column                Dtype         
---  ------                -----         
 0   hvfhs_license_num     object        
 1   pickup_datetime       datetime64[ns]
 2   trip_miles            float64       
 3   trip_time             int64         
 4   base_passenger_fare   float64       
 5   tolls                 float64       
 6   bcf                   float64       
 7   sales_tax             float64       
 8   congestion_surcharge  float64       
 9   airport_fee           float64       
 10  tips                  float64       
 11  driver_pay            float64       
 12  shared_request_flag   object        
 13  shared_match_flag     object        
 14  access_a_ride_flag    object        
 15  wav_request_flag      object        
 16  wav_match_flag        object        
 17  cbd_congestion_fee    float64       
dtypes: datetime64[ns](1), float64(10), int64

In [None]:
# Add the ISO calendar week of each pickup timestamp as `week_number`
df['week_number'] = df['pickup_datetime'].dt.isocalendar().week

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 26, Finished, Available, Finished)

In [None]:
# Number of rows per week number, most frequent week first
df['week_number'].value_counts()

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 27, Finished, Available, Finished)

week_number
26    4771438
24    4660595
25    4633879
23    4576158
22     650806
27     575133
Name: count, dtype: Int64

#### Partitioning by Weeks
Write one Parquet file per week to the `raw` folder of the lakehouse.
Run this cell only when preparing prediction data, not training data.

In [None]:
import os

# Root folder in the Lakehouse that receives the weekly partitions.
lakehouse_path = "/lakehouse/default/Files/raw/"

# Distinct week numbers; each one becomes its own "week_number=<n>" sub-folder.
unique_weeks = df['week_number'].unique()

print(f"\nPartitioning data by week number. Found {len(unique_weeks)} unique weeks.")

# groupby splits the frame in one pass instead of re-scanning the whole
# DataFrame with a boolean mask once per week. sort=False keeps the groups in
# first-appearance order, the same order unique() reports them in.
for week, partition_df in df.groupby('week_number', sort=False):
    # Sub-folder path in the format "week_number=value".
    partition_folder = os.path.join(lakehouse_path, f"week_number={week}")

    # Make sure the folder exists before writing into it.
    os.makedirs(partition_folder, exist_ok=True)

    # One Parquet file per week; the pandas index is not written.
    file_path = os.path.join(partition_folder, "data.parquet")
    partition_df.to_parquet(file_path, index=False)
    print(f"Saved data for week {week} to {file_path}")

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 15, Finished, Available, Finished)


Partitioning data by week number. Found 5 unique weeks.
Saved data for week 27 to /lakehouse/default/Files/raw/week_number=27/data.parquet
Saved data for week 28 to /lakehouse/default/Files/raw/week_number=28/data.parquet
Saved data for week 29 to /lakehouse/default/Files/raw/week_number=29/data.parquet
Saved data for week 30 to /lakehouse/default/Files/raw/week_number=30/data.parquet
Saved data for week 31 to /lakehouse/default/Files/raw/week_number=31/data.parquet


In [None]:
# (rows, columns) of the pandas DataFrame
df.shape

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 16, Finished, Available, Finished)

(19653012, 19)

### Training data

#### Data Quality

In [None]:
# Count missing values in each column
df.isna().sum()

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 28, Finished, Available, Finished)

hvfhs_license_num       0
pickup_datetime         0
trip_miles              0
trip_time               0
base_passenger_fare     0
tolls                   0
bcf                     0
sales_tax               0
congestion_surcharge    0
airport_fee             0
tips                    0
driver_pay              0
shared_request_flag     0
shared_match_flag       0
access_a_ride_flag      0
wav_request_flag        0
wav_match_flag          0
cbd_congestion_fee      0
week_number             0
dtype: int64

In [None]:
# Inspect the trips whose base passenger fare is negative
df.loc[df['base_passenger_fare'].lt(0)]

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 29, Finished, Available, Finished)

Unnamed: 0,hvfhs_license_num,pickup_datetime,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,cbd_congestion_fee,week_number
17221,HV0003,2025-06-01 00:17:56,1.60,426,-9.17,0.00,0.67,2.38,0.00,0.00,0.00,0.00,Y,Y,N,N,N,0.00,22
86511,HV0003,2025-06-01 02:37:17,1.34,666,-11.46,0.00,0.80,2.82,0.00,0.00,0.00,0.00,Y,Y,N,N,N,0.00,22
183175,HV0003,2025-06-01 08:59:51,1.14,474,-1.01,0.00,0.22,0.79,0.00,0.00,0.00,6.62,N,N,N,N,N,0.00,22
261931,HV0003,2025-06-01 11:53:06,1.74,461,-1.11,0.00,0.24,0.87,0.00,0.00,0.00,8.43,N,N,N,N,N,0.00,22
286013,HV0003,2025-06-01 12:05:29,1.07,449,-13.76,0.00,0.80,2.86,0.00,0.00,0.00,0.72,Y,Y,N,N,N,0.00,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19724209,HV0005,2025-06-30 18:18:51,15.01,9975,-15.38,0.00,5.47,19.40,2.75,0.00,0.00,122.27,N,N,N,N,N,1.50,27
19736598,HV0003,2025-06-30 18:36:56,2.07,518,-1.32,0.00,0.29,1.03,0.00,0.00,0.00,9.29,N,N,N,N,Y,0.00,27
19798039,HV0003,2025-06-30 21:42:25,1.48,600,-0.48,0.00,0.31,1.09,0.00,0.00,0.00,0.00,Y,Y,N,N,N,0.00,27
19809642,HV0003,2025-06-30 21:50:35,1.41,531,-1.22,0.00,0.27,0.95,0.00,0.00,0.00,7.35,N,N,N,N,N,0.00,27


In [None]:
# Summary statistics for the negative-fare trips
df.loc[df['base_passenger_fare'].lt(0)].describe()

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 30, Finished, Available, Finished)

Unnamed: 0,pickup_datetime,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,cbd_congestion_fee,week_number
count,487,487.0,487.0,487.0,487.0,487.0,487.0,487.0,487.0,487.0,487.0,487.0,487.0
mean,2025-06-17 04:09:50.108829440,2.55,1007.78,-4.87,0.55,0.43,1.52,0.54,0.08,0.24,12.83,0.2,24.59
min,2025-06-01 00:17:56,0.0,49.0,-58.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0
25%,2025-06-09 15:30:12,0.52,271.5,-6.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0
50%,2025-06-17 23:50:32,1.11,479.0,-2.75,0.0,0.28,0.98,0.0,0.0,0.0,6.34,0.0,25.0
75%,2025-06-24 13:55:06,2.34,836.0,-1.22,0.0,0.46,1.63,0.0,0.0,0.0,12.5,0.0,26.0
max,2025-06-30 23:35:09,38.29,25793.0,-0.02,20.0,9.25,32.82,2.75,2.5,10.0,274.63,1.5,27.0
std,,4.29,2139.56,5.98,2.5,0.83,2.93,1.05,0.44,0.97,26.05,0.51,1.27


In [None]:
# Keep only trips with a strictly positive passenger fare
# (this removes zero fares as well as negative ones).
df = df.loc[df['base_passenger_fare'].gt(0)]

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 31, Finished, Available, Finished)

In [None]:
# Inspect the trips whose driver pay is negative
df.loc[df['driver_pay'].lt(0)]

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 32, Finished, Available, Finished)

Unnamed: 0,hvfhs_license_num,pickup_datetime,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,cbd_congestion_fee,week_number
371658,HV0003,2025-06-01 14:11:27,9.62,2067,52.42,32.12,1.41,7.71,0.0,0.0,0.0,-5.98,Y,N,N,N,N,0.0,22
2199305,HV0003,2025-06-04 17:01:17,28.99,4031,110.87,18.28,2.95,0.0,0.0,0.0,0.0,-13.26,Y,N,N,N,N,0.0,23
3093172,HV0003,2025-06-05 23:27:06,14.06,5839,187.11,23.0,4.64,0.0,0.0,0.0,0.0,-6.75,N,N,N,N,N,1.5,23
3777878,HV0003,2025-06-06 23:52:36,10.49,2580,116.48,6.94,2.88,10.96,0.75,0.0,0.0,-4.4,Y,N,N,N,N,1.5,23
3800824,HV0003,2025-06-06 23:53:35,9.97,3474,201.58,16.06,5.28,0.0,0.0,0.0,0.0,-1.79,N,N,N,N,N,1.5,23
3806900,HV0003,2025-06-06 23:57:36,12.55,4372,158.49,16.06,4.14,0.0,0.0,0.0,0.0,-11.62,N,N,N,N,N,1.5,23
3824211,HV0003,2025-06-07 00:11:11,17.57,2294,97.25,6.94,2.56,9.7,0.0,0.0,0.0,-5.57,N,N,N,N,N,0.0,23
3833993,HV0003,2025-06-07 00:12:24,17.78,4679,197.95,16.06,4.91,0.0,0.0,0.0,0.0,-4.53,N,N,N,N,N,1.5,23
3837624,HV0003,2025-06-07 00:10:04,13.65,5897,224.9,16.06,5.58,0.0,0.0,0.0,0.0,-12.62,N,N,N,N,N,1.5,23
3842090,HV0003,2025-06-07 00:23:52,12.31,6322,220.89,23.0,5.49,0.0,0.0,0.0,0.0,-6.05,N,N,N,N,N,1.5,23


In [None]:
# Summary statistics for the negative-driver-pay trips
df.loc[df['driver_pay'].lt(0)].describe()

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 33, Finished, Available, Finished)

Unnamed: 0,pickup_datetime,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,cbd_congestion_fee,week_number
count,33,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0
mean,2025-06-14 11:16:44.181818368,13.75,4070.85,137.0,14.45,3.4,6.18,0.55,0.0,1.14,-5.91,0.95,24.12
min,2025-06-01 14:11:27,1.78,444.0,14.57,0.0,0.44,0.0,0.0,0.0,0.0,-20.07,0.0,22.0
25%,2025-06-07 00:12:24,8.23,2067.0,83.72,6.94,2.1,0.0,0.0,0.0,0.0,-8.48,0.0,23.0
50%,2025-06-09 17:05:19,10.68,3698.0,121.32,16.06,3.03,4.44,0.0,0.0,0.0,-4.91,1.5,24.0
75%,2025-06-22 10:47:46,15.34,5280.0,172.59,16.06,4.29,8.92,0.0,0.0,0.0,-2.97,1.5,25.0
max,2025-06-29 04:24:02,57.8,20608.0,494.03,32.12,12.35,45.08,2.75,0.0,12.75,-0.22,1.5,26.0
std,,10.28,3521.81,91.34,9.31,2.17,9.06,1.07,0.0,3.24,4.73,0.73,1.32


In [None]:
# Keep only trips with a strictly positive driver pay
# (this removes zero pay as well as negative pay).
df = df.loc[df['driver_pay'].gt(0)]

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 34, Finished, Available, Finished)

In [None]:
# Inspect the trips with a recorded distance of zero miles or less
df.loc[df['trip_miles'].le(0)]

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 35, Finished, Available, Finished)

Unnamed: 0,hvfhs_license_num,pickup_datetime,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,cbd_congestion_fee,week_number
295,HV0003,2025-06-01 00:46:10,0.00,228,12.91,0.00,0.30,1.21,2.75,0.00,0.00,8.26,N,N,N,N,N,1.50,22
12342,HV0003,2025-06-01 00:05:37,0.00,15,7.02,0.00,0.07,0.26,0.00,0.00,0.00,5.50,Y,N,N,N,N,0.00,22
19164,HV0003,2025-06-01 00:30:17,0.00,127,14.30,0.00,0.36,1.26,0.00,0.00,0.00,11.00,N,N,N,N,N,0.00,22
25119,HV0003,2025-06-01 00:20:02,0.00,126,6.42,0.00,0.17,0.61,0.00,0.00,0.00,4.98,N,N,N,Y,Y,0.00,22
30107,HV0003,2025-06-01 00:17:40,0.00,116,5.83,0.00,0.14,0.57,2.75,0.00,0.00,5.25,N,N,N,N,N,1.50,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19809535,HV0003,2025-06-30 21:59:25,0.00,373,6.22,0.00,0.16,0.55,0.00,0.00,0.00,4.00,N,N,N,N,N,0.00,27
19812068,HV0003,2025-06-30 21:48:54,0.00,186,9.24,0.00,0.23,0.80,0.00,0.00,0.00,6.48,N,N,N,N,N,0.00,27
19821838,HV0003,2025-06-30 21:19:03,0.00,181,11.51,0.00,0.24,1.00,2.75,0.00,0.00,6.48,N,N,N,N,N,1.50,27
19854073,HV0005,2025-06-30 23:36:02,0.00,92,6.54,0.00,0.16,0.58,0.00,0.00,0.00,4.00,N,N,N,N,N,0.00,27


In [None]:
# Summary statistics (float64/int64 columns only) for the zero-or-less distance trips
df.loc[df['trip_miles'].le(0)].select_dtypes(include=['float64', 'int64']).describe()

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 36, Finished, Available, Finished)

Unnamed: 0,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,cbd_congestion_fee
count,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0
mean,0.0,301.43,11.68,0.01,0.28,1.03,0.8,0.09,0.31,7.91,0.36
std,0.0,328.58,7.56,0.23,0.19,0.7,1.25,0.46,2.04,6.14,0.64
min,0.0,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
25%,0.0,120.0,7.19,0.0,0.18,0.64,0.0,0.0,0.0,4.0,0.0
50%,0.0,192.0,8.93,0.0,0.21,0.78,0.0,0.0,0.0,6.26,0.0
75%,0.0,366.25,12.96,0.0,0.31,1.14,2.75,0.0,0.0,9.6,0.0
max,0.0,3292.0,67.84,10.0,1.7,6.01,2.75,2.5,27.26,72.57,1.5


In [None]:
# Keep only trips that covered a strictly positive distance
df = df.loc[df['trip_miles'].gt(0)]

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 37, Finished, Available, Finished)

In [None]:
# Re-inspect the DataFrame (row count, columns, dtypes) after the filters above
df.info()

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 38, Finished, Available, Finished)

<class 'pandas.core.frame.DataFrame'>
Index: 19847312 entries, 0 to 19868008
Data columns (total 19 columns):
 #   Column                Dtype         
---  ------                -----         
 0   hvfhs_license_num     object        
 1   pickup_datetime       datetime64[ns]
 2   trip_miles            float64       
 3   trip_time             int64         
 4   base_passenger_fare   float64       
 5   tolls                 float64       
 6   bcf                   float64       
 7   sales_tax             float64       
 8   congestion_surcharge  float64       
 9   airport_fee           float64       
 10  tips                  float64       
 11  driver_pay            float64       
 12  shared_request_flag   object        
 13  shared_match_flag     object        
 14  access_a_ride_flag    object        
 15  wav_request_flag      object        
 16  wav_match_flag        object        
 17  cbd_congestion_fee    float64       
 18  week_number           UInt32        
dtypes: 

In [None]:
# Summary statistics of the float64/int64 columns of df
df.select_dtypes(include=['float64', 'int64']).describe()

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 39, Finished, Available, Finished)

Unnamed: 0,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,cbd_congestion_fee
count,19847312.0,19847312.0,19847312.0,19847312.0,19847312.0,19847312.0,19847312.0,19847312.0,19847312.0,19847312.0,19847312.0
mean,5.13,1220.83,28.06,1.12,0.7,2.33,0.97,0.23,1.24,21.65,0.51
std,6.04,900.13,25.82,3.64,0.67,2.12,1.31,0.72,3.73,19.0,0.71
min,0.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
25%,1.56,600.0,12.68,0.0,0.31,1.04,0.0,0.0,0.0,9.5,0.0
50%,3.04,980.0,20.03,0.0,0.49,1.68,0.0,0.0,0.0,16.07,0.0
75%,6.44,1568.0,33.66,0.0,0.84,2.86,2.75,0.0,0.0,27.3,1.5
max,321.55,34989.0,1450.51,79.82,36.26,130.15,5.5,10.0,237.6,1033.02,3.0


#### Sample Data

In [None]:
# Draw a reproducible random sample of 2,000,000 rows to use as training data
train_df = df.sample(n=2_000_000, random_state=100)

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 40, Finished, Available, Finished)

In [None]:
# Compare the size of the full frame with the training sample
full_shape, sample_shape = df.shape, train_df.shape
print("shape of initial df:", full_shape)
print("Shape of training data:", sample_shape)
print("% of data in training:", (sample_shape[0] / full_shape[0])*100)

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 41, Finished, Available, Finished)

shape of initial df: (19847312, 19)
Shape of training data: (2000000, 19)
% of data in training: 10.076931324503793


In [None]:
# Summary statistics of the float64/int64 columns of the training sample
train_df.select_dtypes(include=['float64', 'int64']).describe()

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 42, Finished, Available, Finished)

Unnamed: 0,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,cbd_congestion_fee
count,2000000.0,2000000.0,2000000.0,2000000.0,2000000.0,2000000.0,2000000.0,2000000.0,2000000.0,2000000.0,2000000.0
mean,5.12,1218.97,28.0,1.12,0.7,2.33,0.97,0.23,1.24,21.6,0.51
std,6.01,898.32,25.7,3.63,0.67,2.12,1.31,0.72,3.72,18.91,0.71
min,0.01,1.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
25%,1.56,600.0,12.67,0.0,0.31,1.04,0.0,0.0,0.0,9.48,0.0
50%,3.03,978.0,19.99,0.0,0.49,1.68,0.0,0.0,0.0,16.05,0.0
75%,6.42,1566.0,33.57,0.0,0.84,2.86,2.75,0.0,0.0,27.25,1.5
max,266.56,29430.0,1069.28,65.59,26.73,97.29,5.5,10.0,179.14,692.87,3.0


#### Write training data back to lakehouse 

In [None]:
# Persist the training sample to the "train" folder of the Lakehouse.
# index=False leaves the pandas index out of the file: after the filtering and
# random sampling steps it only holds leftover row labels, and the weekly
# partition files in this notebook are written without it as well.
train_df.to_parquet("/lakehouse/default/Files/train/train_df.parquet", index=False)

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 43, Finished, Available, Finished)

In [None]:
# Turn the pandas training sample into a Spark DataFrame and save it as the
# Delta table "training_df"; "overwrite" mode replaces any existing table.
(
    spark.createDataFrame(train_df)
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("training_df")
)

StatementMeta(, a997e95f-0345-4eb7-95ce-4ebec1cf861c, 44, Finished, Available, Finished)