In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [9]:
data = pd.read_csv(r'D:\Ola_Data_Analysis_Project\OLA_Booking_Data\booking_data.csv')

In [10]:
df = data.copy()

In [11]:
print(df.head(10))
print('-' * 100)
print(df.tail(10))
print('-' * 100)
print(df.info())

       Date     Time  Booking ID       Booking Status Customer ID  \
0  6/1/2025  0:00:00  CNR7019659              Success    CID56465   
1  6/1/2025  0:00:00  CNR7784516              Success    CID41109   
2  6/1/2025  0:00:00  CNR1133821              Success    CID72820   
3  6/1/2025  0:00:00  CNR8269117              Success    CID17191   
4  6/1/2025  0:00:00  CNR6856424              Success    CID33669   
5  6/1/2025  0:01:00  CNR6311388              Success    CID34966   
6  6/1/2025  0:01:00  CNR5074489  Cancelled by Driver    CID88873   
7  6/1/2025  0:01:00  CNR6066625  Cancelled by Driver    CID98098   
8  6/1/2025  0:01:00  CNR5102876              Success    CID52872   
9  6/1/2025  0:01:00  CNR1162009              Success    CID49651   

  Vehicle Type Pickup Location Drop Location  Avg VTAT  Avg CTAT  \
0         Mini        Madhapur   Musheerabad      10.0      16.0   
1         Auto      Kukatpally      Saidabad      12.0      18.0   
2    Prime SUV   Santosh Nagar     K

In [12]:
# Fill nulls using mean (floored) for columns with symmetric distribution
for col in ['Avg VTAT', 'Avg CTAT', 'Ride Distance', 'Driver Ratings', 'Customer Rating']:
    if df[col].isnull().any():
        # Calculate mean, then apply floor to round down to the nearest integer
        floored_mean = np.floor(df[col].mean())
        df[col].fillna(floored_mean, inplace=True)

# Fill nulls using median (floored) for 'Booking Value' due to its skewed distribution
if df['Booking Value'].isnull().any():
    # Calculate median, then apply floor to round down to the nearest integer
    floored_median = np.floor(df['Booking Value'].median())
    df['Booking Value'].fillna(floored_median, inplace=True)

# Verify that null values have been filled for these columns
print("Null counts after imputation:")
print(df[['Avg VTAT', 'Avg CTAT', 'Booking Value', 'Ride Distance', 'Driver Ratings', 'Customer Rating']].isnull().sum())

Null counts after imputation:
Avg VTAT           0
Avg CTAT           0
Booking Value      0
Ride Distance      0
Driver Ratings     0
Customer Rating    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(floored_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Booking Value'].fillna(floored_median, inplace=True)


In [13]:
# --- Imputation for 'Cancelled Rides' and 'Incomplete Rides' related columns ---

# Fill nulls in numerical 'count' columns with 0
# A null in these often means the event did not occur.
for col in ['Cancelled Rides by Customer', 'Cancelled Rides by Driver', 'Incomplete Rides']:
    if df[col].isnull().any():
        df[col].fillna(0, inplace=True)

# Fill nulls in categorical 'reason' columns with 'Not Applicable'
# This marks instances where no reason was recorded because the event didn't happen.
for col in ['Reason for cancelling by Customer', 'Reason for cancelling by Driver', 'Incomplete Rides Reason']:
    if df[col].isnull().any():
        df[col].fillna('Not Applicable', inplace=True)

# Verify that null values have been filled for all targeted columns
print("Null counts after all imputations:")
print(df.isnull().sum())

Null counts after all imputations:
Date                                 0
Time                                 0
Booking ID                           0
Booking Status                       0
Customer ID                          0
Vehicle Type                         0
Pickup Location                      0
Drop Location                        0
Avg VTAT                             0
Avg CTAT                             0
Cancelled Rides by Customer          0
Reason for cancelling by Customer    0
Cancelled Rides by Driver            0
Reason for cancelling by Driver      0
Incomplete Rides                     0
Incomplete Rides Reason              0
Booking Value                        0
Ride Distance                        0
Driver Ratings                       0
Customer Rating                      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Not Applicable', inplace=True)


In [14]:
#df.to_csv(r'OLA_Booking_Data\booking_data_cleaned.csv', index=False)

#print("Changes saved to 'booking_data_cleaned.csv' successfully.")

In [15]:
# THIS BIT OF CODE IS ESSENTIAL TO KNOW THE SHAPE OF OUR DATAFRAME(ACTUAL DATASET BEFORE CONVERSION)
print("No of Rows in this dataset - ", len(df))
print("-" * 50)
print("No of Columns in this dataset - ", df.shape[1])
print("-" * 50)
print("Size of this dataset - ", df.shape)

No of Rows in this dataset -  100000
--------------------------------------------------
No of Columns in this dataset -  20
--------------------------------------------------
Size of this dataset -  (100000, 20)


In [16]:
# THIS BIT OF CODE IS ESSENTIAL TO KNOW THE COUNT OF UNIQUE VALUES FROM EVERY ATTRIBUTE
print(df.nunique())

Date                                    30
Time                                  1440
Booking ID                           99483
Booking Status                           4
Customer ID                          60488
Vehicle Type                             7
Pickup Location                         50
Drop Location                           50
Avg VTAT                                14
Avg CTAT                                19
Cancelled Rides by Customer              2
Reason for cancelling by Customer        6
Cancelled Rides by Driver                2
Reason for cancelling by Driver          5
Incomplete Rides                         2
Incomplete Rides Reason                  4
Booking Value                        46258
Ride Distance                         2401
Driver Ratings                          21
Customer Rating                         21
dtype: int64
