In [None]:
import pandas as pd

In [25]:
# Load the raw ride-bookings CSV; the path is resolved relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer="../data/ncr_ride_bookings.csv")


def rename_columns(df):
    """Return a copy of ``df`` whose column names are lowercase snake-style.

    Every column label is lowercased and each space is replaced with an
    underscore. The input frame is left untouched so that re-running a cell
    that calls this function never changes data another cell already displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same data and the normalised column labels.
    """
    renamed = df.copy()
    # regex=False: the space is a literal character, not a pattern.
    renamed.columns = renamed.columns.str.lower().str.replace(" ", "_", regex=False)
    return renamed


# Normalise the column labels (lowercase, underscores instead of spaces) right after loading.
dataset = dataset.pipe(rename_columns)

In [26]:
# Preview the first five rows to sanity-check the load and the renamed columns.
dataset.head(n=5)

Unnamed: 0,date,time,booking_id,booking_status,customer_id,vehicle_type,pickup_location,drop_location,avg_vtat,avg_ctat,...,reason_for_cancelling_by_customer,cancelled_rides_by_driver,driver_cancellation_reason,incomplete_rides,incomplete_rides_reason,booking_value,ride_distance,driver_ratings,customer_rating,payment_method
0,2024-03-23,12:29:38,"""CNR5884300""",No Driver Found,"""CID1982111""",eBike,Palam Vihar,Jhilmil,,,...,,,,,,,,,,
1,2024-11-29,18:01:39,"""CNR1326809""",Incomplete,"""CID4604802""",Go Sedan,Shastri Nagar,Gurgaon Sector 56,4.9,14.0,...,,,,1.0,Vehicle Breakdown,237.0,5.73,,,UPI
2,2024-08-23,08:56:10,"""CNR8494506""",Completed,"""CID9202816""",Auto,Khandsa,Malviya Nagar,13.4,25.8,...,,,,,,627.0,13.58,4.9,4.9,Debit Card
3,2024-10-21,17:17:25,"""CNR8906825""",Completed,"""CID2610914""",Premier Sedan,Central Secretariat,Inderlok,13.1,28.5,...,,,,,,416.0,34.02,4.6,5.0,UPI
4,2024-09-16,22:08:00,"""CNR1950162""",Completed,"""CID9933542""",Bike,Ghitorni Village,Khan Market,5.3,19.6,...,,,,,,737.0,48.21,4.1,4.3,UPI


In [39]:
# For each column, print a random selection of non-null values and the number of unique values.
# The sample size is capped at the number of non-null values because Series.sample raises
# ValueError when asked for more rows than are available (sampling without replacement).
# A fixed seed keeps the printed sample identical across Restart & Run All.

SAMPLE_SIZE = 5
SAMPLE_SEED = 42

for col in dataset.columns:
    non_null_values = dataset[col].dropna()
    shown = non_null_values.sample(
        n=min(SAMPLE_SIZE, len(non_null_values)), random_state=SAMPLE_SEED
    )
    print(f"Column: {col}:")
    print(shown.values)
    print(f"Number of unique values: {dataset[col].nunique()}")
    print("\n")

Column: date:
['2024-09-27' '2024-07-28' '2024-02-07' '2024-05-20' '2024-02-24']
Number of unique values: 365


Column: time:
['11:50:42' '15:33:46' '16:52:04' '00:53:32' '20:25:13']
Number of unique values: 62910


Column: booking_id:
['"CNR9355138"' '"CNR3442948"' '"CNR2190634"' '"CNR2078514"'
 '"CNR8576838"']
Number of unique values: 148767


Column: booking_status:
['Incomplete' 'Completed' 'Completed' 'Completed' 'Completed']
Number of unique values: 5


Column: customer_id:
['"CID2536372"' '"CID1610955"' '"CID1393054"' '"CID5031815"'
 '"CID9792814"']
Number of unique values: 148788


Column: vehicle_type:
['Auto' 'Bike' 'Go Mini' 'Go Sedan' 'Go Mini']
Number of unique values: 7


Column: pickup_location:
['Manesar' 'Lok Kalyan Marg' 'Golf Course Road' 'Karkarduma' 'Azadpur']
Number of unique values: 176


Column: drop_location:
['Indraprastha' 'Vinobapuri' 'Anand Vihar ISBT' 'Barakhamba Road'
 'India Gate']
Number of unique values: 176


Column: avg_vtat:
[ 8.   7.3 10.1  3.6  6.

In [43]:
def cast_to_dtypes(df):
    """Return a typed copy of the bookings frame; the input is not modified.

    The ``date`` and ``time`` string columns are concatenated and parsed into a
    single ``datetime`` column (appended as the last column) and then removed.
    The remaining columns are cast by group:

    * identifiers -> pandas ``string`` dtype
    * low-cardinality labels and free-text reasons -> ``category``
    * measurements and flags -> numeric via ``pd.to_numeric(errors="coerce")``,
      so values that cannot be parsed become NaN instead of raising

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the renamed (lowercase, underscore) column labels.

    Returns
    -------
    pandas.DataFrame
        New frame with the dtypes described above.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``df``.
    """
    # NOTE(review): sampled booking_id / customer_id values appear to carry
    # literal double quotes (e.g. '"CNR9355138"'); they are preserved as-is
    # here -- confirm whether they should be stripped upstream.
    string_columns = ["booking_id", "customer_id"]
    category_columns = [
        "booking_status",
        "vehicle_type",
        "pickup_location",
        "drop_location",
        "reason_for_cancelling_by_customer",
        "driver_cancellation_reason",
        "incomplete_rides_reason",
        "payment_method",
    ]
    numeric_columns = [
        "avg_vtat",
        "avg_ctat",
        "cancelled_rides_by_driver",
        "cancelled_rides_by_customer",
        "incomplete_rides",
        "booking_value",
        "ride_distance",
        "driver_ratings",
        "customer_rating",
    ]

    # Build the timestamp from the input's own columns before they are dropped.
    timestamps = pd.to_datetime(df["date"] + " " + df["time"])

    dtype_map = {name: pd.StringDtype() for name in string_columns}
    dtype_map.update({name: "category" for name in category_columns})

    # drop / assign / astype each return a new frame, so ``df`` stays untouched.
    new_df = (
        df.drop(columns=["date", "time"])
        .assign(datetime=timestamps)
        .astype(dtype_map)
    )

    # to_numeric works on one Series at a time; coerce turns bad values into NaN.
    for name in numeric_columns:
        new_df[name] = pd.to_numeric(new_df[name], errors="coerce")

    return new_df


# Keep the typed frame under its own name so the renamed-but-untyped `dataset` stays available.
dataset_typed = dataset.pipe(cast_to_dtypes)

In [44]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
dataset_typed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 20 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   booking_id                         150000 non-null  string        
 1   booking_status                     150000 non-null  category      
 2   customer_id                        150000 non-null  string        
 3   vehicle_type                       150000 non-null  category      
 4   pickup_location                    150000 non-null  category      
 5   drop_location                      150000 non-null  category      
 6   avg_vtat                           139500 non-null  float64       
 7   avg_ctat                           102000 non-null  float64       
 8   cancelled_rides_by_customer        10500 non-null   float64       
 9   reason_for_cancelling_by_customer  10500 non-null   category      
 10  cancelled_rides_by_d