In [None]:
import pandas as pd

In [None]:
# Load the raw ride-bookings CSV; the relative path is resolved against the
# notebook's current working directory.
dataset = pd.read_csv("../data/ncr_ride_bookings.csv")


def rename_columns(df):
    """Normalise the column labels of ``df`` to snake_case-like names.

    Every label is lowercased and each space is replaced by an underscore.
    The labels are rewritten on the frame that was passed in, and that same
    frame object is returned so the call can be used in an assignment.
    """
    labels = df.columns.str.lower()
    labels = labels.str.replace(" ", "_")
    df.columns = labels
    return df


# Normalise the column labels (lowercase, spaces -> underscores) so later
# cells can refer to columns by their snake_case names.
dataset = rename_columns(dataset)

In [None]:
# Preview the first rows of the frame with its renamed columns.
dataset.head()

In [None]:
# For each column, print a random selection of up to five non-null values and
# the number of unique values.
#
# The sample size is capped at the number of non-null entries: Series.sample
# raises ValueError when asked for more rows than are available (sampling is
# without replacement), which would abort the loop on sparsely populated
# columns.

for col in dataset.columns:
    non_null = dataset[col].dropna()
    print(f"Column: {col}:")
    print(non_null.sample(min(5, len(non_null))).values)
    print(f"Number of unique values: {dataset[col].nunique()}")
    print("\n")

In [None]:
def cast_to_dtypes(df):
    """Return a copy of the ride-bookings frame with explicit column dtypes.

    The frame must already use the lowercase, underscore-separated column
    names (``date``, ``time``, ``booking_id``, ...). The input frame is not
    modified.

    Conversions applied:

    * ``date`` and ``time`` are parsed and merged into one ``datetime``
      column, appended as the last column; the two source columns are
      dropped. Rows whose date or time cannot be parsed get ``NaT``.
    * ``booking_id`` and ``customer_id`` are cast to the pandas string dtype.
    * Status, vehicle, location, reason and payment columns are cast to
      ``category``.
    * Timing, count, value, distance and rating columns are converted with
      ``pd.to_numeric``; unparseable entries become ``NaN``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new, typed frame.

    Raises
    ------
    KeyError
        If one or more of the expected columns is absent; the message lists
        every missing column.
    """
    string_cols = ("booking_id", "customer_id")
    category_cols = (
        "booking_status",
        "vehicle_type",
        "pickup_location",
        "drop_location",
        "reason_for_cancelling_by_customer",
        "driver_cancellation_reason",
        "incomplete_rides_reason",
        "payment_method",
    )
    numeric_cols = (
        "avg_vtat",
        "avg_ctat",
        "cancelled_rides_by_driver",
        "cancelled_rides_by_customer",
        "incomplete_rides",
        "booking_value",
        "ride_distance",
        "driver_ratings",
        "customer_rating",
    )

    # Fail early with one message naming every absent column instead of a
    # KeyError on whichever column happens to be touched first.
    required = ("date", "time", *string_cols, *category_cols, *numeric_cols)
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(f"cast_to_dtypes: missing expected column(s): {missing}")

    new_df = df.copy()

    parsed_date = pd.to_datetime(new_df["date"], errors="coerce")
    parsed_time = pd.to_datetime(new_df["time"], format="%H:%M:%S", errors="coerce")
    # Parsing a bare time yields a timestamp on a placeholder day; subtracting
    # that day's midnight leaves just the time-of-day as a timedelta, which can
    # be added to the calendar date without a string round-trip.
    time_of_day = parsed_time - parsed_time.dt.normalize()
    new_df["datetime"] = parsed_date.dt.normalize() + time_of_day
    new_df = new_df.drop(columns=["date", "time"])

    # Assigning back into existing columns keeps the original column order.
    for name in string_cols:
        new_df[name] = new_df[name].astype(pd.StringDtype())
    for name in category_cols:
        new_df[name] = new_df[name].astype("category")
    for name in numeric_cols:
        new_df[name] = pd.to_numeric(new_df[name], errors="coerce")

    return new_df


# Build the typed frame under a new name; cast_to_dtypes works on a copy, so
# `dataset` itself is left as loaded.
dataset_typed = cast_to_dtypes(dataset)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
dataset_typed.info()

In [None]:
# Get number of rows that have no null values in any column

