# 🚕 Cab Ride Operations Analysis

## Objective  
Evaluate cab ride data to identify trends in fare efficiency, operational performance across cities, and the impact of vehicle age and ride timing on earnings.

## 1. Data Cleaning  
Replaced zero or invalid entries in Fare and Distance with NaN and removed incomplete records to ensure data reliability.

## 2. Feature Engineering  
Created an Earnings per Kilometer feature (Fare ÷ Distance_km) and derived a categorical Vehicle_Age feature that labels vehicles manufactured in 2014 or earlier as "Old" and later ones as "New".

## 3. Grouping and Aggregation  
Analyzed average earnings and trip patterns by city, vehicle age, and time of day to uncover operational differences.

## 4. Data Transformation  
Applied a pivot table (City × Time_of_Day) for multidimensional fare comparisons and reshaped the dataset into long format with melt for targeted analysis.

## 5. Normalization  
Standardized fare and distance variables using Z-score to normalize for scale and detect outliers.

## 6. Insights  
Extracted actionable patterns from grouped metrics, including top-performing cities and time periods, and the influence of vehicle age on ride earnings.


In [1]:
import pandas as pd
import numpy as np

In [2]:
import pandas as pd

# Raw ride records, one tuple per ride, in the column order given below.
# A Fare/Distance_km of 0 marks a ride whose measurement is missing.
_columns = [
    "Ride_ID", "Driver_Age", "Vehicle_Year", "City",
    "Fare", "Distance_km", "Time_of_Day",
]
_rides = [
    (101, 25, 2015, "Delhi", 300, 10, "Morning"),
    (102, 30, 2018, "Mumbai", 450, 15, "Evening"),
    (103, 45, 2012, "Delhi", 0, 0, "Morning"),
    (104, 28, 2019, "Bangalore", 600, 20, "Afternoon"),
    (105, 35, 2010, "Mumbai", 500, 18, "Evening"),
    (106, 60, 2008, "Delhi", 0, 0, "Morning"),
    (107, 40, 2011, "Bangalore", 700, 25, "Night"),
    (108, 50, 2014, "Delhi", 0, 0, "Afternoon"),
    (109, 22, 2020, "Mumbai", 800, 30, "Night"),
    (110, 38, 2016, "Bangalore", 650, 22, "Morning"),
]

# Transpose the rows into a column-oriented dict (column name -> list of values).
data = {name: list(values) for name, values in zip(_columns, zip(*_rides))}

cab = pd.DataFrame(data)

In [3]:
cab

Unnamed: 0,Ride_ID,Driver_Age,Vehicle_Year,City,Fare,Distance_km,Time_of_Day
0,101,25,2015,Delhi,300,10,Morning
1,102,30,2018,Mumbai,450,15,Evening
2,103,45,2012,Delhi,0,0,Morning
3,104,28,2019,Bangalore,600,20,Afternoon
4,105,35,2010,Mumbai,500,18,Evening
5,106,60,2008,Delhi,0,0,Morning
6,107,40,2011,Bangalore,700,25,Night
7,108,50,2014,Delhi,0,0,Afternoon
8,109,22,2020,Mumbai,800,30,Night
9,110,38,2016,Bangalore,650,22,Morning


In [15]:
# Fare and Distance_km use 0 as a placeholder for a missing measurement.
# Replace the numeric 0 (not the string "NaN", which never occurs in these
# numeric columns) with NaN so the incomplete rides can be dropped afterwards.
cab[["Fare", "Distance_km"]] = cab[["Fare", "Distance_km"]].replace(0, np.nan)

  cab[["Fare", "Distance_km"]] = cab[["Fare", "Distance_km"]].replace("NaN", np.nan)


In [16]:
cab

Unnamed: 0,Ride_ID,Driver_Age,Vehicle_Year,City,Fare,Distance_km,Time_of_Day
0,101,25,2015,Delhi,300.0,10.0,Morning
1,102,30,2018,Mumbai,450.0,15.0,Evening
2,103,45,2012,Delhi,,,Morning
3,104,28,2019,Bangalore,600.0,20.0,Afternoon
4,105,35,2010,Mumbai,500.0,18.0,Evening
5,106,60,2008,Delhi,,,Morning
6,107,40,2011,Bangalore,700.0,25.0,Night
7,108,50,2014,Delhi,,,Afternoon
8,109,22,2020,Mumbai,800.0,30.0,Night
9,110,38,2016,Bangalore,650.0,22.0,Morning


In [20]:
# Drop, in place, every ride (row) that has a missing value in any column.
cab.dropna(axis=0, how="any", inplace=True)

In [21]:
cab

Unnamed: 0,Ride_ID,Driver_Age,Vehicle_Year,City,Fare,Distance_km,Time_of_Day
0,101,25,2015,Delhi,300.0,10.0,Morning
1,102,30,2018,Mumbai,450.0,15.0,Evening
3,104,28,2019,Bangalore,600.0,20.0,Afternoon
4,105,35,2010,Mumbai,500.0,18.0,Evening
6,107,40,2011,Bangalore,700.0,25.0,Night
8,109,22,2020,Mumbai,800.0,30.0,Night
9,110,38,2016,Bangalore,650.0,22.0,Morning


In [22]:
# Renumber the remaining rows 0..n-1, discarding the gapped index left by dropna.
cab.index = pd.RangeIndex(len(cab))

In [23]:
cab

Unnamed: 0,Ride_ID,Driver_Age,Vehicle_Year,City,Fare,Distance_km,Time_of_Day
0,101,25,2015,Delhi,300.0,10.0,Morning
1,102,30,2018,Mumbai,450.0,15.0,Evening
2,104,28,2019,Bangalore,600.0,20.0,Afternoon
3,105,35,2010,Mumbai,500.0,18.0,Evening
4,107,40,2011,Bangalore,700.0,25.0,Night
5,109,22,2020,Mumbai,800.0,30.0,Night
6,110,38,2016,Bangalore,650.0,22.0,Morning


In [24]:
# Average fare per city.
cab["Fare"].groupby(cab["City"]).mean()

City
Bangalore    650.000000
Delhi        300.000000
Mumbai       583.333333
Name: Fare, dtype: float64

In [25]:
# Average fare per time-of-day bucket.
cab["Fare"].groupby(cab["Time_of_Day"]).mean()

Time_of_Day
Afternoon    600.0
Evening      475.0
Morning      475.0
Night        750.0
Name: Fare, dtype: float64

In [27]:
# Number of rides per vehicle manufacturing year.
cab["Ride_ID"].groupby(cab["Vehicle_Year"]).count()

Vehicle_Year
2010    1
2011    1
2015    1
2016    1
2018    1
2019    1
2020    1
Name: Ride_ID, dtype: int64

In [29]:
# Total fare collected per driver age.
cab["Fare"].groupby(cab["Driver_Age"]).sum()

Driver_Age
22    800.0
25    300.0
28    600.0
30    450.0
35    500.0
38    650.0
40    700.0
Name: Fare, dtype: float64

In [32]:
# Fare earned per kilometre travelled, computed element-wise per ride.
cab["Earnings_Per_KM"] = cab["Fare"].div(cab["Distance_km"])

In [33]:
cab

Unnamed: 0,Ride_ID,Driver_Age,Vehicle_Year,City,Fare,Distance_km,Time_of_Day,Earnings_Per_KM
0,101,25,2015,Delhi,300.0,10.0,Morning,30.0
1,102,30,2018,Mumbai,450.0,15.0,Evening,30.0
2,104,28,2019,Bangalore,600.0,20.0,Afternoon,30.0
3,105,35,2010,Mumbai,500.0,18.0,Evening,27.777778
4,107,40,2011,Bangalore,700.0,25.0,Night,28.0
5,109,22,2020,Mumbai,800.0,30.0,Night,26.666667
6,110,38,2016,Bangalore,650.0,22.0,Morning,29.545455


In [35]:
def Vehicle_Age(x, cutoff_year=2014):
    """Classify a vehicle as "Old" or "New" from its manufacturing year.

    Args:
        x: Manufacturing year of the vehicle.
        cutoff_year: Last year that still counts as "Old". Defaults to 2014,
            so existing single-argument callers keep the same behavior.

    Returns:
        "Old" if ``x`` is less than or equal to ``cutoff_year``, else "New".
    """
    return "Old" if x <= cutoff_year else "New"

# Label each ride's vehicle as "Old"/"New" by passing its year through Vehicle_Age.
cab["Vehicle_Age"] = cab["Vehicle_Year"].map(Vehicle_Age)
cab

Unnamed: 0,Ride_ID,Driver_Age,Vehicle_Year,City,Fare,Distance_km,Time_of_Day,Earnings_Per_KM,Vehicle_Age
0,101,25,2015,Delhi,300.0,10.0,Morning,30.0,New
1,102,30,2018,Mumbai,450.0,15.0,Evening,30.0,New
2,104,28,2019,Bangalore,600.0,20.0,Afternoon,30.0,New
3,105,35,2010,Mumbai,500.0,18.0,Evening,27.777778,Old
4,107,40,2011,Bangalore,700.0,25.0,Night,28.0,Old
5,109,22,2020,Mumbai,800.0,30.0,Night,26.666667,New
6,110,38,2016,Bangalore,650.0,22.0,Morning,29.545455,New


In [36]:
# Average fare for old versus new vehicles.
cab["Fare"].groupby(cab["Vehicle_Age"]).mean()

Vehicle_Age
New    560.0
Old    600.0
Name: Fare, dtype: float64

In [37]:
# Mean fare for every (City, Time_of_Day) pair; combinations with no rides are NaN.
pivot = pd.pivot_table(
    cab,
    index="City",
    columns="Time_of_Day",
    values="Fare",
    aggfunc="mean",
)
pivot

Time_of_Day,Afternoon,Evening,Morning,Night
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bangalore,600.0,,650.0,700.0
Delhi,,,300.0,
Mumbai,,475.0,,800.0


In [38]:
# Reshape to long format: one row per (ride, metric) pair, keeping the ride's
# identifying columns and stacking the three numeric metrics into Metric/Value.
melted = pd.melt(
    cab,
    id_vars=["Ride_ID", "City", "Time_of_Day"],
    value_vars=["Fare", "Distance_km", "Earnings_Per_KM"],
    var_name="Metric",
    value_name="Value",
)
melted

Unnamed: 0,Ride_ID,City,Time_of_Day,Metric,Value
0,101,Delhi,Morning,Fare,300.0
1,102,Mumbai,Evening,Fare,450.0
2,104,Bangalore,Afternoon,Fare,600.0
3,105,Mumbai,Evening,Fare,500.0
4,107,Bangalore,Night,Fare,700.0
5,109,Mumbai,Night,Fare,800.0
6,110,Bangalore,Morning,Fare,650.0
7,101,Delhi,Morning,Distance_km,10.0
8,102,Mumbai,Evening,Distance_km,15.0
9,104,Bangalore,Afternoon,Distance_km,20.0


In [40]:
# Total fare per city, ordered from highest-earning to lowest.
_fare_totals = cab["Fare"].groupby(cab["City"]).sum()
city_earnings = _fare_totals.sort_values(ascending=False)

# Rank 1.0 is the top-earning city; attach each ride's city rank to the ride.
_city_ranks = city_earnings.rank(ascending=False)
cab["City_Rank_By_Earnings"] = cab["City"].map(_city_ranks)
cab


Unnamed: 0,Ride_ID,Driver_Age,Vehicle_Year,City,Fare,Distance_km,Time_of_Day,Earnings_Per_KM,Vehicle_Age,City_Rank_By_Earnings
0,101,25,2015,Delhi,300.0,10.0,Morning,30.0,New,3.0
1,102,30,2018,Mumbai,450.0,15.0,Evening,30.0,New,2.0
2,104,28,2019,Bangalore,600.0,20.0,Afternoon,30.0,New,1.0
3,105,35,2010,Mumbai,500.0,18.0,Evening,27.777778,Old,2.0
4,107,40,2011,Bangalore,700.0,25.0,Night,28.0,Old,1.0
5,109,22,2020,Mumbai,800.0,30.0,Night,26.666667,New,2.0
6,110,38,2016,Bangalore,650.0,22.0,Morning,29.545455,New,1.0


In [41]:
# Z-score of Fare: subtract the column mean, then divide by Series.std().
cab["Fare_Normalized"] = cab["Fare"].sub(cab["Fare"].mean()).div(cab["Fare"].std())

In [43]:
# Z-score of Distance_km: subtract the column mean, then divide by Series.std().
cab["Distance_Normalized"] = (
    cab["Distance_km"]
    .sub(cab["Distance_km"].mean())
    .div(cab["Distance_km"].std())
)

In [44]:
cab

Unnamed: 0,Ride_ID,Driver_Age,Vehicle_Year,City,Fare,Distance_km,Time_of_Day,Earnings_Per_KM,Vehicle_Age,City_Rank_By_Earnings,Fare_Normalized,Distance_Normalized
0,101,25,2015,Delhi,300.0,10.0,Morning,30.0,New,3.0,-1.615924,-1.524986
1,102,30,2018,Mumbai,450.0,15.0,Evening,30.0,New,2.0,-0.722914,-0.762493
2,104,28,2019,Bangalore,600.0,20.0,Afternoon,30.0,New,1.0,0.170097,0.0
3,105,35,2010,Mumbai,500.0,18.0,Evening,27.777778,Old,2.0,-0.425243,-0.304997
4,107,40,2011,Bangalore,700.0,25.0,Night,28.0,Old,1.0,0.765438,0.762493
5,109,22,2020,Mumbai,800.0,30.0,Night,26.666667,New,2.0,1.360778,1.524986
6,110,38,2016,Bangalore,650.0,22.0,Morning,29.545455,New,1.0,0.467768,0.304997
