In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [2]:
df = pd.read_csv('dynamic_pricing.csv')
df.head()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Number_of_Riders         1000 non-null   int64  
 1   Number_of_Drivers        1000 non-null   int64  
 2   Location_Category        1000 non-null   object 
 3   Customer_Loyalty_Status  1000 non-null   object 
 4   Number_of_Past_Rides     1000 non-null   int64  
 5   Average_Ratings          1000 non-null   float64
 6   Time_of_Booking          1000 non-null   object 
 7   Vehicle_Type             1000 non-null   object 
 8   Expected_Ride_Duration   1000 non-null   int64  
 9   Historical_Cost_of_Ride  1000 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 78.2+ KB


In [4]:
df.describe()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Number_of_Past_Rides,Average_Ratings,Expected_Ride_Duration,Historical_Cost_of_Ride
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,60.372,27.076,50.031,4.25722,99.588,372.502623
std,23.701506,19.068346,29.313774,0.435781,49.16545,187.158756
min,20.0,5.0,0.0,3.5,10.0,25.993449
25%,40.0,11.0,25.0,3.87,59.75,221.365202
50%,60.0,22.0,51.0,4.27,102.0,362.019426
75%,81.0,38.0,75.0,4.6325,143.0,510.497504
max,100.0,89.0,100.0,5.0,180.0,836.116419


In [5]:
fig = px.scatter(df, x = 'Expected_Ride_Duration', y= 'Historical_Cost_of_Ride',
                 title='Expected Ride Duration vs Historical Cost of Ride', trendline='ols')

fig.show()

In [6]:
fig = px.box(df, x='Vehicle_Type', y='Historical_Cost_of_Ride',
             title='Historical Cost of Ride by Vehicle Type')

fig.show()

In [9]:
corr_matrix = df.select_dtypes(include=['float64', 'int64']).corr()


In [10]:
fig = go.Figure(data  = go.Heatmap(
    z = corr_matrix.values,
    x = corr_matrix.columns,
    y = corr_matrix.columns,
    colorscale = 'Viridis'
))

fig.update_layout(title='Correlation Matrix Heatmap')
fig.show()

In [13]:
import numpy as np

high_demand_threshold = 75
low_demand_threshold = 25

df['demand_multiplier'] = np.where(df['Number_of_Riders'] > np.percentile(df['Number_of_Riders'], high_demand_threshold),
                                     df['Number_of_Riders'] / np.percentile(df['Number_of_Riders'], high_demand_threshold),
                                     df['Number_of_Riders'] / np.percentile(df['Number_of_Riders'], low_demand_threshold))

high_supply_percentile = 75
low_supply_percentile = 25

df['supply_multiplier'] = np.where(df['Number_of_Drivers'] > np.percentile(df['Number_of_Drivers'], low_supply_percentile),
                                     np.percentile(df['Number_of_Drivers'], high_supply_percentile) / df['Number_of_Drivers'],
                                     np.percentile(df['Number_of_Drivers'], low_supply_percentile) / df['Number_of_Drivers'])

demand_threshold_high = 1.2
demand_threshold_low = 0.8
supply_threshold_high = 1.2
supply_threshold_low = 0.8

df['adjusted_ride_cost'] = df['Historical_Cost_of_Ride'] * (
    np.maximum(df['demand_multiplier'], demand_threshold_low) *
    np.maximum(df['supply_multiplier'], supply_threshold_high)
)


In [16]:
df['profit_percentage'] = (df['adjusted_ride_cost'] - df['Historical_Cost_of_Ride']) / df['Historical_Cost_of_Ride'] * 100

profitable_rides = df[df['profit_percentage'] > 0]

loss_rides = df[df['profit_percentage'] < 0]

profitable_count = len(profitable_rides)
loss_count = len(loss_rides)

fig = go.Figure(data=[
    go.Pie(
        labels=['Profitable Rides', 'Loss Rides'],
        values=[profitable_count, loss_count],
        hole=0.3,
        textinfo='label+percent',
        marker=dict(colors=['#00cc96', '#ff6692'])
    )
])

fig.update_layout(title='Distribution of Profitable vs Loss Rides')

In [17]:
fig = px.scatter(df, x='Expected_Ride_Duration', y='adjusted_ride_cost',title='Adjusted Ride Cost vs Expected Ride Duration',trendline='ols')
fig.show()

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
def data_preprocessing_pipeline(df):
    numeric_features = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_features = df.select_dtypes(include=['object']).columns

    # Handling missing values
    df[numeric_features] = df[numeric_features].fillna(df[numeric_features].mean())

    for feature in numeric_features:
        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[feature] = np.where(df[feature] < lower_bound | df[feature] > upper_bound, df[feature].mean(), df[feature])
    
    # missing values in categorical features
    df[categorical_features] = df[categorical_features].fillna(df[categorical_features].mode().iloc[0])

    return df

In [21]:
df['Vehicle_Type'] = df['Vehicle_Type'].map({
    'Premium': 1,
    'Economy' : 0
})

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

X = np.array(df[["Number_of_Riders", "Number_of_Drivers", "Vehicle_Type", "Expected_Ride_Duration"]])
y = np.array(df[["adjusted_ride_cost"]])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2025)

y_train = y_train.ravel()
y_test = y_test.ravel()

model = RandomForestRegressor(n_estimators=100, random_state=2025)
model.fit(X_train, y_train)

In [23]:
def get_vehicle_type_numeric(vehicle_type):
    vehicle_type_map = {
        'Premium': 1,
        'Economy': 0
    }
    return vehicle_type_map.get(vehicle_type)


def predict_price(number_of_riders, number_of_drivers, vehicle_type, expected_ride_duration):
    vehicle_type_numeric = get_vehicle_type_numeric(vehicle_type)
    if vehicle_type_numeric is None:
        raise ValueError("Invalid vehicle type. Use 'Premium' or 'Economy'.")
    
    input_data = np.array([[number_of_riders, number_of_drivers, vehicle_type_numeric, expected_ride_duration]])
    predicted_price = model.predict(input_data)
    return predicted_price[0]

In [24]:
user_number_of_riders = 50
user_number_of_drivers = 25
user_vehicle_type = "Economy"
Expected_Ride_Duration = 30
predicted_price = predict_price(user_number_of_riders, user_number_of_drivers, user_vehicle_type, Expected_Ride_Duration)
print("Predicted price:", predicted_price)

Predicted price: 258.62983334926486


In [25]:
y_pred = model.predict(X_test)

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=y_test,
    y=y_pred,
    mode='markers',
    name='Predicted vs Actual',
    marker=dict(color='blue', size=5)
))

fig.add_trace(go.Scatter(
    x=[min(y_test.flatten()), max(y_test.flatten())],
    y=[min(y_test.flatten()), max(y_test.flatten())],
    mode='lines',
    name='Ideal',
    line=dict(color='red', dash='dash')
))

fig.update_layout(
    title='Predicted vs Actual Ride Costs',
    xaxis_title='Actual Ride Cost',
    yaxis_title='Predicted Ride Cost',
    width=800,
    height=600,
    showlegend=True
)

fig.show()