##### Working Directory

In [1]:
import os
os.getcwd()

'd:\\Python_MachineLearning_Proj\\Regression'

In [2]:
os.chdir(r"C:\Users\thars\Downloads")

##### Import Necessary libraries

In [3]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

data = pd.read_csv(r"C:\Users\thars\Downloads\dynamic_pricing.csv")
print(data.head())

   Number_of_Riders  Number_of_Drivers Location_Category  \
0                90                 45             Urban   
1                58                 39          Suburban   
2                42                 31             Rural   
3                89                 28             Rural   
4                78                 22             Rural   

  Customer_Loyalty_Status  Number_of_Past_Rides  Average_Ratings  \
0                  Silver                    13             4.47   
1                  Silver                    72             4.06   
2                  Silver                     0             3.99   
3                 Regular                    67             4.31   
4                 Regular                    74             3.77   

  Time_of_Booking Vehicle_Type  Expected_Ride_Duration  \
0           Night      Premium                      90   
1         Evening      Economy                      43   
2       Afternoon      Premium                      76  

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Number_of_Riders         1000 non-null   int64  
 1   Number_of_Drivers        1000 non-null   int64  
 2   Location_Category        1000 non-null   object 
 3   Customer_Loyalty_Status  1000 non-null   object 
 4   Number_of_Past_Rides     1000 non-null   int64  
 5   Average_Ratings          1000 non-null   float64
 6   Time_of_Booking          1000 non-null   object 
 7   Vehicle_Type             1000 non-null   object 
 8   Expected_Ride_Duration   1000 non-null   int64  
 9   Historical_Cost_of_Ride  1000 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 78.3+ KB


In [5]:
data.shape

(1000, 10)

In [6]:
data.isnull().sum()

Number_of_Riders           0
Number_of_Drivers          0
Location_Category          0
Customer_Loyalty_Status    0
Number_of_Past_Rides       0
Average_Ratings            0
Time_of_Booking            0
Vehicle_Type               0
Expected_Ride_Duration     0
Historical_Cost_of_Ride    0
dtype: int64

##### Exploratery Data Analysis

In [7]:
print(data.describe())

       Number_of_Riders  Number_of_Drivers  Number_of_Past_Rides  \
count       1000.000000        1000.000000           1000.000000   
mean          60.372000          27.076000             50.031000   
std           23.701506          19.068346             29.313774   
min           20.000000           5.000000              0.000000   
25%           40.000000          11.000000             25.000000   
50%           60.000000          22.000000             51.000000   
75%           81.000000          38.000000             75.000000   
max          100.000000          89.000000            100.000000   

       Average_Ratings  Expected_Ride_Duration  Historical_Cost_of_Ride  
count      1000.000000              1000.00000              1000.000000  
mean          4.257220                99.58800               372.502623  
std           0.435781                49.16545               187.158756  
min           3.500000                10.00000                25.993449  
25%           3.8

In [8]:
data.columns

Index(['Number_of_Riders', 'Number_of_Drivers', 'Location_Category',
       'Customer_Loyalty_Status', 'Number_of_Past_Rides', 'Average_Ratings',
       'Time_of_Booking', 'Vehicle_Type', 'Expected_Ride_Duration',
       'Historical_Cost_of_Ride'],
      dtype='object')

In [9]:
# Let's have a look at the relationship between "Expected_Ride_Duration" and "Historical_Cost_of_Ride"
fig = px.scatter(data, x="Expected_Ride_Duration",
                 y="Historical_Cost_of_Ride",
                 title="Relationship Between Expected Ride duration Vs Historical Cost of Ride", 
                 trendline="ols")
fig.update_traces(marker=dict(color="turquoise", size=5), line=dict(color="purple"))
fig.show()

In [10]:
# Let's have a look at the Distribution Historical cost of the ride Based on the Vehicle type
fig = px.box(data, x="Vehicle_Type",
             y="Historical_Cost_of_Ride",
             title="'Historical Cost of Ride Distribution by Vehicle Type'")
fig.update_traces(line=dict(color="skyblue"))
fig.show()

In [11]:
# Let's have look at the Correlatin Matrix
numerical_col = data.select_dtypes(include="number")
corr_matrix = numerical_col.corr()

fig = go.Figure(data=go.Heatmap(z=corr_matrix.values,
                                x=corr_matrix.columns,
                                y=corr_matrix.columns,
                                colorscale="Viridis"))
fig.update_layout(title="Correlation Matrix")
fig.show()

##### Implementing a Dynamic Pricing Stategy

In [12]:
#It will capture high-demand periods and low-supply scenarios to increase prices, 
#while low-demand periods and high-supply situations will lead to price reductions.
import numpy as np
#np.percentile(data['Number_of_Riders'], 75) for checking
high_demand_percentile = 75
low_demand_percentile = 25

#Calulate demand_multiplier based on low and high percentile
data["demand_multiplier"] = np.where(data["Number_of_Riders"] > np.percentile(data["Number_of_Riders"], high_demand_percentile),
                                     data["Number_of_Riders"] / np.percentile(data["Number_of_Riders"], high_demand_percentile),
                                     data["Number_of_Riders"] / np.percentile(data["Number_of_Riders"], low_demand_percentile))

#Calucate supply_multiplier based on low and high percentile
high_supply_multiplier = 75
low_supply_multiplier =25

data["supply_multiplier"] = np.where(data["Number_of_Drivers"] > np.percentile(data["Number_of_Drivers"], high_supply_multiplier),
                                     data["Number_of_Drivers"] / np.percentile(data["Number_of_Drivers"], high_supply_multiplier),
                                     data["Number_of_Drivers"] / np.percentile(data["Number_of_Drivers"], low_supply_multiplier))

# Define Price adjustment factors based on low and high demand/supply
demand_threshold_high = 1.2 #Higher demand threshold 
demand_threshold_lower = 0.8 #Lower demand threshold
supply_threshold_high = 0.8 #High supply threshold
supply_threshold_lower = 1.2 #Lower supply threshold

#Caluclate adjusted_ride_cost for dynamic pricing
data["adjusted_ride_cost"] = data["Historical_Cost_of_Ride"] * (
    np.maximum(data["demand_multiplier"], demand_threshold_lower) *
    np.maximum(data["supply_multiplier"], supply_threshold_high)
)

In [13]:
# Calculate the profit percentage for each ride
data["profit_percentage"] = ((data["adjusted_ride_cost"] - data["Historical_Cost_of_Ride"]) / data["Historical_Cost_of_Ride"]) * 100

# Identify the profit where percentage profit is positive
profitable_rides = data[data["profit_percentage"] > 0]

# Identity loss rides where profit percentage is negative 
loss_rides = data[data["profit_percentage"] < 0]

In [14]:
# Plotting Pie Chart
import plotly.graph_objects as go

# Calculate count of profitable and loss ride
profitable_count = len(profitable_rides)
loss_count = len(loss_rides)

# Create a donut chart to show the profitable and lost rides
labels = ["Profitable Rides", "Loss Rides"]
values = [profitable_count, loss_count]
colors = ["purple", "red"]

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.4, marker=dict(colors=colors))])
fig.update_layout(title="Probability of rides (Dynimic Pricing Vs Historical Pricing)")
fig.show()

In [15]:
# Let's have a look at the relationship between expected ride duration and cost of the 
# ride based on the dynamics pricing strategy
fig = px.scatter(data,
                 x='Expected_Ride_Duration',
                 y="adjusted_ride_cost",
                 title="Expected Ride Duration Vs Adjusted Ride Duration",
                 trendline="ols")
fig.show()

##### Training a Predictive Model

In [16]:
# Now we implemented a dynamics pricing strategy, let's train a Machine Learning model
# before training a model, let's prepocess the data
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def data_preprocessing_pipeline(data):
    #Identify numeric and categorical features
    numeric_features = data.select_dtypes(include=['float', 'int']).columns
    categorical_features = data.select_dtypes(include=['object']).columns

    #Handle missing values in numeric features
    data[numeric_features] = data[numeric_features].fillna(data[numeric_features].mean())

    #Detect and handle outliers in numeric features using IQR
    for feature in numeric_features:
        Q1 = data[feature].quantile(0.25)
        Q3 = data[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - (1.5 * IQR)
        upper_bound = Q3 + (1.5 * IQR)
        data[feature] = np.where((data[feature] < lower_bound) | (data[feature] > upper_bound),
                                 data[feature].mean(), data[feature])

    #Normalize numeric features
    #scaler = StandardScaler()
    #scaled_data = scaler.fit_transform(data[numeric_features])
    #data[numeric_features] = scaler.transform(data[numeric_features])

    #Handle missing values in categorical features
    data[categorical_features] = data[categorical_features].fillna(data[categorical_features].mode().iloc[0])

    return data

data = data_preprocessing_pipeline(data)
print(data.head())

   Number_of_Riders  Number_of_Drivers Location_Category  \
0              90.0               45.0             Urban   
1              58.0               39.0          Suburban   
2              42.0               31.0             Rural   
3              89.0               28.0             Rural   
4              78.0               22.0             Rural   

  Customer_Loyalty_Status  Number_of_Past_Rides  Average_Ratings  \
0                  Silver                  13.0             4.47   
1                  Silver                  72.0             4.06   
2                  Silver                   0.0             3.99   
3                 Regular                  67.0             4.31   
4                 Regular                  74.0             3.77   

  Time_of_Booking Vehicle_Type  Expected_Ride_Duration  \
0           Night      Premium                    90.0   
1         Evening      Economy                    43.0   
2       Afternoon      Premium                    76.0  

In [17]:
# Vehicle type is valueable factor Let's convert into numerical faetures before moving forward
data["Vehicle_Type"] = data["Vehicle_Type"].map({"Premium": 1, 
                                           "Economy": 0})

In [18]:
# Now split the data and train a machine learning model to predict the cost of ride
# Splitting Data
from sklearn.model_selection import train_test_split
x = np.array(data[["Number_of_Riders", "Number_of_Drivers", "Vehicle_Type", "Expected_Ride_Duration"]])
y = np.array(data[["adjusted_ride_cost"]])

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# Reshape y to 1D array
y_train = y_train.ravel()
y_test = y_test.ravel()

In [19]:
# Training a Random Forest Regression model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(x_train, y_train)

In [20]:
# Now let's test the machine learing model using some input values
def get_vehicle_type_numeric(vehicle_type):
    vehicle_type_mapping = {
        "Premium": 1,
        "Economy": 0
    }
    vehicle_type_numeric = vehicle_type_mapping.get(vehicle_type)
    return vehicle_type_numeric
  
# Predicting using user input values
def predict_price(number_of_riders, number_of_drivers, vehicle_type, Expected_Ride_Duration):
    vehicle_type_numeric = get_vehicle_type_numeric(vehicle_type)
    if vehicle_type_numeric is None:
        raise ValueError("Invalid vehicle type")
    
    input_data = np.array([[number_of_riders, number_of_drivers, vehicle_type_numeric, Expected_Ride_Duration]])
    predicted_price = model.predict(input_data)
    return predicted_price

# Example prediction using user input values
user_number_of_riders = 50
user_number_of_drivers = 25
user_vehicle_type = "Economy"
Expected_Ride_Duration = 30
predicted_price = predict_price(user_number_of_riders, user_number_of_drivers, user_vehicle_type, Expected_Ride_Duration)
print("Predicted price:", predicted_price)

Predicted price: [283.45054488]


In [21]:
# Here's Comparisions of the Actual and Predicted results
import plotly.graph_objects as go

# Predict on the test set
y_pred = model.predict(x_test)

# Create a scatter plot with actual vs predicted values
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=y_test.flatten(),
    y=y_pred,
    mode='markers',
    name='Actual vs Predicted'
))

# Add a line representing the ideal case
fig.add_trace(go.Scatter(
    x=[min(y_test.flatten()), max(y_test.flatten())],
    y=[min(y_test.flatten()), max(y_test.flatten())],
    mode='lines',
    name='Ideal',
    line=dict(color='red', dash='dash')
))

fig.update_layout(
    title='Actual vs Predicted Values',
    xaxis_title='Actual Values',
    yaxis_title='Predicted Values',
    showlegend=True,
)

fig.show()