In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

In [2]:
# Load the intermediate dataset produced by an earlier processing step.
# NOTE(review): the frame comes back with an "Unnamed: 0" column, which looks
# like a row index that was written to the CSV earlier and is now read as a
# regular numeric column — confirm whether it should be dropped or loaded
# with index_col=0.
df = pd.read_csv("../data/processed/processed_1.csv")
# Show two randomly chosen rows as a quick sanity check (no seed is set, so
# the rows differ between runs).
df.sample(2)

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride,Adjusted_Cost,Profit,Profit_Percentage
10,86,17,Urban,Regular,99,4.69,Morning,Economy,167,669.298626,906.341748,237.043122,35.416645
456,85,37,Rural,Gold,10,3.76,Night,Premium,61,309.082545,293.696168,-15.386376,-4.97808


In [3]:
def preprocess_data(data, iqr_multiplier=1.5):
  """
  Preprocesses data for use in the dynamic pricing model.

  The work is done on a copy, so the DataFrame passed in is left untouched
  and calling the function again on the same input gives the same result.

  Steps, in order:
    1. Missing values in float/int columns are replaced by the column median.
    2. Values in those columns that fall outside
       [Q1 - k * IQR, Q3 + k * IQR] are clipped to that range, where
       k is ``iqr_multiplier``.
    3. Missing values in object-dtype columns are replaced by the first row
       of ``DataFrame.mode()`` (the most frequent value per column).

  Every float/int column is treated the same way, including identifier-like
  columns, so drop those beforehand if they should not be clipped. The
  clipping bounds are floats, so an integer column that actually gets clipped
  may come back as float.

  Args:
      data: A pandas DataFrame containing the ride data
      iqr_multiplier: How many IQRs beyond the quartiles a value may lie
          before it is clipped. Defaults to 1.5.

  Returns:
      A new, preprocessed pandas DataFrame with the same columns as ``data``
  """
  # Copy first: the steps below assign columns, which would otherwise modify
  # the caller's DataFrame as a side effect.
  result = data.copy()

  # Identify numeric and categorical features
  numeric_features = result.select_dtypes(include=['float', 'int']).columns
  categorical_features = result.select_dtypes(include=['object']).columns

  if len(numeric_features) > 0:
    # Handle missing values in numeric features
    numeric_medians = result[numeric_features].median()
    result[numeric_features] = result[numeric_features].fillna(numeric_medians)

    # Handle outliers in numeric features. Done column by column so that
    # columns without outliers keep their original dtype.
    for feature in numeric_features:
      q1 = result[feature].quantile(0.25)
      q3 = result[feature].quantile(0.75)
      iqr = q3 - q1
      lower_bound = q1 - (iqr_multiplier * iqr)
      upper_bound = q3 + (iqr_multiplier * iqr)
      result[feature] = result[feature].clip(lower=lower_bound, upper=upper_bound)

  if len(categorical_features) > 0:
    # Handle missing values in categorical features (consider alternative methods)
    modes = result[categorical_features].mode()
    # mode() has no rows when every categorical column is entirely missing;
    # in that case there is nothing to fill with, so the columns are left as is.
    if not modes.empty:
      result[categorical_features] = result[categorical_features].fillna(modes.iloc[0])

  return result

In [5]:
# Run the cleaning steps (median/mode imputation and IQR clipping) on the
# loaded frame.
processed_df = preprocess_data(df)
# Preview the first rows of the result to eyeball the imputed/clipped values.
processed_df.head(20)

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride,Adjusted_Cost,Profit,Profit_Percentage
0,90,45.0,Urban,Silver,13,4.47,Night,Premium,90,284.257273,270.971017,-13.286256,-4.674025
1,58,39.0,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753,157.709039,-16.165714,-9.297332
2,42,31.0,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469,327.065489,-2.72998,-0.82778
3,89,28.0,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232,516.434717,46.233485,9.832702
4,78,22.0,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422,702.202113,122.52069,21.135866
5,59,35.0,Urban,Silver,83,3.51,Night,Economy,128,339.955361,323.592044,-16.363316,-4.813372
6,93,43.0,Suburban,Regular,44,4.41,Afternoon,Premium,16,104.061541,98.984861,-5.07668,-4.878536
7,62,39.0,Rural,Gold,83,3.59,Afternoon,Premium,47,235.811864,214.874302,-20.937562,-8.878926
8,79,14.0,Rural,Silver,71,3.74,Evening,Economy,128,501.412517,722.839391,221.426873,44.160619
9,42,6.0,Rural,Silver,21,3.85,Night,Premium,128,398.993365,693.838273,294.844908,73.897196


In [7]:
# Persist the cleaned dataset. index=False keeps the DataFrame's row index
# out of the file.
# NOTE(review): the "Unnamed: 0" column seen in the preview is a regular
# column, so it is still written — confirm downstream consumers expect it.
processed_df.to_csv("../data/processed/processed_final.csv", index=False)