In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import joblib
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw Uber trip dataset from disk into a DataFrame.
csv_path = "F:/projects/personal project/uber trip analysis/data/uber_data.csv"
data = pd.read_csv(csv_path)


In [3]:
# Keep only rows with a known 'price' (the regression target). The explicit
# .copy() makes the filtered frame independent of the one read from disk, so
# the column assignments below cannot trigger pandas' SettingWithCopyWarning.
data = data.dropna(subset=['price']).copy()

# Label-encode every object-dtype column in place and persist one fitted
# encoder per column so the same string -> integer mapping can be reloaded.
label_encoders = {}
for col in data.select_dtypes(include='object').columns:
    le = LabelEncoder()
    # Cast to str first so the encoder always sees a single value type.
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le
    path = f'F:/projects/personal project/uber trip analysis/encoders/{col}_encoder.pkl'
    joblib.dump(le, path)
    print(f"Saved encoder for column: {col} at {path}")

# Binary surge label: 1 when surge_multiplier is above 1, otherwise 0.
data['is_surge'] = (data['surge_multiplier'] > 1).astype(int)

# Shared feature set for all models: every column except the row id and the
# target-related columns listed below.
# NOTE(review): time-related columns (e.g. the label-encoded 'datetime') are
# NOT removed here, although the previous comment claimed they were -- confirm
# whether they should stay in the feature set.
feature_cols = data.drop(columns=['id', 'price', 'cab_type', 'surge_multiplier', 'is_surge']).columns


Saved encoder for column: id at F:/projects/personal project/uber trip analysis/encoders/id_encoder.pkl
Saved encoder for column: datetime at F:/projects/personal project/uber trip analysis/encoders/datetime_encoder.pkl
Saved encoder for column: timezone at F:/projects/personal project/uber trip analysis/encoders/timezone_encoder.pkl
Saved encoder for column: source at F:/projects/personal project/uber trip analysis/encoders/source_encoder.pkl
Saved encoder for column: destination at F:/projects/personal project/uber trip analysis/encoders/destination_encoder.pkl
Saved encoder for column: cab_type at F:/projects/personal project/uber trip analysis/encoders/cab_type_encoder.pkl
Saved encoder for column: product_id at F:/projects/personal project/uber trip analysis/encoders/product_id_encoder.pkl
Saved encoder for column: name at F:/projects/personal project/uber trip analysis/encoders/name_encoder.pkl
Saved encoder for column: short_summary at F:/projects/personal project/uber trip anal

In [4]:
# Feature matrix shared by all three models, plus one target per task.
X = data[feature_cols]
y_price, y_cabtype, y_surge = data['price'], data['cab_type'], data['is_surge']

# Persist the feature column names so they can be reloaded alongside the models.
joblib.dump(feature_cols, 'F:/projects/personal project/uber trip analysis/model/feature_names.pkl')


def _holdout_split(target):
    """Split X and `target` 80/20 with a fixed seed; returns X_train, X_test, y_train, y_test."""
    return train_test_split(X, target, test_size=0.2, random_state=42)


# One train/test split per task (same seed, so the row partition is identical).
X_train_price, X_test_price, y_train_price, y_test_price = _holdout_split(y_price)
X_train_cab, X_test_cab, y_train_cab, y_test_cab = _holdout_split(y_cabtype)
X_train_surge, X_test_surge, y_train_surge, y_test_surge = _holdout_split(y_surge)

In [5]:
# 1. Price prediction: random-forest regressor fitted on the training split.
reg_model = RandomForestRegressor(random_state=42).fit(X_train_price, y_train_price)

# Evaluate on the held-out split using mean squared error.
price_preds = reg_model.predict(X_test_price)
price_mse = mean_squared_error(y_true=y_test_price, y_pred=price_preds)

# Persist the fitted regressor.
joblib.dump(reg_model, 'F:/projects/personal project/uber trip analysis/model/price_model.pkl')

['F:/projects/personal project/uber trip analysis/model/price_model.pkl']

In [6]:
# 2. Cab type classification: random-forest classifier fitted on the training split.
cab_model = RandomForestClassifier(random_state=42).fit(X_train_cab, y_train_cab)

# Evaluate on the held-out split using plain accuracy.
# NOTE(review): the recorded accuracy is 1.0; features such as 'product_id'
# and 'name' are kept in X and presumably identify the cab type directly
# (target leakage) -- confirm before relying on this model.
cab_preds = cab_model.predict(X_test_cab)
cab_acc = accuracy_score(y_true=y_test_cab, y_pred=cab_preds)

# Persist the fitted classifier.
joblib.dump(cab_model, 'F:/projects/personal project/uber trip analysis/model/cabtype_model.pkl')


['F:/projects/personal project/uber trip analysis/model/cabtype_model.pkl']

In [7]:
# 3. Surge detection: random-forest classifier on the binary is_surge label.
surge_model = RandomForestClassifier(random_state=42).fit(X_train_surge, y_train_surge)

# Evaluate on the held-out split using plain accuracy.
surge_preds = surge_model.predict(X_test_surge)
surge_acc = accuracy_score(y_true=y_test_surge, y_pred=surge_preds)

# Persist the fitted classifier.
joblib.dump(surge_model, 'F:/projects/personal project/uber trip analysis/model/surge_model.pkl')


['F:/projects/personal project/uber trip analysis/model/surge_model.pkl']

In [23]:
# Held-out metrics: all three values were computed on the *test* split
# (predictions on X_test_* compared against y_test_*), not on training data,
# so they are labelled as test metrics here.
print(f"test acc for surge_model- { surge_acc}")
print(f"test acc for cab_model- {cab_acc}")
print(f"test mse for reg_model- {price_mse}")

training acc for surge_model0.9726793943383805
training acc for cab_model1.0
training acc for reg_model7.454907747687992


In [9]:
# Columns that get a one-hot encoding.
categorical_cols = ['source', 'destination', 'name', 'short_summary', 'long_summary', 'icon']

# Build and fit the encoder in one step (fit returns the encoder itself).
# Categories unseen at fit time are ignored at transform time, and the output
# is a dense array.
# NOTE(review): by this point the object columns of `data` have already been
# replaced with LabelEncoder integer codes, so this encoder presumably learns
# integer categories rather than the original strings -- confirm that callers
# pass label-encoded values, or fit on the raw columns instead.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(data[categorical_cols])

# Persist the fitted encoder next to the models.
joblib.dump(encoder, "F:/projects/personal project/uber trip analysis/model/encoder.pkl")

print("✅ Encoder saved as encoder.pkl")


✅ Encoder saved as encoder.pkl


In [21]:
# Display the price model's mean squared error, computed earlier on the test split.
price_mse

7.454907747687992