In [15]:
# --- Standard library ---
import pickle
import warnings
from pathlib import Path

# --- Third-party: data handling and plotting ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# --- Third-party: scikit-learn estimators, preprocessing and evaluation ---
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Directory the cleaned CSV is read from and the pickled models are written to.
data_path = Path('./data')

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides scikit-learn convergence/deprecation warnings.
warnings.filterwarnings("ignore")

# Build and Test Regression Models

In [2]:
# Read the cleaned historical delivery data from the data directory
# and preview its first rows as the cell output.
df = pd.read_csv(data_path.joinpath('cleaned_historical_data.csv'))
df.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,total_delivery_duration_seconds,created_at_weekday,created_at_weekofyear,avg_item_price
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,2870.715556,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0,3779.0,4,6,860.25
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,2989.596156,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0,4024.0,1,7,1900.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,2989.596156,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0,1781.0,3,4,1900.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,2989.596156,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0,3075.0,1,6,1150.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,2989.596156,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0,2390.0,6,7,1300.0


### Full model

In [3]:
# The target, plus the columns that are kept out of the feature matrix:
# the two timestamp columns and the market_id / store_id columns.
target_column = 'total_delivery_duration_seconds'
non_feature_columns = [target_column, 'created_at', 'actual_delivery_time', 'market_id', 'store_id']

y = df[target_column]
X = df.drop(columns=non_feature_columns)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Report the shape of each of the four resulting pieces
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape: {part.shape}")

X_train shape: (138194, 15)
X_test shape: (59227, 15)
y_train shape: (138194,)
y_test shape: (59227,)


### Linear Regression

In [13]:
# Standardise the features, then fit an ordinary linear regression
linear_pipeline = make_pipeline(StandardScaler(), LinearRegression())

# The only setting searched is whether the regression fits an intercept term
param_grid_linear = {'linearregression__fit_intercept': [True, False]}

# 3-fold cross-validated grid search, scored by negative mean squared error
grid_search_linear = GridSearchCV(
    estimator=linear_pipeline,
    param_grid=param_grid_linear,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid_search_linear.fit(X_train, y_train)

# Pipeline refit with the best-scoring setting
best_linear_model = grid_search_linear.best_estimator_

# Persist the fitted pipeline next to the data
linear_model_path = data_path / 'linear_regression_model.pkl'
with linear_model_path.open('wb') as file:
    pickle.dump(best_linear_model, file)


In [14]:
# Predict on the test data with the tuned linear-regression pipeline.
# The results are stored under *_linear names (they were previously labelled
# *_ridge, which mislabelled them as coming from a Ridge model).
y_pred_linear = best_linear_model.predict(X_test)

# Calculate the RMSE (square root of the mean squared error) and R^2 score
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))
r2_linear = r2_score(y_test, y_pred_linear)

# NOTE(review): the recorded run of this cell reports a negative R^2, i.e. the
# model scores worse on the test split than a constant prediction would.
# Worth investigating (e.g. extreme target values) before comparing models.
print(f"Linear Regression - Root Mean Squared Error (RMSE): {rmse_linear}")
print(f"Linear Regression - R^2 Score: {r2_linear}")

Linear Regression - Root Mean Squared Error (RMSE): 2772.01944429843
Linear Regression - R^2 Score: -1.073800449804366


### Polynomial Regression

In [None]:
# Candidate degrees for the polynomial feature expansion
degrees = [2, 3, 4]

# Three-step pipeline: polynomial expansion -> standardisation -> Ridge regression
pipeline = make_pipeline(PolynomialFeatures(), StandardScaler(), Ridge())

# Search the expansion degree and the Ridge regularisation strength together
param_grid = dict(
    polynomialfeatures__degree=degrees,
    ridge__alpha=[0.1, 1.0, 10.0],
)

# 3-fold cross-validated grid search, scored by negative mean squared error
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid_search.fit(X_train, y_train)

# Keep the refit best pipeline and the settings that produced it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Persist the fitted pipeline next to the data
polynomial_model_path = data_path / 'polynomial_regression_model.pkl'
with polynomial_model_path.open('wb') as file:
    pickle.dump(best_model, file)


In [None]:
# Score the tuned polynomial pipeline on the held-out test split
y_pred_poly = best_model.predict(X_test)

# R^2 and RMSE (square root of the mean squared error)
r2_poly = r2_score(y_test, y_pred_poly)
rmse_poly = np.sqrt(mean_squared_error(y_test, y_pred_poly))

# Report the selected hyper-parameters followed by the two test metrics
print(
    f"Optimal Degree: {best_params['polynomialfeatures__degree']}",
    f"Optimal Alpha: {best_params['ridge__alpha']}",
    f"Polynomial Regression - Root Mean Squared Error (RMSE): {rmse_poly}",
    f"Polynomial Regression - R^2 Score: {r2_poly}",
    sep="\n",
)

Optimal Degree: 3
Optimal Alpha: 0.1
Polynomial Regression - Root Mean Squared Error (RMSE): 1836.7215226271348
Polynomial Regression - R^2 Score: 0.08953962361224188


### Random Forest Regressor

In [None]:
# Define the parameter grid for GridSearchCV.
# 'bootstrap' is restricted to True: with bootstrap=False every tree is built
# on the whole training set, and because max_features is left at its default
# (all features considered at each split) the trees would be near-identical,
# so those candidates doubled the search cost without adding a real ensemble.
# The key is kept so that best_params_ still reports the bootstrap setting.
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True]
}

# Initialize GridSearchCV with the Random Forest Regressor and parameter grid:
# 3-fold cross-validation, scored by negative MSE, using all available cores.
grid_search_rf = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid_rf,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Fit the model on the training data
grid_search_rf.fit(X_train, y_train)

# Get the best parameters and the forest refit with them
best_params_rf = grid_search_rf.best_params_
best_rf_model = grid_search_rf.best_estimator_

# Save the best Random Forest model
with open(data_path / 'best_random_forest_regression_model.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)

In [None]:
# Predict on the test data
y_pred_best_rf = best_rf_model.predict(X_test)

# Calculate the RMSE and R^2 score
rmse_best_rf = np.sqrt(mean_squared_error(y_test, y_pred_best_rf))
r2_best_rf = r2_score(y_test, y_pred_best_rf)

print(f"Best Parameters for Random Forest: {best_params_rf}")
print(f"Best Random Forest Regression - Root Mean Squared Error (RMSE): {rmse_best_rf}")
print(f"Best Random Forest Regression - R^2 Score: {r2_best_rf}")

### Lasso Model

In [None]:

# # Create a pipeline that scales the features and then applies Lasso regression
# lasso_pipeline = make_pipeline(StandardScaler(), Lasso())

# # Define the parameter grid for GridSearchCV
# param_grid_lasso = {
#     'lasso__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
#     'lasso__fit_intercept': [True, False]
# }

# # Initialize GridSearchCV with the Lasso pipeline and parameter grid
# grid_search_lasso = GridSearchCV(lasso_pipeline, param_grid_lasso, cv=5, scoring='neg_mean_squared_error')

# # Fit the model on the training data
# grid_search_lasso.fit(X_train, y_train)

# # Get the best model
# best_lasso_model = grid_search_lasso.best_estimator_

# # Save the Lasso Regression model
# with open(data_path / 'lasso_regression_model.pkl', 'wb') as file:
#     pickle.dump(best_lasso_model, file)


In [None]:
# # Predict on the test data
# y_pred_lasso = best_lasso_model.predict(X_test)

# # Calculate the RMSE and R^2 score
# rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
# r2_lasso = r2_score(y_test, y_pred_lasso)

# print(f"Lasso Regression - Root Mean Squared Error (RMSE): {rmse_lasso}")
# print(f"Lasso Regression - R^2 Score: {r2_lasso}")

### XGB Regressor

create a README
write out what the business problem is.
pick a metric : RMSE
figure out the target column
basic eda including summary stats
at least 5 plots
clean the data - get rid of nulls, address errors or conflicts in the data
univariate analysis
multivariate analysis
check for multicollinearity

Run these models:
1. linear regression
2. polynomial regression
3. random forest regressor

Feature importance

metric analysis for each model
pick the best model
summary of your results in plots and prose
"future directions"

1. what would you do to the modeling if you had more time
2. What else could DoorDash analyze to improve, based on your experience with this data?
3. minimum 30 commits to complete