In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA

In [6]:
pickup_data=pd.read_csv("./cleaned_pickup_data.csv")


In [3]:
pickup_data.dropna(inplace=True)

In [4]:
pickup_data.drop(['order_id', 'courier_id','pickup_month', 'city_encoded', 'aoi_type_encoded','courier_id_encoded'], axis=1, inplace=True)

In [7]:
pickup_data['accept_time'] = pd.to_datetime(pickup_data['accept_time'], format='%Y-%m-%d %H:%M:%S')
pickup_data['accept_gps_time'] = pd.to_datetime(pickup_data['accept_gps_time'], format='%Y-%m-%d %H:%M:%S')
pickup_data['pickup_time'] = pd.to_datetime(pickup_data['pickup_time'], format='%Y-%m-%d %H:%M:%S')
pickup_data['pickup_gps_time'] = pd.to_datetime(pickup_data['pickup_gps_time'], format='%Y-%m-%d %H:%M:%S')

pickup_data['eta'] = pd.to_timedelta(pickup_data['eta']).dt.total_seconds()

In [8]:
pickup_data['time_window_start'] = pd.to_datetime(pickup_data['time_window_start'], format='%Y-%m-%d %H:%M:%S')
pickup_data['time_window_end'] = pd.to_datetime(pickup_data['time_window_end'], format='%Y-%m-%d %H:%M:%S')

In [9]:
pickup_data['accept_time_day'] = pickup_data['accept_time'].dt.day
pickup_data['accept_time_month'] = pickup_data['accept_time'].dt.month
pickup_data['accept_time_hour'] = pickup_data['accept_time'].dt.hour

pickup_data['accept_gps_time_day'] = pickup_data['accept_gps_time'].dt.day
pickup_data['accept_gps_time_month'] = pickup_data['accept_gps_time'].dt.month
pickup_data['accept_gps_time_hour'] = pickup_data['accept_gps_time'].dt.hour

pickup_data['pickup_time_day'] = pickup_data['pickup_time'].dt.day
pickup_data['pickup_time_month'] = pickup_data['pickup_time'].dt.month
pickup_data['pickup_time_hour'] = pickup_data['pickup_time'].dt.hour

pickup_data['pickup_gps_time_day'] = pickup_data['pickup_gps_time'].dt.day
pickup_data['pickup_gps_time_month'] = pickup_data['pickup_gps_time'].dt.month
pickup_data['pickup_gps_time_hour'] = pickup_data['pickup_gps_time'].dt.hour

pickup_data['time_window_start_day'] = pickup_data['time_window_start'].dt.day
pickup_data['time_window_start_month'] = pickup_data['time_window_start'].dt.month
pickup_data['time_window_start_hour'] = pickup_data['time_window_start'].dt.hour

pickup_data['time_window_end_day'] = pickup_data['time_window_end'].dt.day
pickup_data['time_window_end_month'] = pickup_data['time_window_end'].dt.month
pickup_data['time_window_end_hour'] = pickup_data['time_window_end'].dt.hour

In [10]:
for col in pickup_data.select_dtypes(include=['int64', 'int32']).columns:
    pickup_data[col] = np.log1p(pickup_data[col])

In [11]:
len(pickup_data)
pickup_data.drop(['accept_time', 'accept_gps_time', 'pickup_time', 'pickup_gps_time', 'time_window_start', 'time_window_end'], axis=1, inplace=True)


In [12]:
pickup_num_feat = [col for col in pickup_data.columns if pickup_data[col].dtype != 'object']

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
scaler = StandardScaler()

# Apply transformation to the numerical features
pickup_data[pickup_num_feat] = scaler.fit_transform(pickup_data[pickup_num_feat])

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
pickup_data['city'] = le.fit_transform(pickup_data['city'])

In [15]:
from sklearn.model_selection import train_test_split
X = pickup_data.drop(['eta'], axis=1)
y= pickup_data['eta']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# ----------- Lasso Regression -----------
print("----- Lasso Regression -----")
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train, y_train)
y_lasso_pred = lasso_model.predict(X_test)

lasso_mse = mean_squared_error(y_test, y_lasso_pred)
lasso_r2 = r2_score(y_test, y_lasso_pred)
lasso_cv = cross_val_score(lasso_model, X_train, y_train, cv=5)

print("Mean Squared Error:", lasso_mse)
print("R2 Score:", lasso_r2)
print("Cross Validation Score:", lasso_cv)
print("Average CV Score:", lasso_cv.mean())

# ----------- Random Forest Regressor -----------
print("\n----- Random Forest Regressor -----")
rf_model = RandomForestRegressor(n_estimators=10, random_state=42,n_jobs=-1)
rf_model.fit(X_train, y_train)
y_rf_pred = rf_model.predict(X_test)

rf_mse = mean_squared_error(y_test, y_rf_pred)
rf_r2 = r2_score(y_test, y_rf_pred)
rf_cv = cross_val_score(rf_model, X_train, y_train, cv=5)

print("Mean Squared Error:", rf_mse)
print("R2 Score:", rf_r2)
print("Cross Validation Score:", rf_cv)
print("Average CV Score:", rf_cv.mean())


----- Lasso Regression -----
Mean Squared Error: 0.9890172736953925
R2 Score: -3.5175142021337535e-07
Cross Validation Score: [-1.87917609e-06 -1.91144350e-06 -3.22708408e-07 -4.06879882e-07
 -2.26874430e-09]
Average CV Score: -9.044953249492949e-07

----- Random Forest Regressor -----
