In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA

In [3]:
delivery_data=pd.read_csv("/content/drive/MyDrive/cleaned_delivery_data.csv")

In [4]:
delivery_data.head()

Unnamed: 0,order_id,region_id,city,courier_id,longitude,latitude,aoi_id,aoi_type,accept_time,accept_gps_time,...,delivery_gps_lat,ds,delivery_hour,delivery_day_of_week,delivery_month,city_encoded,aoi_type_encoded,courier_id_encoded,package_volume,eta
0,2031782,10,Chongqing,73,108.71571,30.90228,50,14,1900-10-22 10:26:00,1900-10-22 10:26:00,...,30.96702,1022,17,0,10,0,14,73,30,398.0
1,4285071,10,Chongqing,3605,108.71639,30.90269,50,14,1900-09-07 10:13:00,1900-09-07 10:13:00,...,30.90266,907,15,6,9,0,14,3605,1,3211.0
2,4056800,10,Chongqing,3605,108.71645,30.90259,50,14,1900-06-26 09:49:00,1900-06-26 09:49:00,...,30.90251,626,16,2,6,0,14,3605,1,1814.0
3,3589481,10,Chongqing,3605,108.7165,30.90347,50,14,1900-09-11 11:01:00,1900-09-11 11:01:00,...,30.90341,911,17,3,9,0,14,3605,1,3253.0
4,2752329,10,Chongqing,3605,108.71608,30.90409,50,14,1900-10-01 09:52:00,1900-10-01 09:52:00,...,30.90397,1001,18,0,10,0,14,3605,1,518.0


In [5]:
delivery_data.dropna(inplace=True)

In [6]:
delivery_data.columns

Index(['order_id', 'region_id', 'city', 'courier_id', 'longitude', 'latitude',
       'aoi_id', 'aoi_type', 'accept_time', 'accept_gps_time',
       'accept_gps_lng', 'accept_gps_lat', 'delivery_time',
       'delivery_gps_time', 'delivery_gps_lng', 'delivery_gps_lat', 'ds',
       'delivery_hour', 'delivery_day_of_week', 'delivery_month',
       'city_encoded', 'aoi_type_encoded', 'courier_id_encoded',
       'package_volume', 'eta'],
      dtype='object')

In [7]:
delivery_data.drop(['order_id', 'courier_id','delivery_hour', 'delivery_day_of_week', 'delivery_month', 'city_encoded', 'aoi_type_encoded','courier_id_encoded'], axis=1, inplace=True)

In [8]:
delivery_data.columns

Index(['region_id', 'city', 'longitude', 'latitude', 'aoi_id', 'aoi_type',
       'accept_time', 'accept_gps_time', 'accept_gps_lng', 'accept_gps_lat',
       'delivery_time', 'delivery_gps_time', 'delivery_gps_lng',
       'delivery_gps_lat', 'ds', 'package_volume', 'eta'],
      dtype='object')

In [9]:
delivery_data['accept_time'] = pd.to_datetime(delivery_data['accept_time'], format='%Y-%m-%d %H:%M:%S')
delivery_data['accept_gps_time'] = pd.to_datetime(delivery_data['accept_gps_time'], format='%Y-%m-%d %H:%M:%S')
delivery_data['delivery_time'] = pd.to_datetime(delivery_data['delivery_time'], format='%Y-%m-%d %H:%M:%S')
delivery_data['delivery_gps_time'] = pd.to_datetime(delivery_data['delivery_gps_time'], format='%Y-%m-%d %H:%M:%S')

delivery_data['eta'] = pd.to_timedelta(delivery_data['eta']).dt.total_seconds()

In [10]:
delivery_data['accept_time_day'] = delivery_data['accept_time'].dt.day
delivery_data['accept_time_month'] = delivery_data['accept_time'].dt.month
delivery_data['accept_time_hour'] = delivery_data['accept_time'].dt.hour

delivery_data['accept_gps_time_day'] = delivery_data['accept_gps_time'].dt.day
delivery_data['accept_gps_time_month'] = delivery_data['accept_gps_time'].dt.month
delivery_data['accept_gps_time_hour'] = delivery_data['accept_gps_time'].dt.hour

delivery_data['delivery_time_day'] = delivery_data['delivery_time'].dt.day
delivery_data['delivery_time_month'] = delivery_data['delivery_time'].dt.month
delivery_data['delivery_time_hour'] = delivery_data['delivery_time'].dt.hour

delivery_data['delivery_gps_time_day'] = delivery_data['delivery_gps_time'].dt.day
delivery_data['delivery_gps_time_month'] = delivery_data['delivery_gps_time'].dt.month
delivery_data['delivery_gps_time_hour'] = delivery_data['delivery_gps_time'].dt.hour

delivery_data.drop(['accept_time', 'accept_gps_time', 'delivery_time', 'delivery_gps_time'], axis=1, inplace=True)

In [11]:
delivery_data.dtypes

Unnamed: 0,0
region_id,int64
city,object
longitude,float64
latitude,float64
aoi_id,int64
aoi_type,int64
accept_gps_lng,float64
accept_gps_lat,float64
delivery_gps_lng,float64
delivery_gps_lat,float64


In [12]:
for col in delivery_data.select_dtypes(include=['int64', 'int32']).columns:
    delivery_data[col] = np.log1p(delivery_data[col])

In [13]:
delivery_num_feat = [col for col in delivery_data.columns if delivery_data[col].dtype != 'object']

In [14]:
delivery_num_feat

['region_id',
 'longitude',
 'latitude',
 'aoi_id',
 'aoi_type',
 'accept_gps_lng',
 'accept_gps_lat',
 'delivery_gps_lng',
 'delivery_gps_lat',
 'ds',
 'package_volume',
 'eta',
 'accept_time_day',
 'accept_time_month',
 'accept_time_hour',
 'accept_gps_time_day',
 'accept_gps_time_month',
 'accept_gps_time_hour',
 'delivery_time_day',
 'delivery_time_month',
 'delivery_time_hour',
 'delivery_gps_time_day',
 'delivery_gps_time_month',
 'delivery_gps_time_hour']

In [15]:
scaler = StandardScaler()

# Apply transformation to the numerical features
delivery_data[delivery_num_feat] = scaler.fit_transform(delivery_data[delivery_num_feat])

In [16]:
delivery_data.dtypes

Unnamed: 0,0
region_id,float64
city,object
longitude,float64
latitude,float64
aoi_id,float64
aoi_type,float64
accept_gps_lng,float64
accept_gps_lat,float64
delivery_gps_lng,float64
delivery_gps_lat,float64


In [17]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
delivery_data['city'] = le.fit_transform(delivery_data['city'])


In [18]:
from sklearn.model_selection import train_test_split
X = delivery_data.drop(['eta'], axis=1)
y= delivery_data['eta']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# ----------- Lasso Regression -----------
print("----- Lasso Regression -----")
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train, y_train)
y_lasso_pred = lasso_model.predict(X_test)

lasso_mse = mean_squared_error(y_test, y_lasso_pred)
lasso_r2 = r2_score(y_test, y_lasso_pred)
lasso_cv = cross_val_score(lasso_model, X_train, y_train, cv=5)

print("Mean Squared Error:", lasso_mse)
print("R2 Score:", lasso_r2)
print("Cross Validation Score:", lasso_cv)
print("Average CV Score:", lasso_cv.mean())

# ----------- Random Forest Regressor -----------
print("\n----- Random Forest Regressor -----")
rf_model = RandomForestRegressor(n_estimators=10, random_state=42)
rf_model.fit(X_train, y_train)
y_rf_pred = rf_model.predict(X_test)

rf_mse = mean_squared_error(y_test, y_rf_pred)
rf_r2 = r2_score(y_test, y_rf_pred)
rf_cv = cross_val_score(rf_model, X_train, y_train, cv=5)

print("Mean Squared Error:", rf_mse)
print("R2 Score:", rf_r2)
print("Cross Validation Score:", rf_cv)
print("Average CV Score:", rf_cv.mean())


----- Lasso Regression -----
Mean Squared Error: 1.0722846043570728
R2 Score: -6.754999057889677e-07
Cross Validation Score: [-5.63230278e-07 -5.08421645e-07 -2.87456798e-06 -3.78104442e-07
 -1.69183561e-06]
Average CV Score: -1.2032319900878718e-06

----- Random Forest Regressor -----
