In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

# Load the delivery records; the file has a .txt extension but is parsed as CSV.
data = pd.read_csv("deliverytime.txt")
# Preview the first rows to check that the columns were parsed as expected.
data.head()

In [None]:
# Summarise each column's dtype and non-null count.
data.info()

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Radius of the earth (in kilometers) used by the haversine formula below
R = 6371


def deg_to_rad(degrees):
    """Convert an angle from degrees to radians.

    Works element-wise on scalars, NumPy arrays and pandas Series.
    """
    return degrees * (np.pi/180)


def distcalculate(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two points.

    Uses the haversine formula with the module-level radius ``R``.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.

    All arguments may be scalars, NumPy arrays or pandas Series; array-like
    inputs are processed element-wise, so whole columns can be passed at once.

    Returns
    -------
    Distance(s) in kilometers, with the same shape as the broadcast inputs.
    """
    d_lat = deg_to_rad(lat2 - lat1)
    d_lon = deg_to_rad(lon2 - lon1)
    a = (np.sin(d_lat / 2) ** 2
         + np.cos(deg_to_rad(lat1)) * np.cos(deg_to_rad(lat2)) * np.sin(d_lon / 2) ** 2)
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c
  
# Calculate the restaurant-to-delivery distance for every order in a single
# vectorized call. distcalculate is built from NumPy ufuncs, so it operates
# element-wise on whole columns; this avoids a slow per-row .loc loop and does
# not rely on the index labels being exactly 0..len(data)-1.
data['Distance in km'] = distcalculate(
    data['Restaurant_latitude'],
    data['Restaurant_longitude'],
    data['Delivery_location_latitude'],
    data['Delivery_location_longitude'],
)

In [None]:
# Preview the frame again, now including the 'Distance in km' column.
data.head()

In [None]:
# Delivery time against travelled distance with an OLS trend line;
# marker size also encodes the delivery time.
figure = px.scatter(
    data,
    x="Distance in km",
    y="Time_taken(min)",
    size="Time_taken(min)",
    trendline="ols",
    title="Relationship Between Distance and Time Taken",
)
figure.show()

In [None]:
# Delivery time against the delivery person's age with an OLS trend line;
# marker size encodes the delivery time and colour encodes the distance.
figure = px.scatter(
    data,
    x="Delivery_person_Age",
    y="Time_taken(min)",
    size="Time_taken(min)",
    color="Distance in km",
    trendline="ols",
    title="Relationship Between Time Taken and Age",
)
figure.show()

In [None]:
# Delivery time against the delivery person's ratings with an OLS trend line;
# marker size encodes the delivery time and colour encodes the distance.
figure = px.scatter(
    data,
    x="Delivery_person_Ratings",
    y="Time_taken(min)",
    size="Time_taken(min)",
    color="Distance in km",
    trendline="ols",
    title="Relationship Between Time Taken and Ratings",
)
figure.show()

In [None]:
# Distribution of delivery time per vehicle type, split by order type.
fig = px.box(
    data_frame=data,
    x="Type_of_vehicle",
    y="Time_taken(min)",
    color="Type_of_order",
)
fig.show()

In [None]:
# One-hot encode the categorical features.
from sklearn.preprocessing import OneHotEncoder

categorical_columns = ['Type_of_order', 'Type_of_vehicle']
print(categorical_columns)

# sparse_output=False makes fit_transform return a dense array that can be
# wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(data[categorical_columns])

# Column names come from get_feature_names_out(). The frame reuses data's index
# so that the column-wise concat below pairs every encoded row with its source
# row; with a fresh RangeIndex the concat would misalign (and introduce NaN
# rows) whenever data's index is not exactly 0..len(data)-1.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=data.index,
)

# Append the encoded columns and drop the original categorical columns.
data_encoded = pd.concat([data, one_hot_df], axis=1).drop(columns=categorical_columns)

# Display the resulting dataframe
data_encoded.head()

In [None]:
# Remove the identifier columns and the restaurant coordinates from the encoded frame.
# NOTE(review): the Delivery_location_* coordinate columns are left in place
# and therefore end up among the model features — confirm that is intended.
columns_removed = [
    'ID',
    'Delivery_person_ID',
    'Restaurant_latitude',
    'Restaurant_longitude',
]
data_encoded = data_encoded.drop(columns=columns_removed)

In [None]:
# Target: the delivery time in minutes.
data_y = data_encoded['Time_taken(min)']
# The feature frame starts out as the full encoded frame (same object, not a
# copy); the target column is removed from it in a later cell.
data_x = data_encoded
data_y.head()

In [None]:
# Build the feature frame from data_encoded rather than from data_x itself so
# the cell can be re-run safely: dropping from data_x raises KeyError on a
# second run because the target column is already gone.
data_x = data_encoded.drop(columns=['Time_taken(min)'])
# Preview only the first rows instead of rendering the whole frame.
data_x.head()

In [None]:
# Train a Ridge regression and a random forest on the same split and compare them.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge  # ridge regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


def _regression_scores(actual, predicted):
    """Return (MAE, MSE, RMSE, R²) for `predicted` measured against `actual`."""
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    return mae, mse, np.sqrt(mse), r2_score(actual, predicted)


# Features are every column of data_x; the target is the delivery time.
# 90% of the rows are used for training and 10% are held out for testing;
# the fixed random_state keeps the split reproducible.
x = np.array(data_x)
y = np.array(data_y)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)

# Ridge regression with scikit-learn's default settings.
ridge_model = Ridge()
ridge_model.fit(xtrain, ytrain)
ridge_predictions = ridge_model.predict(xtest)

# Random forest with a fixed seed so repeated runs give the same model.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(xtrain, ytrain.ravel())  # ravel() guarantees a 1-D target array
rf_predictions = rf_model.predict(xtest)

# Score both models on the held-out rows.
ridge_mae, ridge_mse, ridge_rmse, ridge_r2 = _regression_scores(ytest, ridge_predictions)
rf_mae, rf_mse, rf_rmse, rf_r2 = _regression_scores(ytest, rf_predictions)

# Print evaluation results
print(f"Ridge Regression - MAE: {ridge_mae}, MSE: {ridge_mse}, RMSE: {ridge_rmse}, R²: {ridge_r2}")
print(f"Random Forest Regression - MAE: {rf_mae}, MSE: {rf_mse}, RMSE: {rf_rmse}, R²: {rf_r2}")


In [None]:
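# NOTE: disabled interactive demo. It builds a 3-value feature row (age,
# rating, distance), but ridge_model is fitted on every column of data_x
# (including the delivery coordinates and the one-hot encoded columns), so
# the predict call below would fail with a feature-count mismatch until the
# input row is extended to match data_x.
# print("Food Delivery Time Prediction")
# a = int(input("Age of Delivery Partner: "))
# b = float(input("Ratings of Previous Deliveries: "))
# c = int(input("Total Distance: "))

# features = np.array([[a, b, c]])
# print("Predicted Delivery Time in Minutes = ", ridge_model.predict(features))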