In [None]:
# This would be the main notebook that would be run to execute the entire project

## Linear Regression:

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pandas as pd

# Read the CSV file
data = pd.read_csv('../dataset/processedData/orders_weather_cluster_map_poi.csv')

# Split the data into independent and dependent variables
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create a Linear Regression model and fit the data
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict the values for the test set
y_pred = lr.predict(X_test)

# Compute the R-squared score for the Linear Regression model
accuracy = r2_score(y_test, y_pred)
print("Accuracy of Linear Regression model:", accuracy)


Accuracy of Linear Regression model: 0.14943202276948986


## MeanAbsoluteError

In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Read the CSV file
data = pd.read_csv('../dataset/processedData/orders_weather_cluster_map_poi.csv')

# Extract the relevant columns
data = data[['region_id', 'time_slot', 'order_gap']]

# Group by region and time slot, and calculate the mean actual and predicted gaps
actual_predicted_gaps = data.groupby(['region_id', 'time_slot']).agg({'order_gap': ['mean']})
actual_predicted_gaps.columns = ['actual_gap']

# Train a Linear Regression model on the data
lr = LinearRegression()
X = data.groupby(['region_id', 'time_slot']).mean().reset_index().drop(['region_id', 'time_slot'], axis=1)
y = data.groupby(['region_id', 'time_slot']).agg({'order_gap': ['mean']})
y.columns = ['order_gap']
lr.fit(X, y)

# Predict the order gap for each region and time slot
X_pred = data.groupby(['region_id', 'time_slot']).mean().reset_index().drop(['region_id', 'time_slot'], axis=1)
y_pred = lr.predict(X_pred)
actual_predicted_gaps['predicted_gap'] = y_pred

# Calculate the mean absolute error
mae = np.mean(np.abs(actual_predicted_gaps['actual_gap'] - actual_predicted_gaps['predicted_gap']))

# Print the result
print('Mean Absolute Error:', mae)

Mean Absolute Error: 6.2761405127911265e-15


## Decision Tree Regression:

In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pandas as pd

# Read the CSV file
data = pd.read_csv('../dataset/processedData/orders_weather_cluster_map_poi.csv')
# Split the data into independent and dependent variables
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create a Decision Tree Regression model and fit the data
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)

# Predict the values for the test set
y_pred = dtr.predict(X_test)

# Compute the R-squared score for the Decision Tree Regression model
accuracy = r2_score(y_test, y_pred)
print("Accuracy of Decision Tree Regression model:", accuracy)

Accuracy of Decision Tree Regression model: 0.9998918457970413


## Random Forest Regression:

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pandas as pd

# Read the CSV file
data = pd.read_csv('../dataset/processedData/orders_weather_cluster_map_poi.csv')

# Split the data into independent and dependent variables
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create a Random Forest Regression model and fit the data
rfr = RandomForestRegressor(n_estimators=100)
rfr.fit(X_train, y_train)

# Predict the values for the test set
y_pred = rfr.predict(X_test)

# Compute the R-squared score for the Random Forest Regression model
accuracy = r2_score(y_test, y_pred)
print("Accuracy of Random Forest Regression model:", accuracy)

Accuracy of Random Forest Regression model: 0.999856859265141


## XGBoost Regression:

In [6]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pandas as pd

# Read the CSV file
data = pd.read_csv('../dataset/processedData/orders_weather_cluster_map_poi.csv')

# Split the data into independent and dependent variables
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create an XGBoost Regression model and fit the data
xgbr = XGBRegressor()
xgbr.fit(X_train, y_train)

# Predict the values for the test set
y_pred = xgbr.predict(X_test)


# Compute the R-squared score for the XGBoost Regression model
accuracy = r2_score(y_test, y_pred)
print("Accuracy of Random Forest Regression model:", accuracy)

Accuracy of Random Forest Regression model: 0.9997347402581984


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np

# Read the CSV file
data = pd.read_csv('../dataset/processedData/orders_weather_cluster_map_poi.csv')

# Split the data into independent and dependent variables
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create a Linear Regression model and fit the data
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict the values for the test set
y_pred = lr.predict(X_test)

# Calculate the mean absolute error
actual_gaps = y_test
predicted_gaps = y_pred
num_regions = len(data['region_id'].unique())
num_time_slots = len(data['time_slot'].unique())

total_mae = 0

for region in range(num_regions):
    for time_slot in range(num_time_slots):
        actual_gap = actual_gaps[(data['region_id'] == region) & (data['time_slot'] == time_slot)]
        predicted_gap = predicted_gaps[(data['region_id'] == region) & (data['time_slot'] == time_slot)]
        abs_diff = abs(actual_gap - predicted_gap)
        total_mae += abs_diff

mae = total_mae / (num_regions * num_time_slots)

print('Mean Absolute Error:', mae)


IndexError: boolean index did not match indexed array along dimension 0; dimension is 8926 but corresponding boolean dimension is 44629

Mean Absolute Error: 6.2761405127911265e-15
