In [1]:
import pandas as pd
import numpy as np

# Synthetic dataset used to exercise the regression workflow below
num_samples = 1000
np.random.seed(42)

# Draw every column from the seeded global RNG. The draw order
# (distances, congestion, prices) determines the generated values.
distances = np.random.uniform(low=1, high=30, size=num_samples)  # in kilometers
traffic_congestion = np.random.uniform(low=0, high=100, size=num_samples)  # congestion level
prices = np.random.uniform(low=5, high=50, size=num_samples)  # random target variable

# Assemble the three columns into one frame
data = pd.DataFrame(
    dict(
        distances=distances,
        traffic_congestion=traffic_congestion,
        prices=prices,  # Target variable
    )
)


In [2]:
import numpy as np
import networkx as nx

def calculate_priorities(queue_lengths, alpha):
    """Return one discounted sum of queue lengths per row of ``queue_lengths``.

    Column ``t`` is weighted by ``alpha ** (num_steps - t - 1)``, so the last
    column has weight 1 and each earlier column is discounted by one more
    factor of ``alpha``.

    Args:
        queue_lengths: 2-D array indexed as ``[zone, time_step]``.
        alpha: Discount factor applied once per step back from the last column.

    Returns:
        1-D float array with one entry per row of ``queue_lengths``.
    """
    step_count = queue_lengths.shape[1]
    weighted_sum = np.zeros(queue_lengths.shape[0])
    for step in range(step_count):
        steps_from_end = step_count - step - 1
        weighted_sum += alpha ** steps_from_end * queue_lengths[:, step]
    return weighted_sum

# Grid side length; the matrices below cover n * n zones
n = 10

# Fix the global RNG state so the draws below are repeatable
np.random.seed(0)

# Zone-to-zone travel times: random integers from 1 to 20 (minutes)
T = np.random.randint(low=1, high=21, size=(n ** 2, n ** 2))

# Queue length history per zone: random integers from 0 to 9,
# one row per zone (100 zones) and one column per time step (10 steps)
queue_lengths = np.random.randint(low=0, high=10, size=(n ** 2, 10))

# Discount factor used when weighting the queue history
alpha = 0.9

# Scaling factor applied to the priorities in the combined cost
lambda_value = 1

# Zero the diagonal of T in place (no self-loops)
np.fill_diagonal(T, 0)

# Per-zone priorities from the discounted queue history
priorities = calculate_priorities(queue_lengths, alpha)

# Combined Cost Matrix (C): the scaled priority of zone i is added to
# every entry in row i of T (the priorities are broadcast as a column)
C = T + lambda_value * priorities[:, np.newaxis]

# Function to calculate DIPR-adjusted travel times
def calculate_dipr_adjusted_travel_times(T, priorities, lambda_value=1):
    """Return all-pairs shortest-path costs over the priority-adjusted graph.

    The edge cost matrix is ``C = T + lambda_value * priorities[:, None]``,
    i.e. the scaled priority of zone ``i`` is added to every entry in row
    ``i`` of ``T``. ``C`` is loaded into a directed graph and the weighted
    shortest-path length is computed for every ordered pair of distinct nodes.

    Args:
        T: Square 2-D array of travel times, indexed ``[source, target]``.
        priorities: 1-D array with one priority per row of ``T``.
        lambda_value: Scaling factor applied to ``priorities``.

    Returns:
        Float array with the same shape as ``T``. Entry ``[s, t]`` is the
        shortest-path cost from ``s`` to ``t``; the diagonal is 0.

    Raises:
        networkx.NetworkXNoPath: If some target cannot be reached from some
            source.
    """
    C = T + lambda_value * priorities.reshape(-1, 1)
    # NOTE(review): networkx is expected to create edges only for non-zero
    # entries of C, so a zero cost means "no edge" - confirm if zeros can occur.
    G = nx.DiGraph(C)

    # Allocate floats explicitly: np.zeros_like(T) would inherit an integer
    # dtype from T and silently truncate the fractional path costs.
    travel_times = np.zeros(T.shape, dtype=float)

    for source in range(T.shape[0]):
        # One single-source query yields the distance to every reachable
        # target, instead of one shortest-path search per (source, target).
        lengths = nx.shortest_path_length(G, source=source, weight='weight')
        for target in range(T.shape[1]):
            if source == target:
                continue
            if target not in lengths:
                # Keep the exception the pairwise query would have raised.
                raise nx.NetworkXNoPath(f"No path between {source} and {target}.")
            travel_times[source, target] = lengths[target]
    return travel_times

# Calculate DIPR-adjusted travel times. lambda_value is passed explicitly so
# the configured scaling factor is the one actually used, instead of the
# function's default silently taking over if lambda_value is changed.
dipr_adjusted_travel_times = calculate_dipr_adjusted_travel_times(
    T, priorities, lambda_value=lambda_value
)


In [3]:
# Example integration of DIPR-adjusted travel times into the dataset.
# Every sample is assigned a random (source, target) zone pair. Sources are
# drawn first and targets second, matching the RNG draw order of the original
# zip(...) expression.
source_zones = np.random.randint(0, n * n, num_samples)
target_zones = np.random.randint(0, n * n, num_samples)

# Integer-array indexing picks one matrix entry per sample in a single
# vectorized lookup (no Python-level loop or int() casts needed).
data['dipr_adjusted_travel_time'] = dipr_adjusted_travel_times[source_zones, target_zones]


In [4]:
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from scipy.stats import uniform, randint

# One-hot encode categorical features (example, adjust as needed)
categorical_features = []
numerical_features = ['distances', 'traffic_congestion', 'dipr_adjusted_travel_time']

# Preprocessing pipeline: scale the numerical columns, one-hot encode the
# categorical ones (currently an empty list)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

# Fail early with an actionable message when the target column is absent,
# which happens when this cell runs against a stale or partially built `data`.
if 'prices' not in data.columns:
    raise KeyError(
        "['prices'] not found in `data`; re-run the data-generation cell so the "
        "target column exists before training."
    )

# Split data into training and testing sets
X = data.drop('prices', axis=1)
y = data['prices']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    # subsample_freq=1 enables bagging on every iteration; with LightGBM's
    # default of 0 the tuned `subsample` fraction would have no effect.
    "LightGBM": LGBMRegressor(random_state=42, subsample_freq=1),
    "CatBoost": CatBoostRegressor(random_state=42, verbose=0)
}

# Hyperparameter tuning using RandomizedSearchCV.
# The search runs over a Pipeline, so every key must be addressed through the
# step name ('regressor__<param>'); bare estimator parameter names are rejected
# by Pipeline.set_params as invalid parameters.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dists = {
    "LightGBM": {
        'regressor__n_estimators': randint(100, 500),
        'regressor__learning_rate': uniform(0.01, 0.3),
        'regressor__max_depth': randint(3, 20),
        'regressor__subsample': uniform(0.6, 0.4),
        'regressor__colsample_bytree': uniform(0.6, 0.4)
    },
    "CatBoost": {
        'regressor__iterations': randint(100, 500),
        'regressor__learning_rate': uniform(0.01, 0.3),
        'regressor__depth': randint(3, 10),
        'regressor__l2_leaf_reg': uniform(1, 10)
    }
}


def _report_metrics(label, estimator, features, targets):
    """Predict with ``estimator``, print MAE / RMSE / R^2 and return them.

    Args:
        label: Prefix used in the printed lines.
        estimator: Fitted object exposing ``predict``.
        features: Feature frame passed to ``predict``.
        targets: True target values compared against the predictions.

    Returns:
        Tuple ``(predictions, mae, rmse, r2)``.
    """
    predictions = estimator.predict(features)
    mae_value = mean_absolute_error(targets, predictions)
    rmse_value = np.sqrt(mean_squared_error(targets, predictions))
    r2_value = r2_score(targets, predictions)

    print(f'{label} - Mean Absolute Error: {mae_value:.2f}')
    print(f'{label} - Root Mean Squared Error: {rmse_value:.2f}')
    print(f'{label} - R^2 Score: {r2_value:.2f}')
    return predictions, mae_value, rmse_value, r2_value


best_models = {}
for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    random_search = RandomizedSearchCV(
        pipeline,
        param_dists[model_name],
        n_iter=100,
        cv=5,
        scoring='neg_mean_absolute_error',
        random_state=42,
        n_jobs=-1,
    )
    random_search.fit(X_train, y_train)
    best_models[model_name] = random_search.best_estimator_
    print(f'Best parameters for {model_name}: {random_search.best_params_}')

    # Evaluate the model on the held-out test split
    y_pred, mae, rmse, r2 = _report_metrics(
        model_name, random_search.best_estimator_, X_test, y_test
    )

    # Save the model to a file
    joblib.dump(random_search.best_estimator_, f'best_model_{model_name}.pkl')

# Load a model from the file and evaluate.
# joblib.load unpickles arbitrary objects: only load files written by this
# notebook, never files from an untrusted source.
loaded_model = joblib.load('best_model_LightGBM.pkl')
y_pred_loaded, mae_loaded, rmse_loaded, r2_loaded = _report_metrics(
    'Loaded Model (LightGBM)', loaded_model, X_test, y_test
)


KeyError: "['prices'] not found in axis"