# Capacitated Facility Location Problem

In [13]:
# Label for this run; presumably identifies the experiment (not referenced in the
# cells visible here — confirm against later cells).
test_name = "16 BCFLP"
# Fraction of distance-matrix rows kept by `distance.sample(frac=...)` when loading data.
sample_fraq = 0.1
# Seed passed as `random_state` to the sampling, the train/test split and the models.
randomSeed = 11


In [14]:
import os
import sys
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopy.distance

# Plotting modules
import plotly.express as px
import plotly.graph_objects as go
from mpl_toolkits.basemap import Basemap

# Linear programming modules
import pulp

# Machine learning modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# == Regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# == Classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# == Clustering
from sklearn.cluster import KMeans
from sklearn_som.som import SOM
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS

# == Neural Networks
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

# == Metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report


## Load the data

In [15]:
# Import the parameters and input tables, clean the sampled distance matrix,
# and report basic information about the loaded data.

data_path = "data/"
start_time = time.time()

# ====Parameters==== #
parameters = pd.read_csv(os.path.join(data_path, "parametros.csv"))
# Each value is read positionally: second column of the first row whose
# "parametro" matches the requested name.
# 1. food_per_person_per_day in tons per day
food_per_person_per_day = float(
    parameters.loc[
        parameters["parametro"] == "comida_por_persona_en_toneladas"
    ].iloc[0, 1]
)
# 2. transport_cost_per_ton_per_km in COP per ton per km
transport_cost_per_ton_per_km = float(
    parameters.loc[
        parameters["parametro"] == "costo_de_transporte_por_tonelada_por_kilomentro"
    ].iloc[0, 1]
)

print(
    f"parameters: \n    food_per_person_per_day: {food_per_person_per_day} \n    transport_cost_per_ton_per_km: {transport_cost_per_ton_per_km}\n"
)
# ====Parameters==== #

# ====importData==== #
# 1. population, from data/municipios_procesado.csv (indexed by its 4th column)
population = pd.read_csv(
    os.path.join(data_path, "municipios_procesado.csv"), index_col=3
)
# 2. distance, from data/distance_matrix_final.csv
distance = pd.read_csv(
    os.path.join(data_path, "distance_matrix_final.csv"), index_col=0
)
# 3. warehouses, from data/almacenes.csv
warehouses = pd.read_csv(os.path.join(data_path, "almacenes.csv"))
# ====importData==== #

# ====DataProcessing===== #
# Rows of population containing a NaN in any column get their columns from
# position 18 onward overwritten with the values of a reference row: the first
# municipality whose "2024" value equals the minimum "2024" value found in the
# departamento of Chocó.
# NOTE(review): the whole 18: slice is overwritten, not only the missing cells —
# confirm this is intended.
rows_with_nan = population.isna().any(axis=1)
choco_min_2024 = population.loc[population["departamento"] == "Chocó", "2024"].min()
reference_row = population[population["2024"] == choco_min_2024].iloc[0, 18:].values
population.loc[rows_with_nan, population.columns[18:]] = reference_row

# Drop the municipalities that have a NaN in the first column / first row of
# the distance matrix.
distance = distance.dropna(subset=[distance.columns[0]], axis=0)
distance = distance.dropna(subset=[distance.index[0]], axis=1)
# Column labels are read as strings; convert them so they can be matched
# against the index labels below.
distance.columns = distance.columns.astype(int)
# Take a random sample of the rows, then keep only the matching columns so the
# matrix is square again with rows and columns in the same order.
distance = distance.sample(frac=sample_fraq, random_state=randomSeed)
distance = distance.loc[:, distance.index]  # type: ignore

# A zero outside the diagonal is treated as a missing distance: mark it as NaN
# and drop every row that contains one. Vectorized equivalent of checking each
# (row label, column label) pair individually.
off_diagonal = (
    distance.index.to_numpy()[:, None] != distance.columns.to_numpy()[None, :]
)
distance = distance.mask(distance.eq(0).to_numpy() & off_diagonal)
distance = distance.dropna()
# Rows were dropped, so re-align the columns with the remaining index.
distance = distance.loc[:, distance.index]
# Turn distance to km
distance = distance / 1000
# Keep only the population rows whose index is present in distance
population = population.loc[distance.index]

# No entry of the matrix may be smaller than the straight-line (geodesic)
# distance between the two municipalities; raise any such entry to the
# geodesic distance. Coordinates are looked up once instead of per pair.
coordinates = {
    code: (lat, lon)
    for code, lat, lon in zip(population.index, population["lat"], population["lon"])
}
for i in distance.index:
    for j in distance.columns:
        if i == j:
            continue
        linear_distance = geopy.distance.distance(
            coordinates[i], coordinates[j]
        ).km
        if linear_distance > distance.at[i, j]:
            distance.at[i, j] = linear_distance

# Check if there are any nan values in distance
if distance.isna().any().any():
    print("There are nan values in the distance matrix")
print(
    f"data information: \n    population: {population.shape} \n    distance: {distance.shape} \n    warehouses: \n {warehouses} \n"
)
# ====DataProcessing===== #

# ====DataInformation==== #
# Memory usage of the data
print(f"population memory usage: {population.memory_usage().sum()/1024**2:.2f} MB")  # type: ignore
print(f"distance memory usage: {distance.memory_usage().sum()/1024**2:.2f} MB")  # type: ignore
print(f"warehouses memory usage: {warehouses.memory_usage().sum()/1024**2:.2f} MB")  # type: ignore
# ====DataInformation==== #

# ====DataChecking==== #
# Test to the data if needed
# ====DataChecking==== #
print(f"Elapsed time: {time.time() - start_time:.2f} s")


parameters: 
    food_per_person_per_day: 0.00087617 
    transport_cost_per_ton_per_km: 3364.0



data information: 
    population: (112, 71) 
    distance: (112, 112) 
    warehouses: 
    Type  capacity_ton        cost
0     1          1074  3111202.75
1     2          2418  4804980.75 

population memory usage: 0.07 MB
distance memory usage: 0.10 MB
warehouses memory usage: 0.00 MB
Elapsed time: 3.86 s


## 1. Demand Forecast

We use the population dataset, which contains Colombian census information from 1985 to 2035. The current year is 2024 and the last census was performed in 2018; the data was downloaded from the DANE web page in December 2023.

First, we check the current forecast; then we use four Machine Learning algorithms and a Deep Learning model to create a new forecast model. The Machine Learning algorithms are:
- Multiple Linear Regression.
- Regression Tree.
- Support Vector Machine.
- Random Forest Regression.

Then, we need the Mean Absolute Error (MAE) to compare the models. The best model will be used to forecast the demand for the next 30 years.

In [16]:
# 1. Demand Forecast
#
# Fit every candidate regressor to each municipality's standardized series and
# summarise the hold-out metrics per model.

# Columns at positions 20..58 of population, transposed so that each column is
# one municipality (presumably the rows are the yearly observations — TODO confirm).
data = population.copy().iloc[:, 20:59]
data = data.transpose()
# replace 0 values with nan
data = data.replace(0, np.nan)
# Standardize each municipality's column; the MSE and MAE reported below are
# therefore in standardized units.
scaler = StandardScaler()
data = pd.DataFrame(scaler.fit_transform(data), index=data.index, columns=data.columns)

# List of models to evaluate
models = {
    "Multiple Linear Regression": LinearRegression(),
    "Regression Tree": DecisionTreeRegressor(
        max_depth=100, min_samples_split=2, min_samples_leaf=1, random_state=randomSeed
    ),
    "Support Vector Machine": SVR(C=1.0, kernel="rbf", gamma="scale"),
    "Random Forest Regression": RandomForestRegressor(
        n_estimators=10,
        max_depth=5,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=randomSeed,
    ),
    "Neural Network for population regression": MLPRegressor(
        hidden_layer_sizes=(10, 10, 10, 10, 10),
        activation="logistic",
        solver="adam",
        alpha=0.01,
        batch_size="auto",
        learning_rate="adaptive",
        learning_rate_init=0.01,
        max_iter=1000,
        shuffle=True,
        random_state=randomSeed,
    ),
}

# Rows are collected in a list and turned into a DataFrame once, instead of
# concatenating onto an initially empty frame inside the loop (which emits a
# pandas FutureWarning and re-copies the accumulated frame on every iteration).
result_columns = ["Municipio", "Modelo", "R2", "MSE", "MAE", "Time"]
result_rows = []
for municipio in data.columns:
    dataset = data.loc[:, municipio].dropna()
    # The single feature is the position of each observation after dropping
    # NaNs, not the original column label.
    X = np.arange(len(dataset)).reshape(-1, 1)
    y = dataset.values
    # NOTE(review): the split is shuffled rather than chronological, so test
    # points can fall between training points — confirm this is the intended
    # evaluation for a forecast.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=randomSeed
    )
    for name, model in models.items():
        # Time covers fitting, prediction and metric computation.
        start_time = time.time()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        result_rows.append(
            [municipio, name, r2, mse, mae, time.time() - start_time]
        )
general_results = pd.DataFrame(result_rows, columns=result_columns)

# Only fits with R2 above 0.9 are kept, so the "count" below is the number of
# municipalities for which each model reached that threshold.
general_results = general_results[general_results["R2"] > 0.9]
# Group the results by model: count of municipalities plus min, mean, std and
# max of R2, MSE, MAE and Time (and the total Time), sorted by count.
dataframe_de_resultados = (
    general_results.groupby("Modelo")
    .agg(
        {
            "Municipio": "count",
            "R2": ["min", "mean", "std", "max"],
            "MSE": ["min", "mean", "std", "max"],
            "MAE": ["min", "mean", "std", "max"],
            "Time": ["min", "mean", "std", "max", "sum"],
        }
    )
    .reset_index()
).sort_values(("Municipio", "count"), ascending=False)
dataframe_de_resultados = dataframe_de_resultados.round(3)

display(dataframe_de_resultados)


  general_results = pd.concat(


Unnamed: 0_level_0,Modelo,Municipio,R2,R2,R2,R2,MSE,MSE,MSE,MSE,MAE,MAE,MAE,MAE,Time,Time,Time,Time,Time
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,mean,std,max,min,mean,std,max,min,mean,std,max,min,mean,std,max,sum
2,Random Forest Regression,105,0.932,0.981,0.014,0.998,0.001,0.012,0.008,0.038,0.024,0.079,0.026,0.155,0.011,0.016,0.005,0.041,1.698
3,Regression Tree,102,0.925,0.971,0.015,0.993,0.008,0.018,0.009,0.055,0.076,0.114,0.021,0.192,0.002,0.003,0.001,0.005,0.267
4,Support Vector Machine,102,0.931,0.986,0.013,0.998,0.001,0.009,0.009,0.05,0.025,0.07,0.028,0.151,0.002,0.003,0.001,0.013,0.267
0,Multiple Linear Regression,75,0.901,0.969,0.028,1.0,0.0,0.02,0.018,0.074,0.01,0.104,0.06,0.253,0.002,0.003,0.002,0.013,0.195
1,Neural Network for population regression,26,0.935,0.988,0.016,0.999,0.0,0.007,0.009,0.038,0.011,0.057,0.036,0.154,0.108,0.15,0.024,0.191,3.9
