# Uncapacitated Facility Location Problem

In [1]:
import os
import sys
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

sys.path.append("..")

# Personal modules
from lp.uflp import UFLP
from lp.cflp import CFLP
from lp.telp import TELP

# Plotting modules
import plotly.express as px
import plotly.graph_objects as go

# Linear programming modules
import pulp

# Machine learning modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# == Regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# == Classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# == Clustering
from sklearn.cluster import KMeans
from sklearn_som.som import SOM
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS

# == Neural Networks
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

# == Metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report


## Load the data

In [2]:
# import the data and parameters and check the data

data_path = "data/"
start_time = time.time()
# ====Parameters==== #
parameters = pd.read_csv(os.path.join(data_path, "parametros.csv"))
# 1. food_per_person_per_day in tons per day
food_per_person_per_day = float(
    parameters.loc[parameters["parametro"] == "comida_por_persona_en_toneladas"].values[
        0
    ][1]
)
# 2. transport_cost_per_ton_per_km in COP per ton per km
transport_cost_per_ton_per_km = float(
    parameters.loc[
        parameters["parametro"] == "costo_de_transporte_por_tonelada_por_kilomentro"
    ].values[0][1]
)

print(
    f"parameters: \n    food_per_person_per_day: {food_per_person_per_day} \n    transport_cost_per_ton_per_km: {transport_cost_per_ton_per_km}\n"
)
# ====Parameters==== #

# ====importData==== #
# 1. population, from data/municipios_procesado.csv
population = pd.read_csv(
    os.path.join(data_path, "municipios_procesado.csv"), index_col=3
)
# 2. distance, from data/distance_matrix_final.csv
distance = pd.read_csv(
    os.path.join(data_path, "distance_matrix_final.csv"), index_col=0
)
# 3. warehouses, from data/almacenes.csv
warehouses = pd.read_csv(os.path.join(data_path, "almacenes.csv"))
# ====importData==== #

# ====DataProcessing===== #
# fill the nan values in population with the minimum '2024' from the departamento of Chocó for the columns 22:
population.loc[population.isna().any(axis=1), population.columns[18:]] = (
    population[
        population["2024"]
        == population.loc[population["departamento"] == "Chocó"]["2024"].min()
    ]
    .iloc[0, 18:]
    .values
)
# drop the municipalities with nan values in the first column of the distance matrix
distance = distance.dropna(subset=[distance.columns[0]], axis=0)
distance = distance.dropna(subset=[distance.index[0]], axis=1)
# select only the rows in population dpmp that the index is in distance
population = population.loc[distance.index]
# turn the columns of distance into integers
distance.columns = distance.columns.astype(int)
print(
    f"data information: \n    population: {population.shape} \n    distance: {distance.shape} \n    warehouses: \n {warehouses} \n"
)
# ====DataProcessing===== #

# ====DataInformation==== #
# Memory usage of the data
print(f"population memory usage: {population.memory_usage().sum()/1024**2:.2f} MB")
print(f"distance memory usage: {distance.memory_usage().sum()/1024**2:.2f} MB")
print(f"warehouses memory usage: {warehouses.memory_usage().sum()/1024**2:.2f} MB")
# ====DataInformation==== #

# ====DataChecking==== #
# Test to the data if needed
# ====DataChecking==== #
print(f"Elapsed time: {time.time() - start_time:.2f} s")


parameters: 
    food_per_person_per_day: 0.00087617 
    transport_cost_per_ton_per_km: 3364.0

data information: 
    population: (1117, 71) 
    distance: (1117, 1117) 
    warehouses: 
    Type  capacity_ton        cost
0     1          1074   731159925
1     2          2418  1129212606 

population memory usage: 0.61 MB
distance memory usage: 9.56 MB
warehouses memory usage: 0.00 MB
Elapsed time: 0.36 s


## 1. Demand Forecast

we are using the data from the population dataset that has the colombian census information since 1985 to 2035, the current year is 2024 and the last census was performed in 2018, the data was taken on december 2023 from the DANE web page.

First we need to check the current forecast, then use 4 ML algorithms and Deep Learning to create a new model. The Machine Learning algorithms are:
- Multiple Linear Regression.
- Regression Tree.
- Support Vector Machine.
- Random Forest Regression.

Then, we need the Mean Absolute Error (MAE) to compare the models. The best model will be used to forecast the demand for the next 30 years.

In [3]:
# ## 1. Demand Forecast

# 1. Demand Forecast
from pandas import RangeIndex


data = population.copy().iloc[:, 20:]
data = data.transpose()

# Standardize the data
scaler = StandardScaler()
data = pd.DataFrame(scaler.fit_transform(data), index=data.index, columns=data.columns)

# Split the data into train and test sets
train, test = train_test_split(data, test_size=0.2, random_state=0)

# List of models to evaluate
models = {
    "Multiple Linear Regression": LinearRegression(),
    "Regression Tree": DecisionTreeRegressor(
        max_depth=100, min_samples_split=2, min_samples_leaf=1
    ),
    "Support Vector Machine": SVR(C=1.0, kernel="rbf", gamma="scale"),
    # "Random Forest Regression 10, 5": RandomForestRegressor(
    #     n_estimators=10, max_depth=5, min_samples_split=2, min_samples_leaf=1
    # ),
    # "Random Forest Regression 100, 10": RandomForestRegressor(
    #     n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1
    # ),
}

best_model = None
best_score = float("-inf")
results = {}

# Iterate over models
for model_name, model in models.items():
    start_time = time.time()
    mean_absolute_errors = []
    mean_squared_errors = []
    r2_scores = []

    # For each column in the data
    for column in data.columns:
        model.fit(train.index.values.reshape(-1, 1), train[column])
        predictions = model.predict(test.index.values.reshape(-1, 1))
        mean_absolute_errors.append(mean_absolute_error(test[column], predictions))
        mean_squared_errors.append(mean_squared_error(test[column], predictions))
        r2_scores.append(r2_score(test[column], predictions))

    avg_r2_score = np.mean(r2_scores)
    results[model_name] = {
        "time": time.time() - start_time,
        "mean_absolute_error": np.mean(mean_absolute_errors),
        "mean_squared_error": np.mean(mean_squared_errors),
        "r2_score": avg_r2_score,
    }

    if avg_r2_score > best_score:
        best_score = avg_r2_score
        best_model = model

# Print results
for model_name, metrics in results.items():
    print(f"{model_name}: {metrics['time']:.2f} s")
    print(f"    Mean Absolute Error: {metrics['mean_absolute_error']}")
    print(f"    Mean Squared Error: {metrics['mean_squared_error']}")
    print(f"    R2 Score: {metrics['r2_score']}")

print(f"\nBest model: {best_model}")

# Create the forecast per municipality since 1985 to 2024+30 years
# Create a list to store all columns
columns = []
forecast = pd.DataFrame(index=np.arange(1985, 2024 + 30))
# Iterate over each column in data
for column in data.columns:
    best_model.fit(data.index.values.reshape(-1, 1), data[column])
    column_data = pd.DataFrame(
        best_model.predict(np.arange(1985, 2024 + 30).reshape(-1, 1)), columns=[column]
    )
    columns.append(column_data)

# Concatenate all columns at once
forecast = pd.concat(columns, axis=1)


# Inverse the standardization
data = pd.DataFrame(
    scaler.inverse_transform(data), index=data.index, columns=data.columns
)
forecast = pd.DataFrame(
    scaler.inverse_transform(forecast), index=forecast.index, columns=forecast.columns
)
forecast.index = RangeIndex(start=1985, stop=2024 + 30, step=1)

# Plot the forecast
fig = go.Figure()
for column in forecast.columns:
    fig.add_trace(
        go.Scatter(
            x=forecast.index,
            y=forecast[column],
            mode="lines+markers",
            name=column,
        )
    )

fig.update_layout(
    title="Population Forecast",
    xaxis_title="Year",
    yaxis_title="Population",
    legend_title="Municipality",
)
fig.show()

# Select the year 2024+30 and add the population to the population dataframe
population["forecast"] = forecast.loc[2024 + 10]


Multiple Linear Regression: 2.01 s
    Mean Absolute Error: 0.428457750684778
    Mean Squared Error: 0.3664370666800922
    R2 Score: 0.5860169997999521
Regression Tree: 1.77 s
    Mean Absolute Error: 0.09088492122658727
    Mean Squared Error: 0.03251396694585165
    R2 Score: 0.967284451899873
Support Vector Machine: 1.93 s
    Mean Absolute Error: 0.10992893224062007
    Mean Squared Error: 0.05873704491266884
    R2 Score: 0.9431877181416908

Best model: DecisionTreeRegressor(max_depth=100)


## 2. Set facility types

We have 3 types of facilities:
- Type 1: Small facility.
- Type 2: Medium facility.
- Type 3: Large facility.

for each facility type we have the following information:
- Fixed cost.
- Variable cost.

The proposal is to use a mathematical model to determine the number of facilities of each type to minimize the total cost, in order to satisfy the demand. and be able to train a classifier to predict the type of facility that will be needed for the hole country.

### Model

#### Sets
- $I$: Set of municipalities.
- $J$: Types of facilities.

#### Parameters
- $c_{j}$: Capacity of facility type $j$.
- $f_{j}$: Fixed cost of facility type $j$.

- $p_{i}$: Population of municipality $i$.
- $N$: Food Demand per capita in Tonnes.
- $d_{i}$: Demand of municipality $i$ = $p_{i} * N$.
- $r_{i}$: department aggregation of population of municipality $i$.
- $\lambda_{i}$: Multiplier of demand of municipality based on its department population.$

#### Decision Variables
- $x_{ij}$: Number of facilities of type $j$ in municipality $i$.

#### Objective Function
- Minimize the total cost of the facilities.

$$ \text{Min} \sum_{i \in I} \sum_{j \in J} f_{j} * x_{ij} $$

#### Constraints

- Demand constraint: The demand of municipality $i$ must be satisfied.

$$ \sum_{j \in J} x_{ij} * c_{j} \geq d_{i} * \lambda_{i} \quad \forall i \in I $$

- (optional) General capacity:

$$ \sum_{i \in I} \sum_{j \in J} x_{ij} * c_{j} \leq \sum_{i \in I} d_{i} * \lambda_{i} $$

- Non-negativity:

$$ x_{ij} \geq 0 \quad \forall i \in I, \forall j \in J $$

- Integer:

$$ x_{ij} \in \mathbb{Z} \quad \forall i \in I, \forall j \in J $$



In [4]:
# Optimization of the model
p = population.copy()["forecast"]

# ====Parameters==== #
c_j = warehouses["capacity_ton"].values.astype(float)
f_j = warehouses["cost"].values.astype(float)


p_i = p.values.astype(int)
n = food_per_person_per_day * 7
d_i = p_i * n
# Lambda = 2 for the top 32 municipalities, 1 for the rest
lambda_i = np.ones(len(p_i)) * 1.1
lambda_i[np.argsort(p_i)[-32:]] = 1.5
# ====Parameters==== #


# Decision variables: $x_{ij}$
I = range(len(p_i))
J = range(len(c_j))
model = pulp.LpProblem("UFLP", pulp.LpMinimize)
x = pulp.LpVariable.dicts(
    "x", ((i, j) for i in I for j in J), lowBound=0, cat="Integer"
)
y = pulp.LpVariable.dicts("y", ((i, j) for i in I for j in J), cat="Binary")
# Objective function
model += (
    pulp.lpSum(f_j[j] * x[(i, j)] for i in I for j in J),
    "Total cost of the facilities",
)
# Constraints
for i in I:
    model += (
        pulp.lpSum(c_j[j] * x[(i, j)] for j in J) >= d_i[i] * lambda_i[i],
        f"Population demand {i}",
    )
    model += (
        pulp.lpSum(x[(i, j)] for j in J) >= 1,
        f"Facility assignment {i}",
    )
    model += (
        pulp.lpSum(y[(i, j)] for j in J) == 1,
        f"Faacility assignment __ {i}",
    )
    for j in J:
        model += (
            x[(i, j)] <= 100 * y[(i, j)],
            f"Fsacility assignment _ {i} _ {j}",
        )

model += pulp.lpSum(x[(i, j)] * c_j[j] for i in I for j in J) >= pulp.lpSum(
    d_i[i] * lambda_i[i] for i in I
)


# Solve the model
model.solve(
    solver=pulp.PULP_CBC_CMD(
        logPath="__workflowsolution_2.log",
        msg=False,
        timeLimit=5 * 60,
        threads=os.cpu_count(),
    )
)
# Results
print(f"Status: {pulp.LpStatus[model.status]}")
print(f"Objective function: {pulp.value(model.objective)}")
df = pd.DataFrame(
    [[pulp.value(x[(i, j)]) for j in J] for i in I],
    columns=[f"Facility {j}" for j in J],
    index=p.index,
)
df["demand"] = d_i
df["satisfied"] = sum(df[f"Facility {j}"] * c_j[j] for j in J)
df["cost"] = sum(df[f"Facility {j}"] * f_j[j] for j in J)
df["comb"] = df.apply(lambda x: tuple(x[: len(J)]), axis=1)

display(df.comb.unique().size, df.comb.unique())
display(df)


Status: Optimal
Objective function: 936718519587.0


11

array([(0.0, 11.0), (1.0, 0.0), (0.0, 3.0), (0.0, 1.0), (3.0, 0.0),
       (0.0, 6.0), (0.0, 2.0), (0.0, 4.0), (0.0, 30.0), (0.0, 9.0),
       (0.0, 5.0)], dtype=object)

Unnamed: 0_level_0,Facility 0,Facility 1,demand,satisfied,cost,comb
dpmp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5001,0.0,11.0,16872.724616,26598.0,1.242134e+10,"(0.0, 11.0)"
5002,1.0,0.0,138.511963,1074.0,7.311599e+08,"(1.0, 0.0)"
5004,1.0,0.0,18.454769,1074.0,7.311599e+08,"(1.0, 0.0)"
5021,1.0,0.0,32.002985,1074.0,7.311599e+08,"(1.0, 0.0)"
5030,1.0,0.0,208.920984,1074.0,7.311599e+08,"(1.0, 0.0)"
...,...,...,...,...,...,...
17444,1.0,0.0,87.011567,1074.0,7.311599e+08,"(1.0, 0.0)"
52051,1.0,0.0,60.092996,1074.0,7.311599e+08,"(1.0, 0.0)"
76109,3.0,0.0,1990.232421,3222.0,2.193480e+09,"(3.0, 0.0)"
47189,1.0,0.0,881.216739,1074.0,7.311599e+08,"(1.0, 0.0)"


## 3. Set facility capacity

We have m types of facilities with different capacities, the proposal is to use the data generated in the previous step to determine the capacity of each facility type in order to satisfy the demand.
Machine Learning algorithms will be used to Classify the type of facility that will be needed for each municipality.
- Decision Tree.
- Linear Discriminant Analysis.
- Logistic Regression.
- Support Vector Machine.

and deep learning to create a new model. The best model will be used to determine the capacity of each facility type.

The objective is to predict 'satisfied' with the demand of each municipality.


In [5]:
data = pd.merge(df.copy(), population.copy(), left_index=True, right_index=True)
population["demand"] = d_i
population["_satisfied"] = df["satisfied"]
population["_cost"] = df["cost"]


data = data[["demand", "lat", "lon", "satisfied", "cost"]]

# Create table with the unique cases of satisfied and cost values, with a random sample of n of the demand, lat and lon
unique_cases = data.groupby(["satisfied", "cost"]).apply(
    lambda x: x.sample(n=2, random_state=0) if len(x) > 2 else x
)
data = unique_cases.reset_index(drop=True)[["demand", "satisfied"]]
display(data)
le = LabelEncoder()
data["satisfied"] = le.fit_transform(data["satisfied"])
scaler = MinMaxScaler()
data[["demand"]] = scaler.fit_transform(data[["demand"]])
train_x, test_x, train_y, test_y = train_test_split(
    data.drop(columns=["satisfied"]), data["satisfied"], test_size=0.2, random_state=0
)

train_x, test_x = data.drop(columns=["satisfied"]), data.drop(columns=["satisfied"])
train_y, test_y = data["satisfied"], data["satisfied"]

# List of models to evaluate
models = {
    "Decision Tree": DecisionTreeClassifier(
        max_depth=100, min_samples_split=2, min_samples_leaf=1
    ),
    "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(C=1.0, kernel="rbf", gamma="scale"),
    "Deep Neural Network": MLPClassifier(
        hidden_layer_sizes=(100, 100, 100),
        activation="relu",
        solver="adam",
        max_iter=1000,
        random_state=0,
    ),
}

best_model = None
best_score = float("-inf")
results = {}
metrics = {
    "accuracy": [],
    "confusion_matrix": [],
    "classification_report": [],
    "time": [],
}

# Iterate over models
for model_name, model in models.items():
    start_time = time.time()
    model.fit(train_x, train_y)
    predictions = model.predict(test_x)
    accuracy = accuracy_score(test_y, predictions)
    confusion = confusion_matrix(test_y, predictions)
    classification = classification_report(test_y, predictions, zero_division=0)
    results[model_name] = {
        "time": time.time() - start_time,
        "accuracy": accuracy,
        "confusion_matrix": confusion,
        "classification_report": classification,
    }

    if accuracy > best_score:
        best_score = accuracy
        best_model = model

    metrics["accuracy"].append(accuracy)
    metrics["confusion_matrix"].append(confusion)
    metrics["classification_report"].append(classification)
    metrics["time"].append(time.time() - start_time)

# Print results
for model_name, metrics in results.items():
    print(f"{model_name}: {metrics['time']:.2f} s")
    print(f"    Accuracy: {metrics['accuracy']}")
    print(f"    Confusion Matrix: {metrics['confusion_matrix']}")
    print(f"    Classification Report: {metrics['classification_report']}")

print(f"\nBest model: {best_model}")

# Classify the data for the population data for every model
for model_name, model in models.items():
    population[f"_satisfied {model_name}"] = model.predict(population[["demand"]])
    population[f"_satisfied {model_name}"] = le.inverse_transform(
        population[f"_satisfied {model_name}"]
    )


Unnamed: 0,demand,satisfied
0,191.245131,1074.0
1,175.72816,1074.0
2,1260.70787,2418.0
3,1231.066163,2418.0
4,2097.876039,3222.0
5,1929.912498,3222.0
6,2192.425296,4836.0
7,2557.82849,4836.0
8,3969.964821,7254.0
9,3843.81737,7254.0


Decision Tree: 0.01 s
    Accuracy: 1.0
    Confusion Matrix: [[2 0 0 0 0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0 0 0 0 0]
 [0 0 2 0 0 0 0 0 0 0 0]
 [0 0 0 2 0 0 0 0 0 0 0]
 [0 0 0 0 2 0 0 0 0 0 0]
 [0 0 0 0 0 2 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 1]]
    Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1

## 4. Propose a k parameter for clustering using Linear Programming

The proposal is to use the data generated in the previous step to determine the number of clusters that will be needed to satisfy the demand. The objective is to minimize the total cost of the facilities.


In [6]:
# Example data (replace with your actual data)
data = population.copy()
data = data.loc[:, ["lat", "lon"]]
data = StandardScaler().fit_transform(data)
data = pd.DataFrame(data, columns=["lat", "lon"], index=population.copy().index)
# Initialize dictionaries to store metrics
metrics = {"inertia": [], "silhouette": [], "davies_bouldin": []}

# Range for the number of clusters
cluster_range = range(2, 20)

# Calculate metrics for each number of clusters
for i in cluster_range:
    model = KMeans(n_clusters=i, random_state=0)
    model.fit(data[["lat", "lon"]])
    metrics["inertia"].append(model.inertia_)
    metrics["silhouette"].append(silhouette_score(data[["lat", "lon"]], model.labels_))
    metrics["davies_bouldin"].append(
        davies_bouldin_score(data[["lat", "lon"]], model.labels_)
    )

# Create interactive plots using Plotly
fig = go.Figure()

# Inertia Plot
fig.add_trace(
    go.Scatter(
        x=list(cluster_range),
        y=metrics["inertia"],
        mode="lines+markers",
        name="Inertia",
        yaxis="y",
    )
)

# Silhouette Plot
fig.add_trace(
    go.Scatter(
        x=list(cluster_range),
        y=metrics["silhouette"],
        mode="lines+markers",
        name="Silhouette Score",
        yaxis="y2",
    )
)

# Davies-Bouldin Plot
fig.add_trace(
    go.Scatter(
        x=list(cluster_range),
        y=metrics["davies_bouldin"],
        mode="lines+markers",
        name="Davies-Bouldin Score",
        yaxis="y2",
    )
)

# Update layout
fig.update_layout(
    title="Clustering Metrics",
    xaxis_title="Number of Clusters",
    yaxis_title="Inertia",
    legend_title="Metric",
    template="plotly_white",
    yaxis=dict(side="left", showgrid=False, zeroline=False, title="Normalized Inertia"),
    yaxis2=dict(
        side="right",
        overlaying="y",
        showgrid=False,
        zeroline=False,
        title="Normalized Score (0-1)",
        title_standoff=10,  # Adjust the distance between the axis title and axis tick labels
    ),
    margin=dict(r=100, t=100),  # Adjust the right and top margins
)

# Show plot
fig.show()


In [7]:
data = population.copy()
dist = distance.copy()
muestra = 0.1
randomSeed = np.random.randint(0, 1000)
dist = dist.sample(frac=muestra, random_state=0, axis=0).sample(
    frac=muestra, random_state=0, axis=1
)


# Select the same rows than the distance matrix
data = data.loc[dist.copy().index]


# Standardize the data
dist = MinMaxScaler().fit_transform(dist)
fixed_cost = max(dist.flatten()) * (1 + muestra)


model = pulp.LpProblem("UFLP", pulp.LpMinimize)


# Decision variables
I = range(len(data))
J = range(len(data))


x = pulp.LpVariable.dicts("x", ((i, j) for i in I for j in J), cat="Binary")
y = pulp.LpVariable.dicts("y", J, cat="Binary")


# Objective function
model += (
    pulp.lpSum(dist[i][j] * x[(i, j)] for i in I for j in J)
    + fixed_cost * pulp.lpSum(y[j] for j in J),
    "Total cost",
)


# Constraints
for i in I:
    model += pulp.lpSum(x[(i, j)] for j in J) == 1, f"Population demand {i}"


for j in J:
    for i in I:
        model += x[(i, j)] <= y[j], f"Facility assignment {i} {j}"


# Solve the model
start_time = time.time()
model.solve(
    solver=pulp.PULP_CBC_CMD(
        logPath="__workflowsolution_4.log",
        msg=False,
        timeLimit=60 * 60,
        threads=os.cpu_count(),
    )
)
print(f"Elapsed time: {time.time() - start_time:.2f} s")
# Results
print(f"Status: {pulp.LpStatus[model.status]}")
print(f"Objective function: {pulp.value(model.objective)}")
print(f"Number of facilities: {pulp.value(pulp.lpSum(y[j] for j in J))}")


df = pd.DataFrame([[pulp.value(x[(i, j)]) for j in J] for i in I], columns=J, index=I)


# Turn the results into labels for the clusters
df["cluster"] = df.idxmax(axis=1)
display(df["cluster"].unique().size / muestra, df["cluster"].unique())


# Plot the results
fig = px.scatter(
    data,
    x="lon",
    y="lat",
    color=df["cluster"],
    title="Facilities location",
    labels={"lat": "Latitude", "lon": "Longitude"},
)
fig.show()


Elapsed time: 1.17 s
Status: Optimal
Objective function: 21.127177247703315
Number of facilities: 7.0


70.0

array([ 13, 111,  85,  63,  28,  74,  84], dtype=int64)

## 5. Solve the CFLP inside every cluster

Here, we will use the clusters to find the local optimum to the cflp inside the clusters generated with the following clustering algorithms:

- KMeans
- SOM
- AgglomerativeClustering
- DBSCAN



In [8]:
data = population.copy()
data = data.loc[:, ["lat", "lon"]]
data = StandardScaler().fit_transform(data)
data = pd.DataFrame(data, columns=["lat", "lon"], index=population.copy().index)

# Initialize dictionaries to store metrics
metrics = {
    "model": [],
    "silhouette": [],
    "davies_bouldin": [],
    "n_clusters": [],
    "time": [],
}

# Range for the number of clusters
n_clusters = 6
# models to evaluate
models = {
    "SOM": SOM(dim=2, random_state=0, m=3, n=2),
    "KMeans": KMeans(n_clusters=n_clusters, random_state=0),
    "Agglomerative": AgglomerativeClustering(n_clusters=n_clusters),
    "DBSCAN": DBSCAN(eps=0.15, min_samples=6),
}

# Calculate metrics for each number of clusters and save the best model
for model_name, model in models.items():
    start_time = time.time()
    if model_name == "SOM":
        model.fit(data[["lat", "lon"]].to_numpy())
        data[f"{model_name}_cluster"] = model.predict(data[["lat", "lon"]].to_numpy())
        metrics["silhouette"].append(
            silhouette_score(data, model.predict(data[["lat", "lon"]].to_numpy()))
        )
        metrics["davies_bouldin"].append(
            davies_bouldin_score(data, model.predict(data[["lat", "lon"]].to_numpy()))
        )
        metrics["n_clusters"].append(
            len(np.unique(model.predict(data[["lat", "lon"]].to_numpy())))
        )
    else:
        model.fit(data[["lat", "lon"]])
        data[f"{model_name}_cluster"] = model.labels_
        metrics["silhouette"].append(silhouette_score(data, model.labels_))
        metrics["davies_bouldin"].append(davies_bouldin_score(data, model.labels_))
        metrics["n_clusters"].append(len(np.unique(model.labels_)))
    metrics["time"].append(time.time() - start_time)

    metrics["model"].append(model_name)


# Create a DataFrame with the results
display(metrics)

# Plot the results in a map that shows the contours of the clusters and of Colombia
for model_name, model in models.items():
    fig = px.scatter(
        data,
        x="lon",
        y="lat",
        color=f"{model_name}_cluster",
        title=f"{model_name} Clustering",
        labels={"lat": "Latitude", "lon": "Longitude"},
    )

    fig.update_geos(
        fitbounds="locations",
        showcountries=True,
        countrycolor="Black",
        showland=True,
        landcolor="LightGreen",
    )

    fig.show()


{'model': ['SOM', 'KMeans', 'Agglomerative', 'DBSCAN'],
 'silhouette': [0.573089559049394,
  0.5605305428259316,
  0.5511707923084272,
  0.019228872396535413],
 'davies_bouldin': [0.6806932474735475,
  0.610126942810624,
  0.8731777764351457,
  1.5197206679752882],
 'n_clusters': [6, 6, 6, 7],
 'time': [0.24700307846069336,
  0.03752279281616211,
  0.06600069999694824,
  0.03900003433227539]}

In [9]:
display(data)
# Size of the clusters for each model
for model_name in models.keys():
    display(data.groupby(f"{model_name}_cluster").size())

# add the information to the population dataframe
for model_name in models.keys():
    population[f"cluster {model_name}"] = data[f"{model_name}_cluster"]


Unnamed: 0_level_0,lat,lon,SOM_cluster,KMeans_cluster,Agglomerative_cluster,DBSCAN_cluster
dpmp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5001,0.245589,-0.574343,3,1,3,0
5002,0.078276,-0.485713,3,1,3,0
5004,0.393190,-0.878473,3,1,3,0
5021,0.288099,-0.280160,3,1,3,0
5030,0.166041,-0.653241,3,1,3,0
...,...,...,...,...,...,...
17444,-0.115324,-0.252903,3,1,3,0
52051,-1.566987,-1.532879,1,3,1,0
76109,-0.651670,-1.503628,1,5,1,-1
47189,2.059002,0.251037,5,2,0,-1


SOM_cluster
0     93
1    230
2    152
3    164
4    268
5    210
dtype: int64

KMeans_cluster
0     29
1    305
2    198
3     95
4    368
5    122
dtype: int64

Agglomerative_cluster
0    225
1    191
2    208
3    276
4     29
5    188
dtype: int64

DBSCAN_cluster
-1    176
 0    729
 1     10
 2      5
 3    140
 4     43
 5     14
dtype: int64

In [18]:
from cProfile import label
from calendar import c
from matplotlib.pylab import f
from matplotlib.pyplot import margins
from seaborn import color_palette
from sklearn.cluster import KMeans

data = population.copy()[
    [
        "demand",
        "_satisfied",
        "_cost",
        "lat",
        "lon",
    ]
]
dist = distance.copy()
print(f"size: {data.shape}")

# The algorithm follows the following steps:
# 1. Split the data into clusters
# 2. for each clusters, check if the size is greater than 1 and lower than max_chunk_size
# 2.2. If the cluster size is greater than max_chunk_size, split the cluster into int(cluster_size/max_chunk_size + 1) clusters until all clusters have a size lower than max_chunk_size
# 3. For each cluster, check if the CFLP is feasible
# 3.1. If the CFLP is not feasible, add warehouses until the CFLP is feasible# 4.2.
# 3.2. If the CFLP is feasible, run the CFLP
# 4. Create a new dataframe with the results of the CFLP
# 5. plot the results of the CFLP
# 6. Save the results of the CFLP


max_chunk_size = 200
start_n_clusters = 7
model = KMeans(n_clusters=start_n_clusters, random_state=0)

# 1. Split the data into clusters
model.fit(data[["lat", "lon"]])
data["cluster"] = model.labels_

# 2. for each clusters, check if the size is greater than 1 and lower than max_chunk_size
if data.groupby("cluster").size().max() > max_chunk_size:
    while data.groupby("cluster").size().max() > max_chunk_size:
        for cluster in data.groupby("cluster"):
            if cluster[1].shape[0] > max_chunk_size:
                n_clusters = int(cluster[1].shape[0] / max_chunk_size) + 1
                model = KMeans(n_clusters=n_clusters, random_state=0)
                model.fit(cluster[1][["lat", "lon"]])
                data.loc[cluster[1].index, "cluster"] = (
                    model.labels_ + data["cluster"].max() + 1
                )
for cluster in data.groupby("cluster"):
    print(f"cluster {cluster[0]}: {cluster[1].shape[0]} municipalities")

# 3. For each cluster, check if the CFLP is feasible
for cluster in data.groupby

# Show the sum of all the clusters
print(data.groupby("cluster").size().sum())

# 5. plot the results of the CFLP in a map with the contours of Colombia
fig = go.Figure()
fig.add_trace(
    go.Scattermapbox(
        lat=data["lat"],
        lon=data["lon"],
        mode="markers",
        marker=dict(
            size=15,
            color=data["cluster"],
        ),
        text=data["cluster"],
    )
)

fig.update_layout(
    mapbox=dict(
        style="carto-positron",
        center=dict(lon=-74, lat=4),
        zoom=4,
    ),
    showlegend=False,
    margin=dict(l=0, r=0, t=0, b=0),
)
fig.show()


size: (1117, 5)
cluster 1: 191 municipalities
cluster 2: 198 municipalities
cluster 3: 156 municipalities
cluster 4: 155 municipalities
cluster 5: 155 municipalities
cluster 6: 26 municipalities
cluster 8: 34 municipalities
cluster 9: 109 municipalities
cluster 10: 93 municipalities
1117


1117
