# Uncapacitated Facility Location Problem

In [12]:
#
import os
import sys
import time
import pandas as pd
import numpy as np

sys.path.append("..")

# Personal modules
from lp.uflp import UFLP
from lp.cflp import CFLP
from lp.telp import TELP

# Linear programming modules
import pulp

# Machine learning modules
# == Regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# == Clustering


## Load the data

In [13]:
# Load the model parameters and the input tables from `data_path`, align the
# population and distance tables on the same municipalities, and report sizes.

data_path = "data/"
start_time = time.time()


def _read_parameter(table, name):
    """Return the value stored for parameter ``name`` in ``table`` as a float.

    Args:
        table: DataFrame with a ``parametro`` column holding parameter names.
            The value is read from the second column of the first matching row.
        name: exact parameter name to look up.

    Returns:
        The parameter value converted to ``float``.

    Raises:
        KeyError: if no row of ``table`` has ``parametro == name``.
    """
    matches = table.loc[table["parametro"] == name]
    if matches.empty:
        raise KeyError(f"parameter {name!r} not found in the parameters table")
    return float(matches.values[0][1])


# ====Parameters==== #
parameters = pd.read_csv(os.path.join(data_path, "parametros.csv"))
# 1. food_per_person_per_day in tons per day
food_per_person_per_day = _read_parameter(
    parameters, "comida_por_persona_en_toneladas"
)
# 2. transport_cost_per_ton_per_km in COP per ton per km
transport_cost_per_ton_per_km = _read_parameter(
    parameters, "costo_de_transporte_por_tonelada_por_kilomentro"
)

print(
    f"parameters: \n    food_per_person_per_day: {food_per_person_per_day} \n    transport_cost_per_ton_per_km: {transport_cost_per_ton_per_km}\n"
)
# ====Parameters==== #

# ====importData==== #
# 1. population, from data/municipios_procesado.csv (fourth column is the index)
population = pd.read_csv(
    os.path.join(data_path, "municipios_procesado.csv"), index_col=3
)
# 2. distance, from data/distance_matrix_final.csv (first column is the index)
distance = pd.read_csv(
    os.path.join(data_path, "distance_matrix_final.csv"), index_col=0
)
# 3. warehouses, from data/almacenes.csv
warehouses = pd.read_csv(os.path.join(data_path, "almacenes.csv"))
# ====importData==== #

# ====DataProcessing===== #
# Positional offset of the first column that gets overwritten when a row has
# missing values.
# NOTE(review): the previous comment mentioned column 22 while the code has
# always used position 18 -- confirm which columns are intended.
FILL_START_COL = 18

# Donor row: the first row whose "2024" value equals the smallest "2024" value
# found among the rows of the departamento "Chocó".
choco_min_2024 = population.loc[population["departamento"] == "Chocó"]["2024"].min()
donor_values = (
    population[population["2024"] == choco_min_2024].iloc[0, FILL_START_COL:].values
)
# Every row containing at least one NaN gets ALL of its columns from
# FILL_START_COL onward replaced by the donor values (not only the NaN cells).
rows_with_nan = population.isna().any(axis=1)
population.loc[rows_with_nan, population.columns[FILL_START_COL:]] = donor_values

# Drop the rows that are NaN in the first column of the distance matrix, then
# the columns that are NaN in its first remaining row.
distance = distance.dropna(subset=[distance.columns[0]], axis=0)
distance = distance.dropna(subset=[distance.index[0]], axis=1)
# Keep only the population rows whose index appears in distance, in that order.
population = population.loc[distance.index]
# Column labels are read as strings; turn them into integers.
distance.columns = distance.columns.astype(int)
print(
    f"data information: \n    population: {population.shape} \n    distance: {distance.shape} \n    warehouses: \n {warehouses} \n"
)
# ====DataProcessing===== #

# ====DataInformation==== #
# Memory usage of the data
print(f"population memory usage: {population.memory_usage().sum()/1024**2:.2f} MB")
print(f"distance memory usage: {distance.memory_usage().sum()/1024**2:.2f} MB")
print(f"warehouses memory usage: {warehouses.memory_usage().sum()/1024**2:.2f} MB")
# ====DataInformation==== #

# ====DataChecking==== #
# Test to the data if needed
# ====DataChecking==== #
print(f"Elapsed time: {time.time() - start_time:.2f} s")


parameters: 
    food_per_person_per_day: 0.00087617 
    transport_cost_per_ton_per_km: 3364.0

data information: 
    population: (1117, 71) 
    distance: (1117, 1117) 
    warehouses: 
    Type  capacity_ton        cost
0     1          1074   731159925
1     2          2418  1129212606
2     3          9672  4516850424 

population memory usage: 0.61 MB
distance memory usage: 9.56 MB
warehouses memory usage: 0.00 MB
Elapsed time: 0.29 s


## 1. Demand Forecast

We use the population dataset, which contains the Colombian census information from 1985 to 2035. The current year is 2024 and the last census was performed in 2018; the data was taken in December 2023 from the DANE web page.

First we need to check the current forecast, then use 4 ML algorithms and Deep Learning to create a new model. The Machine Learning algorithms are:
- Multiple Linear Regression.
- Regression Tree.
- Support Vector Machine.
- Random Forest Regression.

Then, we need the Mean Absolute Error (MAE) to compare the models. The best model will be used to forecast the demand for the next 30 years.

In [14]:
# ## 1. Demand Forecast
#
# The population dataset holds the Colombian census information for the
# population of 1123 municipalities from 1985 to 2035 (current year 2024, last
# census 2018, data taken in December 2023 from the DANE web page), plus the
# latitude and longitude of the municipalities.
# Four regressors are compared (linear regression, regression tree, support
# vector machine, random forest) using MAE, MSE and R2 averaged over all
# municipalities; the best one is meant to forecast demand for the next 30 years.


def evaluate_regressor(make_model, train, test):
    """Fit one fresh model per column and score it on the held-out rows.

    The index of ``train`` / ``test`` is used as the single input feature and
    each column is a separate target.

    Args:
        make_model: zero-argument callable returning an unfitted estimator that
            exposes ``fit(X, y)`` and ``predict(X)``.
        train: DataFrame with the training rows.
        test: DataFrame with the held-out rows and the same columns as ``train``.

    Returns:
        Three lists ``(mean_absolute_errors, mean_squared_errors, r2_scores)``
        with one entry per column, in column order.
    """
    # The feature matrices do not depend on the column, so build them once.
    x_train = np.array(train.index).reshape(-1, 1)
    x_test = np.array(test.index).reshape(-1, 1)
    maes, mses, r2s = [], [], []
    for column in train.columns:
        estimator = make_model()
        estimator.fit(x_train, train[column])
        predicted = estimator.predict(x_test)
        maes.append(mean_absolute_error(test[column], predicted))
        mses.append(mean_squared_error(test[column], predicted))
        r2s.append(r2_score(test[column], predicted))
    return maes, mses, r2s


# Columns from position 20 onward, transposed so that each row is one original
# column label and each column is one municipality.
# NOTE(review): presumably these are the yearly population columns -- confirm
# the offset against the CSV layout.
data = population.copy().iloc[:, 20:]
data = data.transpose()
# train with a random sample of 80% of the data
# NOTE(review): a random split over the rows scores interpolation between known
# rows; a forecast is better judged on a chronological hold-out -- confirm intent.
train = data.sample(frac=0.8, random_state=0)
test = data.drop(train.index)

# Printed prefix (kept exactly as before) and model factory for each candidate.
# The tree-based models are seeded so that a re-run reproduces the same scores.
regressors = [
    ("1.1. Multiple Linear Regression: ", LinearRegression),
    ("1.2. Regression Tree: ", lambda: DecisionTreeRegressor(random_state=0)),
    ("1.3. Support Vector Machine: ", svm.SVR),
    (
        "1.4. Random Forest Regression:  ",
        lambda: RandomForestRegressor(random_state=0),
    ),
]

for prefix, make_model in regressors:
    start_time = time.time()
    mean_absolute_errors, mean_squared_errors, r2_scores = evaluate_regressor(
        make_model, train, test
    )
    print(f"{prefix}{time.time()-start_time:.2f} s")
    print(f"    Mean Absolute Error: {np.mean(mean_absolute_errors)}")
    print(f"    Mean Squared Error: {np.mean(mean_squared_errors)}")
    print(f"    R2 Score: {np.mean(r2_scores)}")

# TODO: the deep learning comparison below is disabled; enable it or remove it.
# # 1.5 Deep Learning
# start_time = time.time()
# # for the deep learning model we are going to use a simple neural network with 3 layers using the Keras library
# # we are going to use the same data as before
# from keras.models import Sequential
# from keras.layers import Dense
# import tensorflow as tf
# mean_absolute_errors = []
# mean_squared_errors = []
# r2_scores = []
# # for each column in the data
# for i in data.columns:
#     model = Sequential()
#     model.add(Dense(12, input_dim=1, activation='relu'))
#     model.add(Dense(8, activation='relu'))
#     model.add(Dense(1, activation='linear'))
#     model.compile(loss='mean_squared_error', optimizer='adam')
#     model.fit(np.array(train.index, dtype=float), train[i], epochs=150, batch_size=10, verbose=0)
#     predictions = model.predict(np.array(test.index, dtype=float))
#     mean_absolute_errors.append(mean_absolute_error(test[i], predictions))
#     mean_squared_errors.append(mean_squared_error(test[i], predictions))
#     r2_scores.append(r2_score(test[i], predictions))
# print(f"1.5. Deep Learning:  {time.time()-start_time:.2f} s")
# print(f'    Mean Absolute Error: {np.mean(mean_absolute_errors)}')
# print(f'    Mean Squared Error: {np.mean(mean_squared_errors)}')
# print(f'    R2 Score: {np.mean(r2_scores)}')


1.1. Multiple Linear Regression: 2.15 s
    Mean Absolute Error: 1983.0903511087888
    Mean Squared Error: 271714255.3285868
    R2 Score: 0.6550556575024784
1.2. Regression Tree: 1.84 s
    Mean Absolute Error: 506.4348254252462
    Mean Squared Error: 12224571.61817368
    R2 Score: 0.9839632368722134
1.3. Support Vector Machine: 2.04 s
    Mean Absolute Error: 8518.465071851242
    Mean Squared Error: 3061127831.2831564
    R2 Score: -0.22744499257749734
1.4. Random Forest Regression:  111.26 s
    Mean Absolute Error: 377.8431468218442
    Mean Squared Error: 12196956.821772259
    R2 Score: 0.9844701428147238


## 2. Set facility types

We have 3 types of facilities:
- Type 1: Small facility.
- Type 2: Medium facility.
- Type 3: Large facility.

for each facility type we have the following information:
- Fixed cost.
- Variable cost.

The proposal is to use a mathematical model to determine the number of facilities of each type that minimizes the total cost while satisfying the demand, and then to train a classifier to predict the type of facility needed across the whole country.

### Model

#### Sets
- $I$: Set of municipalities.
- $J$: Types of facilities.

#### Parameters
- $c_{j}$: Capacity of facility type $j$.
- $f_{j}$: Fixed cost of facility type $j$.

- $p_{i}$: Population of municipality $i$.
- $N$: Food Demand per capita in Tonnes.
- $d_{i}$: Demand of municipality $i$ = $p_{i} * N$.
- $r_{i}$: department aggregation of population of municipality $i$.
- $\lambda_{i}$: Multiplier of demand of municipality $i$ based on its department population.

#### Decision Variables
- $x_{ij}$: Number of facilities of type $j$ in municipality $i$.

#### Objective Function
- Minimize the total cost of the facilities.

$$ \text{Min} \sum_{i \in I} \sum_{j \in J} f_{j} * x_{ij} $$

#### Constraints

- Demand constraint: The demand of municipality $i$ must be satisfied.

$$ \sum_{j \in J} x_{ij} * c_{j} \geq d_{i} * \lambda_{i} \quad \forall i \in I $$

- (optional) General capacity:

$$ \sum_{i \in I} \sum_{j \in J} x_{ij} * c_{j} \geq \sum_{i \in I} d_{i} * \lambda_{i} $$

- Non-negativity:

$$ x_{ij} \geq 0 \quad \forall i \in I, \forall j \in J $$

- Integer:

$$ x_{ij} \in \mathbb{Z} \quad \forall i \in I, \forall j \in J $$



In [20]:
# Optimization of the model: choose, for every sampled municipality, how many
# facilities of a single type to open so that buffered demand is covered at
# minimum total fixed cost.

# ====Settings==== #
SAMPLE_FRACTION = 0.85  # fraction of the population rows used in the model
SAMPLE_SEED = 5  # seed of that sample, for reproducibility
DEMAND_DAYS = 7  # the per-day food requirement is scaled to this many days
TOP_MUNICIPALITIES = 32  # how many of the most populated get the higher buffer
LAMBDA_DEFAULT = 1.1  # demand multiplier for every other municipality
LAMBDA_TOP = 1.5  # demand multiplier for the most populated municipalities
MAX_UNITS_PER_TYPE = 100  # big-M: upper bound on x[i, j] when type j is chosen
# ====Settings==== #

# ====Parameters==== #
c_j = warehouses["capacity_ton"].values.astype(float)
f_j = warehouses["cost"].values.astype(float)

# NOTE(review): `.values` drops the population index, so rows of the result
# table are identified by sample position, not by the original row label.
p_i = (
    population["2024"]
    .sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
    .values.astype(int)
)
n = food_per_person_per_day * DEMAND_DAYS
d_i = p_i * n

# Lambda = LAMBDA_TOP for the TOP_MUNICIPALITIES most populated municipalities,
# LAMBDA_DEFAULT for the rest
lambda_i = np.ones(len(p_i)) * LAMBDA_DEFAULT
lambda_i[np.argsort(p_i)[-TOP_MUNICIPALITIES:]] = LAMBDA_TOP

# Each municipality uses one facility type with at most MAX_UNITS_PER_TYPE
# units, so the largest buffered demand must fit in that many of the biggest type.
assert (d_i * lambda_i).max() <= MAX_UNITS_PER_TYPE * c_j.max(), (
    "MAX_UNITS_PER_TYPE is too small to cover the largest demand"
)

print(
    f"Parameters: \n    c_j: {c_j} \n    f_j: {f_j} \n    p_i: {p_i} \n    d_i: {d_i} \n    lambda_i: {lambda_i} \n"
)
# ====Parameters==== #

# Decision variables:
#   x[i, j]: integer number of facilities of type j opened in municipality i
#   y[i, j]: 1 if municipality i uses facility type j, 0 otherwise
I = range(len(p_i))
J = range(len(c_j))
model = pulp.LpProblem("UFLP", pulp.LpMinimize)
x = pulp.LpVariable.dicts(
    "x", ((i, j) for i in I for j in J), lowBound=0, cat="Integer"
)
y = pulp.LpVariable.dicts("y", ((i, j) for i in I for j in J), cat="Binary")
# Objective function
model += (
    pulp.lpSum(f_j[j] * x[(i, j)] for i in I for j in J),
    "Total cost of the facilities",
)
# Constraints
for i in I:
    # Installed capacity covers the buffered demand of the municipality.
    model += (
        pulp.lpSum(c_j[j] * x[(i, j)] for j in J) >= d_i[i] * lambda_i[i],
        f"Population demand {i}",
    )
    # Every municipality gets at least one facility.
    model += (
        pulp.lpSum(x[(i, j)] for j in J) >= 1,
        f"Facility assignment {i}",
    )
    # Exactly one facility type is selected per municipality.
    model += (
        pulp.lpSum(y[(i, j)] for j in J) == 1,
        f"Faacility assignment __ {i}",
    )
    for j in J:
        # Facilities of type j may only be opened if that type is selected.
        model += (
            x[(i, j)] <= MAX_UNITS_PER_TYPE * y[(i, j)],
            f"Fsacility assignment _ {i} _ {j}",
        )

# Aggregate capacity covers aggregate buffered demand. This already follows
# from summing the per-municipality demand constraints above.
model += pulp.lpSum(x[(i, j)] * c_j[j] for i in I for j in J) >= pulp.lpSum(
    d_i[i] * lambda_i[i] for i in I
)

# Solve the model
model.solve()
# Results
print(f"Status: {pulp.LpStatus[model.status]}")
print(f"Objective function: {pulp.value(model.objective)}")
df = pd.DataFrame(
    [[pulp.value(x[(i, j)]) for j in J] for i in I],
    columns=[f"Facility {j}" for j in J],
    index=[f"Municipality {i}" for i in I],
)
df["demand"] = d_i  # unbuffered demand (before applying lambda_i)
df["satisfied"] = sum(df[f"Facility {j}"] * c_j[j] for j in J)
df["cost"] = sum(df[f"Facility {j}"] * f_j[j] for j in J)
# Facility counts per type as one tuple; the first len(J) columns are the
# "Facility j" columns. `row` avoids shadowing the decision-variable dict `x`.
df["comb"] = df.apply(lambda row: tuple(row.iloc[: len(J)]), axis=1)

display(df.comb.unique().size, df.comb.unique())
display(df)


Parameters: 
    c_j: [1074. 2418. 9672.] 
    f_j: [7.31159925e+08 1.12921261e+09 4.51685042e+09] 
    p_i: [   7163   30354   11843    5510   31552   11113   32841    1967   15092
    5678    7483   15896   25201  812176   75067   13083   73141    2737
   25904    1233   13291   53609   34847   45530   58522   15576    6293
   57051   68822    5235   40404  585858   56894    7865    7461    7987
    8478    8715   44457    3239   29596   54078    4955   20676  118451
   22912 7929539   10422   25694    4057   10691    8579   17234   19170
    4306    4219    5885   12861   16565   13928  135688   34993   19664
    8699   19748   10165   13266   12177   24586    5404    6405   29016
    7789   38586  104302   14480    8648    6203   20594    2002  123329
   10386   63383    5550  121546    6555    1047    9752    3220  339490
    5422   12500   54575  544132    1875   43557  299348  163306   10592
    2063   35936   29540   15806   23546   77793   14454    5844   13986
    2319   6223

11

array([(1.0, 0.0, 0.0), (0.0, 0.0, 1.0), (0.0, 3.0, 0.0),
       (0.0, 31.0, 0.0), (3.0, 0.0, 0.0), (0.0, 1.0, 0.0),
       (0.0, 6.0, 0.0), (0.0, 9.0, 0.0), (0.0, 2.0, 0.0), (0.0, 5.0, 0.0),
       (0.0, 10.0, 0.0)], dtype=object)

Unnamed: 0,Facility 0,Facility 1,Facility 2,demand,satisfied,cost,comb
Municipality 0,1.0,0.0,0.0,43.932040,1074.0,731159925.0,"(1.0, 0.0, 0.0)"
Municipality 1,1.0,0.0,0.0,186.166849,1074.0,731159925.0,"(1.0, 0.0, 0.0)"
Municipality 2,1.0,0.0,0.0,72.635369,1074.0,731159925.0,"(1.0, 0.0, 0.0)"
Municipality 3,1.0,0.0,0.0,33.793877,1074.0,731159925.0,"(1.0, 0.0, 0.0)"
Municipality 4,1.0,0.0,0.0,193.514411,1074.0,731159925.0,"(1.0, 0.0, 0.0)"
...,...,...,...,...,...,...,...
Municipality 944,1.0,0.0,0.0,84.834284,1074.0,731159925.0,"(1.0, 0.0, 0.0)"
Municipality 945,1.0,0.0,0.0,67.673618,1074.0,731159925.0,"(1.0, 0.0, 0.0)"
Municipality 946,1.0,0.0,0.0,320.054387,1074.0,731159925.0,"(1.0, 0.0, 0.0)"
Municipality 947,1.0,0.0,0.0,132.746764,1074.0,731159925.0,"(1.0, 0.0, 0.0)"


## 3. Set facility capacity

We have m types of facilities with different capacities, the proposal is to use the data generated in the previous step to determine the capacity of each facility type in order to satisfy the demand.
Machine Learning algorithms will be used to Classify the type of facility that will be needed for each municipality.
- Decision Tree.
- Linear Discriminant Analysis.
- Logistic Regression.
- Support Vector Machine.

Deep learning will also be used to create a new model. The best model will be used to determine the capacity of each facility type.
