# Uncapacitated Facility Location Problem

In [1]:
import os
import sys
import time
import pandas as pd
import numpy as np
sys.path.append("..")
from lp.uflp import UFLP
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

## Load the data

In [2]:
# import the data and parameters and check the data

data_path = 'data/'
start_time = time.time()
# ====Parameters==== #
parameters = pd.read_csv(os.path.join(data_path, 'parametros.csv'))
# 1. food_per_person_per_day in tons per day
food_per_person_per_day = float(parameters.loc[
    parameters['parametro'] == 'comida_por_persona_en_toneladas'].values[0][1])
# 2. transport_cost_per_ton_per_km in COP per ton per km
transport_cost_per_ton_per_km = float(
    parameters.loc[parameters['parametro'] == 'costo_de_transporte_por_tonelada_por_kilomentro'].values[0][1])

print(f'parameters: \n    food_per_person_per_day: {food_per_person_per_day} \n    transport_cost_per_ton_per_km: {transport_cost_per_ton_per_km}\n')
# ====Parameters==== #

# ====importData==== #
# 1. population, from data/municipios_procesado.csv
population = pd.read_csv(os.path.join(data_path, 'municipios_procesado.csv'), index_col=3)
# 2. distance, from data/distance_matrix_final.csv
distance = pd.read_csv(os.path.join(data_path, 'distance_matrix_final.csv'), index_col=0)
# 3. warehouses, from data/almacenes.csv
warehouses = pd.read_csv(os.path.join(data_path, 'almacenes.csv'))
# ====importData==== #

# ====DataProcessing===== #
# fill the nan values in population with the minimum '2024' from the departamento of Chocó for the columns 22:
population.loc[population.isna().any(axis=1), population.columns[18:]] = population[population['2024'] == population.loc[population['departamento'] == 'Chocó']['2024'].min()].iloc[0, 18:].values
# drop the municipalities with nan values in the first column of the distance matrix
distance = distance.dropna(subset=[distance.columns[0]], axis=0)
distance = distance.dropna(subset=[distance.index[0]], axis=1)
# select only the rows in population dpmp that the index is in distance
population = population.loc[distance.index]
# turn the columns of distance into integers
distance.columns = distance.columns.astype(int)
print(f'data information: \n    population: {population.shape} \n    distance: {distance.shape} \n    warehouses: \n {warehouses} \n')
# ====DataProcessing===== #

# ====DataInformation==== #
# Memory usage of the data
print(f'population memory usage: {population.memory_usage().sum()/1024**2:.2f} MB')
print(f'distance memory usage: {distance.memory_usage().sum()/1024**2:.2f} MB')
print(f'warehouses memory usage: {warehouses.memory_usage().sum()/1024**2:.2f} MB')
# ====DataInformation==== #

# ====DataChecking==== #
# Test to the data if needed
# ====DataChecking==== #
print(f'Elapsed time: {time.time() - start_time:.2f} s')

parameters: 
    food_per_person_per_day: 0.00087617 
    transport_cost_per_ton_per_km: 3364.0

data information: 
    population: (1117, 71) 
    distance: (1117, 1117) 
    warehouses: 
    Type  capacity_ton        cost
0     1          1074   731159925
1     2          2418  1129212606
2     3          9672  4516850424 

population memory usage: 0.61 MB
distance memory usage: 9.56 MB
warehouses memory usage: 0.00 MB
Elapsed time: 0.29 s


## 1. Demand Forecast

we are using the data from the population dataset that has the colombian census information since 1985 to 2035, the current year is 2024 and the last census was performed in 2018, the data was taken on december 2023 from the DANE web page.

First we need to check the current forecast, then use 4 ML algorithms and Deep Learning to create a new model. The Machine Learning algorithms are:
- Multiple Linear Regression.
- Regression Tree.
- Support Vector Machine.
- Random Forest Regression.

Then, we need the Mean Absolute Error (MAE) to compare the models. The best model will be used to forecast the demand for the next 30 years.

In [3]:
# ## 1. Demand Forecast

# we are using the data from the population dataset that has the colombian census information for the population of 1123 municipalities since 1985 to 2035, the current year is 2024 and the last census was performed in 2018, the data was taken on december 2023 from the DANE web page.
# aditionally we have data of the latitude and longitude of the municipalities.
# First we need to check the current forecast, then use 4 ML algorithms and Deep Learning to create a new model. The Machine Learning algorithms are:
# - Multiple Linear Regression.
# - Regression Tree.
# - Support Vector Machine.
# - Random Forest Regression.

# Then, we need the Mean Absolute Error (MAE) to compare the models. The best model will be used to forecast the demand for the next 30 years.
data = population.copy().iloc[:, 20:]
data = data.transpose()
# train with a random sample of 80% of the data
train = data.sample(frac=0.8, random_state=0)
test = data.drop(train.index)
# 1.1. Multiple Linear Regression
start_time = time.time()
mean_absolute_errors = []
mean_squared_errors = []
r2_scores = []
# for each column in the data
for i in data.columns:
    model = LinearRegression()
    model.fit(np.array(train.index).reshape(-1, 1), train[i])
    predictions = model.predict(np.array(test.index).reshape(-1, 1))
    mean_absolute_errors.append(mean_absolute_error(test[i], predictions))
    mean_squared_errors.append(mean_squared_error(test[i], predictions))
    r2_scores.append(r2_score(test[i], predictions))
print(f"1.1. Multiple Linear Regression: {time.time()-start_time:.2f} s")
print(f'    Mean Absolute Error: {np.mean(mean_absolute_errors)}')
print(f'    Mean Squared Error: {np.mean(mean_squared_errors)}')
print(f'    R2 Score: {np.mean(r2_scores)}')

# 1.2. Regression Tree
start_time = time.time()

mean_absolute_errors = []
mean_squared_errors = []
r2_scores = []
# for each column in the data
for i in data.columns:
    model = DecisionTreeRegressor()
    model.fit(np.array(train.index).reshape(-1, 1), train[i])
    predictions = model.predict(np.array(test.index).reshape(-1, 1))
    mean_absolute_errors.append(mean_absolute_error(test[i], predictions))
    mean_squared_errors.append(mean_squared_error(test[i], predictions))
    r2_scores.append(r2_score(test[i], predictions))
print(f"1.2. Regression Tree: {time.time()-start_time:.2f} s")
print(f'    Mean Absolute Error: {np.mean(mean_absolute_errors)}')
print(f'    Mean Squared Error: {np.mean(mean_squared_errors)}')
print(f'    R2 Score: {np.mean(r2_scores)}')

# 1.3. Support Vector Machine
start_time = time.time()

mean_absolute_errors = []
mean_squared_errors = []
r2_scores = []
# for each column in the data
for i in data.columns:
    model = svm.SVR()
    model.fit(np.array(train.index).reshape(-1, 1), train[i])
    predictions = model.predict(np.array(test.index).reshape(-1, 1))
    mean_absolute_errors.append(mean_absolute_error(test[i], predictions))
    mean_squared_errors.append(mean_squared_error(test[i], predictions))
    r2_scores.append(r2_score(test[i], predictions))
print(f"1.3. Support Vector Machine: {time.time()-start_time:.2f} s")
print(f'    Mean Absolute Error: {np.mean(mean_absolute_errors)}')
print(f'    Mean Squared Error: {np.mean(mean_squared_errors)}')
print(f'    R2 Score: {np.mean(r2_scores)}')

# 1.4. Random Forest Regression
start_time = time.time()

mean_absolute_errors = []
mean_squared_errors = []
r2_scores = []
# for each column in the data
start_time = time.time()

for i in data.columns:
    model = RandomForestRegressor()
    model.fit(np.array(train.index).reshape(-1, 1), train[i])
    predictions = model.predict(np.array(test.index).reshape(-1, 1))
    mean_absolute_errors.append(mean_absolute_error(test[i], predictions))
    mean_squared_errors.append(mean_squared_error(test[i], predictions))
    r2_scores.append(r2_score(test[i], predictions))
print(f"1.4. Random Forest Regression:  {time.time()-start_time:.2f} s")
print(f'    Mean Absolute Error: {np.mean(mean_absolute_errors)}')
print(f'    Mean Squared Error: {np.mean(mean_squared_errors)}')
print(f'    R2 Score: {np.mean(r2_scores)}')

# 1.5 Deep Learning
start_time = time.time()
# for the deep learning model we are going to use a simple neural network with 3 layers using the Keras library
# we are going to use the same data as before
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
mean_absolute_errors = []
mean_squared_errors = []
r2_scores = []
# for each column in the data
for i in data.columns:
    model = Sequential()
    model.add(Dense(12, input_dim=1, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(np.array(train.index, dtype=float), train[i], epochs=150, batch_size=10, verbose=0)
    predictions = model.predict(np.array(test.index, dtype=float))
    mean_absolute_errors.append(mean_absolute_error(test[i], predictions))
    mean_squared_errors.append(mean_squared_error(test[i], predictions))
    r2_scores.append(r2_score(test[i], predictions))
print(f"1.5. Deep Learning:  {time.time()-start_time:.2f} s")
print(f'    Mean Absolute Error: {np.mean(mean_absolute_errors)}')
print(f'    Mean Squared Error: {np.mean(mean_squared_errors)}')
print(f'    R2 Score: {np.mean(r2_scores)}')


1.1. Multiple Linear Regression: 2.31 s
    Mean Absolute Error: 1983.0903511087888
    Mean Squared Error: 271714255.3285868
    R2 Score: 0.6550556575024784
1.2. Regression Tree: 1.91 s
    Mean Absolute Error: 506.4348254252462
    Mean Squared Error: 12224571.61817368
    R2 Score: 0.9839632368722134
1.3. Support Vector Machine: 2.14 s
    Mean Absolute Error: 8518.465071851242
    Mean Squared Error: 3061127831.2831564
    R2 Score: -0.22744499257749734
1.4. Random Forest Regression:  118.79 s
    Mean Absolute Error: 373.8657493285584
    Mean Squared Error: 9905412.831957828
    R2 Score: 0.9844064617241319


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
