In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from utils import *

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import folium

import xgboost as xgb

from sklearn.metrics import root_mean_squared_error

import optuna
from optuna.trial import TrialState

#### Carregando o dataset

In [3]:
dir = 'balanced_data'

In [4]:
df_train = pd.read_csv(f'../data/{dir}/train.csv')
df_val = pd.read_csv(f'../data/{dir}/validation.csv')
df_test = pd.read_csv(f'../data/{dir}/test.csv')

In [5]:
df_train.head()

Unnamed: 0,ponto_id,rssi_1_1,rssi_1_2,rssi_1_3,rssi_2_1,rssi_2_2,rssi_2_3,rssi_3_1,rssi_3_2,rssi_3_3,delay_1,delay_2,delay_3,lat,lon,indoor
0,6659,0.490595,0.592069,0.473746,0.305143,0.748657,0.867459,0.332333,0.426865,0.306476,1.0,0.285714,0.714286,-8.05889,-34.94626,False
1,4799,0.316859,0.0,0.303716,0.0,0.209296,0.239055,0.124152,0.277901,0.0,0.666667,0.428571,0.285714,-8.055506,-34.951693,True
2,7057,0.420978,0.374651,0.412409,0.261084,0.398934,0.492072,0.773747,0.712558,0.294188,0.666667,0.571429,0.142857,-8.05558,-34.95459,False
3,7408,0.472425,0.307408,0.482708,0.224991,0.264643,0.472029,0.816936,0.455887,0.855267,0.666667,0.857143,0.0,-8.05487,-34.9578,False
4,6959,0.534422,0.307408,0.53942,0.25929,0.264643,0.523022,0.389744,0.51542,0.67115,0.833333,0.857143,0.0,-8.05576,-34.9589,False


In [6]:
# Confirmando que não há data leak
assert len(set(df_train['ponto_id']).intersection(set(df_val['ponto_id']))) == 0
assert len(set(df_train['ponto_id']).intersection(set(df_test['ponto_id']))) == 0
assert len(set(df_val['ponto_id']).intersection(set(df_test['ponto_id']))) == 0

In [7]:
X_train = df_train.drop(['ponto_id', 'indoor', 'lat', 'lon'], axis=1).values
y_train = df_train[['lat', 'lon']].values

X_val = df_val.drop(['ponto_id', 'indoor', 'lat', 'lon'], axis=1).values
y_val = df_val[['lat', 'lon']].values

X_test = df_test.drop(['ponto_id', 'indoor', 'lat', 'lon'], axis=1).values
y_test = df_test[['lat', 'lon']].values

#### Criando o modelo base

In [8]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

In [9]:
params = {
    'max_depth': 10,
    'eta': 0.2,  #Learning Rate
    'objective': 'reg:squarederror', # Problema de regressão
    'eval_metric': 'rmse',
}

In [10]:
model = xgb.train(
    params,
    dtrain,
    100,
    evals=[(dval, 'validation')],
    early_stopping_rounds=5
)

[0]	validation-rmse:10.75968
[1]	validation-rmse:8.60798
[2]	validation-rmse:6.88658
[3]	validation-rmse:5.50942
[4]	validation-rmse:4.40766
[5]	validation-rmse:3.52622
[6]	validation-rmse:2.82106
[7]	validation-rmse:2.25691
[8]	validation-rmse:1.80557
[9]	validation-rmse:1.44450
[10]	validation-rmse:1.15563
[11]	validation-rmse:0.92453
[12]	validation-rmse:0.73964
[13]	validation-rmse:0.59173
[14]	validation-rmse:0.47340
[15]	validation-rmse:0.37873
[16]	validation-rmse:0.30299
[17]	validation-rmse:0.24241
[18]	validation-rmse:0.19393
[19]	validation-rmse:0.15516
[20]	validation-rmse:0.12414
[21]	validation-rmse:0.09933
[22]	validation-rmse:0.07948
[23]	validation-rmse:0.06360
[24]	validation-rmse:0.05089
[25]	validation-rmse:0.04072
[26]	validation-rmse:0.03259
[27]	validation-rmse:0.02608
[28]	validation-rmse:0.02087
[29]	validation-rmse:0.01671
[30]	validation-rmse:0.01338
[31]	validation-rmse:0.01071
[32]	validation-rmse:0.00858
[33]	validation-rmse:0.00687
[34]	validation-rmse:0.

#### Avaliando o modelo

In [11]:
y_pred = model.predict(dtest)
rmse = np.sqrt(root_mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

RMSE: 0.01406487543374029


In [12]:
accuracy = calculate_accuracy(y_pred, y_test)
print(f"Acurácia: {accuracy:.3f}")

Acurácia: 0.639


In [13]:
map = plot_folium_map(y_test, y_pred, connect_point=True)
map

#### Avaliando o modelos para pontos específicos 

In [14]:
# Prevendo o valor de uma instância específica
idx = 10

X_unit = X_test[idx].reshape(1, -1)
y_unit = np.array([y_test[idx]])
dunit = xgb.DMatrix(X_unit, label=y_unit)
y_pred_unit = model.predict(dunit)

print(f"Pred Point: {y_pred_unit[0]}")
print(f"Real Point: {y_unit[0]}")

Pred Point: [ -8.056172 -34.95622 ]
Real Point: [ -8.0563 -34.9565]


In [15]:
map = plot_folium_map(y_unit, y_pred_unit)
map

### Optuna