In [1]:
from utils import *

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import folium

import xgboost as xgb

from sklearn.metrics import root_mean_squared_error

#### Carregando o dataset

In [2]:
df_train = pd.read_csv('../data/treated_data/train.csv')
df_val = pd.read_csv('../data/treated_data/validation.csv')
df_test = pd.read_csv('../data/treated_data/test.csv')

In [3]:
df_train.head()

Unnamed: 0,ponto_id,rssi_1_1,rssi_1_2,rssi_1_3,rssi_2_1,rssi_2_2,rssi_2_3,rssi_3_1,rssi_3_2,rssi_3_3,delay_1,delay_2,delay_3,lat,lon,indoor
0,8790,0.5592,0.398652,0.703014,0.267376,0.426916,0.454187,0.31728,0.311331,0.301273,0.333333,0.571429,0.428571,-8.05009,-34.95312,0
1,3227,0.2467,0.0,0.332592,0.0,0.360645,0.342995,0.16919,0.339785,0.0,0.666667,0.428571,0.428571,-8.055834,-34.951362,1
2,4180,0.1842,0.0,0.168521,0.14599,0.318269,0.288125,0.175386,0.277219,0.0,0.666667,0.428571,0.285714,-8.055424,-34.951597,1
3,8922,0.707155,0.446022,0.718534,0.490706,0.361142,0.451422,0.326812,0.3313,0.299672,0.166667,0.571429,0.428571,-8.04953,-34.95329,0
4,1117,0.12194,0.157226,0.161643,0.088497,0.252093,0.25632,0.087576,0.252854,0.0,0.666667,0.428571,0.285714,-8.05538,-34.951736,1


In [4]:
# Confirmando que não há data leak
assert len(set(df_train['ponto_id']).intersection(set(df_val['ponto_id']))) == 0
assert len(set(df_train['ponto_id']).intersection(set(df_test['ponto_id']))) == 0
assert len(set(df_val['ponto_id']).intersection(set(df_test['ponto_id']))) == 0

In [5]:
X_train = df_train.drop(['ponto_id', 'indoor', 'lat', 'lon'], axis=1).values
y_train = df_train[['lat', 'lon']].values

X_val = df_val.drop(['ponto_id', 'indoor', 'lat', 'lon'], axis=1).values
y_val = df_val[['lat', 'lon']].values

X_test = df_test.drop(['ponto_id', 'indoor', 'lat', 'lon'], axis=1).values
y_test = df_test[['lat', 'lon']].values

#### Criando o modelo base

In [6]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

In [7]:
params = {
    'max_depth': 5,
    'eta': 0.03,  #Learning Rate
    'objective': 'reg:squarederror', # Problema de regressão
}

In [8]:
model = xgb.train(
    params,
    dtrain,
    500,
    evals=[(dval, 'validation')],
    early_stopping_rounds=5
)

[0]	validation-rmse:13.04568
[1]	validation-rmse:12.65440
[2]	validation-rmse:12.27485
[3]	validation-rmse:11.90669
[4]	validation-rmse:11.54956
[5]	validation-rmse:11.20315
[6]	validation-rmse:10.86713
[7]	validation-rmse:10.54119
[8]	validation-rmse:10.22502
[9]	validation-rmse:9.91834
[10]	validation-rmse:9.62085
[11]	validation-rmse:9.33229
[12]	validation-rmse:9.05238
[13]	validation-rmse:8.78087
[14]	validation-rmse:8.51750
[15]	validation-rmse:8.26204
[16]	validation-rmse:8.01423
[17]	validation-rmse:7.77386
[18]	validation-rmse:7.54069
[19]	validation-rmse:7.31452
[20]	validation-rmse:7.09513
[21]	validation-rmse:6.88233
[22]	validation-rmse:6.67590
[23]	validation-rmse:6.47567
[24]	validation-rmse:6.28144
[25]	validation-rmse:6.09304
[26]	validation-rmse:5.91029
[27]	validation-rmse:5.73302
[28]	validation-rmse:5.56107
[29]	validation-rmse:5.39427
[30]	validation-rmse:5.23248
[31]	validation-rmse:5.07554
[32]	validation-rmse:4.92331
[33]	validation-rmse:4.77564
[34]	validation

#### Avaliando o modelo

In [9]:
y_pred = model.predict(dtest)
rmse = np.sqrt(root_mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

RMSE: 0.015107480986669388


In [10]:
map = plot_folium_map(y_test, y_pred)
map

#### Avaliando o modelos para pontos específicos 

In [11]:
# Prevendo o valor de uma instância específica
idx = 10

X_unit = X_test[idx].reshape(1, -1)
y_unit = np.array([y_test[idx]])
dunit = xgb.DMatrix(X_unit, label=y_unit)
y_pred_unit = model.predict(dunit)

print(f"Pred Point: {y_pred_unit[0]}")
print(f"Real Point: {y_unit[0]}")

Pred Point: [ -8.055453 -34.951557]
Real Point: [ -8.055424 -34.951597]


In [12]:
map = plot_folium_map(y_unit, y_pred_unit)
map