# Importe De Librerías

In [97]:
import pandas as pd
import numpy as np
import xgboost as xgb
import sklearn

from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, KFold,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics

# Regresión

### Lectura de archivos

In [89]:
ds_regresion = pd.read_csv("properati_argentina_reducido_train.csv")
test = pd.read_csv("properati_argentina_reducido_test.csv")

Unnamed: 0.1,Unnamed: 0,latitud,longitud,property_rooms,property_surface_total,property_price
0,442006,-34.575825,-58.434324,3.0,84.0,140000.0
1,60322,-34.569851,-58.478504,3.0,75.0,170000.0
2,341795,-34.592793,-58.384974,4.0,220.0,750000.0
3,99195,-34.630323,-58.486175,3.0,74.0,130000.0
4,312329,-34.585164,-58.421009,1.0,20.0,55000.0
...,...,...,...,...,...,...
13924,273859,-34.611250,-58.360528,2.0,82.0,450000.0
13925,333920,-34.585164,-58.421009,1.0,31.0,67960.0
13926,123742,-34.598707,-58.442244,5.0,90.0,180000.0
13927,133436,-34.572635,-58.445869,4.0,94.0,250000.0


## KNN

### Preparo el dataset


In [90]:
variables_a_eliminar = ["Unnamed: 0"]

ds_knn = ds_regresion.copy()
ds_test_knn = test.copy()

ds_knn.drop(columns=variables_a_eliminar, inplace=True)
ds_test_knn.drop(columns = variables_a_eliminar, inplace=True)

#### Min-Max

In [91]:
scaler = MinMaxScaler()
ds_knn["property_surface_total"] = scaler.fit_transform(ds_knn["property_surface_total"].to_frame())
ds_knn["property_price"] = scaler.fit_transform(ds_knn["property_price"].to_frame())
ds_knn["property_rooms"] = scaler.fit_transform(ds_knn["property_rooms"].to_frame())
ds_knn

Unnamed: 0,latitud,longitud,property_rooms,property_surface_total,property_price
0,-34.559318,-58.445475,0.051282,0.000337,0.017585
1,-34.610368,-58.441225,0.051282,0.000344,0.015664
2,-34.616446,-58.379481,0.000000,0.000181,0.006357
3,-34.619406,-58.451999,0.000000,0.000169,0.007580
4,-34.605707,-58.415998,0.025641,0.000163,0.006367
...,...,...,...,...,...
55711,-34.592190,-58.404974,0.051282,0.000350,0.019202
55712,-34.584945,-58.399825,0.000000,0.000169,0.012431
55713,-34.611670,-58.430043,0.025641,0.000227,0.005760
55714,-34.607711,-58.390811,0.025641,0.000145,0.006468


### Modelo

#### Optimizacion de parametros con Random search

In [93]:
x_train = ds_knn.drop(columns=["property_price"])
y_train = ds_knn["property_price"]

In [94]:
#Grilla de Parámetros
params_grid={ 'n_neighbors':range(1,30), 
              'weights':['distance','uniform'],
              'algorithm':['ball_tree', 'kd_tree', 'brute'],
              'metric':['euclidean','manhattan','chebyshev']
             }

#Metrica que quiero optimizar MSE
scorer_fn = make_scorer(sklearn.metrics.mean_squared_error)

#Clasificador KNN
knn=KNeighborsRegressor()

#Random Search con 10 Folds y 10 iteraciones
rand = RandomizedSearchCV(knn, params_grid, cv=10, scoring=scorer_fn, n_iter=10, random_state=5)

rand.fit(x_train, y_train)

In [95]:
#Mejores hiperparametros
print(rand.best_params_)

#Mejor métrica
mse = rand.best_score_
print("RMSE en datos de entrnamiento: " + str(np.sqrt(mse)))


{'weights': 'uniform', 'n_neighbors': 24, 'metric': 'chebyshev', 'algorithm': 'ball_tree'}
RMSE en datos de entrnamiento: 0.010006008557663275


In [96]:
#Mejor estimador
best_knn=rand.best_estimator_

x_test_knn = ds_test_knn.drop(columns=["property_price"])
y_test_knn = ds_test_knn["property_price"]

y_pred_knn = best_knn.predict(x_test_knn)

In [98]:
#Metricas para evaluar modelos
from sklearn import metrics

#Mean Square Error
mse = metrics.mean_squared_error(
        y_true  = y_test_knn,
        y_pred  = y_pred_knn,
        squared = True
       )

print(f"El error (mse) de test es: {mse}")

#Root Mean Square Error
rmse = metrics.mean_squared_error(
        y_true  = y_test_knn,
        y_pred  = y_pred_knn,
        squared = False
       )

print(f"El error (rmse) de test es: {rmse}")

El error (mse) de test es: 72613784358.15599
El error (rmse) de test es: 269469.4497677909


## XGBoost

In [None]:
ds_prop_XGBoost_train = ds_regresion.copy()
ds_prop_XGBoost_test = test.copy()

ds_prop_XGBoost_train = ds_prop_XGBoost_train.dropna()
ds_prop_XGBoost_test = ds_prop_XGBoost_test.dropna()

Realizamos la ingeniería de caracteristicas para cada dataset

In [None]:
#Eliminamos las features sin importancia

variables_eliminadas = ["start_date", "end_date", "created_on", "property_title", "place_l2", "property_currency", "operation", "geometry", "cluster"]
ds_prop_XGBoost_train = ds_prop_XGBoost_train.drop(variables_eliminadas, axis='columns', inplace=False)

variables_eliminadas = ["start_date", "end_date", "created_on", "property_title", "place_l2", "property_currency", "operation"]
ds_prop_XGBoost_test = ds_prop_XGBoost_test.drop(variables_eliminadas, axis='columns', inplace=False)


In [None]:
#One Hot Encoding para variables categoricas

variables_reemplazadas = ["property_type", "place_l3"]
ds_prop_XGBoost_train = pd.get_dummies(ds_prop_XGBoost_train, columns=variables_reemplazadas, drop_first=True)
ds_prop_XGBoost_test = pd.get_dummies(ds_prop_XGBoost_test, columns=variables_reemplazadas, drop_first=True)

#Escalamos las variables para que no tengan mayor peso
scaler = MinMaxScaler()

ds_prop_XGBoost_train["property_surface_total"] = scaler.fit_transform(ds_prop_XGBoost_train["property_surface_total"].to_frame())
ds_prop_XGBoost_train["longitud"] = scaler.fit_transform(ds_prop_XGBoost_train["longitud"].to_frame())
ds_prop_XGBoost_train["latitud"] = scaler.fit_transform(ds_prop_XGBoost_train["latitud"].to_frame())

ds_prop_XGBoost_test["property_surface_total"] = scaler.fit_transform(ds_prop_XGBoost_test["property_surface_total"].to_frame())
ds_prop_XGBoost_test["longitud"] = scaler.fit_transform(ds_prop_XGBoost_test["longitud"].to_frame())
ds_prop_XGBoost_test["latitud"] = scaler.fit_transform(ds_prop_XGBoost_test["latitud"].to_frame())




In [None]:
#Hacemos division Train-Test
features = ['property_rooms','property_bedrooms','property_surface_total','property_surface_covered','latitud','longitud', 'property_type_PH', 'property_type_Departamento']
target = ["property_price"]

x_train = ds_prop_XGBoost_train[features]
x_test = ds_prop_XGBoost_test[features]

y_train = ds_prop_XGBoost_train[target]
y_test = ds_prop_XGBoost_test[target]

Buscamos los mejores hiperparametros usando Random CV. Usamos Random CV para hallar los hiperparametros que optimizen el MSE, ya que es la medida que nos interesa a al hora de hacer regresion. Queremos calcular el error medio de las predicciones

In [None]:
##KFOLD CV Random Search para buscar el mejor arbol (los mejores atributos, hiperparametros,etc)

#Cantidad de combinaciones que quiero porbar
n=5

#Conjunto de parámetros que quiero usar
params_grid = {'criterion':['gini','entropy'],
               #'min_samples_leaf':list(range(1,10)),
               #'min_samples_split': list(range(2,20)),
               'ccp_alpha':np.linspace(0,0.05,n), 
               'max_depth':list(range(0, 15)),
               'random_state':list(range(0,6))}
                
#Cantidad de splits para el Cross Validation
folds=20

#Kfold estratificado
#kfoldcv = StratifiedKFold(n_splits=folds)

#Regresor
xgb_model_rd_search = xgb.XGBRegressor()

#Metrica que quiero optimizar MSE
scorer_fn = make_scorer(sklearn.metrics.mean_squared_error)

#Random Search Cross Validation
randomcv = RandomizedSearchCV(estimator=xgb_model_rd_search,
                              param_distributions = params_grid,
                              scoring=scorer_fn,
                              n_iter=n, cv=folds) 

#GridSearch CV
#gridcv = GridSearchCV(estimator=xgb_model_rd_search,
#                      param_grid=params_grid,
#                      scoring=scorer_fn,
#                      return_train_score='True') 

#Busco los hiperparamtros que optimizan F1 Score
randomcv.fit(x_train,y_train)

Parameters: { "ccp_alpha", "criterion" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "ccp_alpha", "criterion" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "ccp_alpha", "criterion" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: {

20 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
19 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Carreño\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Carreño\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\xgboost\core.py", line 575, in inner_f
    return f(**kwargs)
  File "C:\Users\Carreño\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\l

Parameters: { "ccp_alpha", "criterion" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [None]:
#Mejores hiperparametros
print(randomcv.best_params_)

#Mejor métrica
mse = randomcv.best_score_
print("RMSE en datos de entrnamiento: " + str(np.sqrt(mse)))

{'random_state': 0, 'max_depth': 3, 'criterion': 'entropy', 'ccp_alpha': 0.0}
RMSE en datos de entrnamiento: 101528.17304703005


Entrenamos el modelo

In [None]:
xgb_model = xgb.XGBRegressor().set_params(**randomcv.best_params_)
xgb_model.fit(x_train, y_train)

Parameters: { "ccp_alpha", "criterion" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Medimos que tan precisas fueron las predicciones usando RMSE. La ventaja de usar RMSE en vez de MSE es que el valor de RMSE está en las mismas unidades que la variable target. De esta forma es más fácil dimensionar que tan preciso es el modelo creado

In [None]:
from sklearn.metrics import mean_squared_error


y_pred = xgb_model.predict(x_test)
#MSE 121489.55

rmse = mean_squared_error(y_test, y_pred, squared=False)
#print("MSE: %.2f" % (mse**(1/2.0)))
print("RMSE en datos de test: " + str(rmse))

RMSE en datos de test: 184088.99927721854


Vemos que la performance de entrenamiento y la del conjunto de evaluación es similar. Esto significa que no hay overfitting, es decir, el modelo no se aprende de 'memoria' los datos de entrenamiento y no pierde precision al realizar las predicciones en el conjunto de evaluación. 

In [None]:
#Atributos considerados y su importancia

features = ["latitud", "longitud", "property_rooms", "propperty_bedrooms", "property_surface_total", "property_surface_covered", "property_type_PH", "property_type_Departamento"]
feat_imps = xgb_model.feature_importances_

for feat_imp,feat in sorted(zip(feat_imps,features)):
  if feat_imp>0:
    print('{}: {}'.format(feat,feat_imp))


latitud: 0.014410820789635181
longitud: 0.014815220609307289
property_surface_covered: 0.038302745670080185
property_surface_total: 0.055447451770305634
property_rooms: 0.08408662676811218
property_type_PH: 0.08468924462795258
propperty_bedrooms: 0.33298277854919434
property_type_Departamento: 0.37526506185531616
