# Importe De Librerías

In [13]:
import pandas as pd
import numpy as np
import xgboost as xgb
import sklearn

from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, KFold,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics

# Regresión

### Lectura de archivos

In [45]:
ds_regresion = pd.read_csv("properati_argentina_reducido_train.csv")
test = pd.read_csv("properati_argentina_reducido_test.csv")

variables_a_eliminar = ["Unnamed: 0"]

ds_regresion.drop(columns=variables_a_eliminar, inplace=True)
test.drop(columns = variables_a_eliminar, inplace=True)

## KNN

### Preparo el dataset


In [46]:
ds_train_knn = ds_regresion.copy()
ds_test_knn = test.copy()

#### Min-Max

In [47]:
scaler = MinMaxScaler()
ds_train_knn["property_surface_total"] = scaler.fit_transform(ds_train_knn["property_surface_total"].to_frame())
ds_train_knn["property_rooms"] = scaler.fit_transform(ds_train_knn["property_rooms"].to_frame())
ds_train_knn["longitud"] = scaler.fit_transform(ds_train_knn["longitud"].to_frame())
ds_train_knn["latitud"] = scaler.fit_transform(ds_train_knn["latitud"].to_frame())


ds_test_knn["property_surface_total"] = scaler.fit_transform(ds_test_knn["property_surface_total"].to_frame())
ds_test_knn["property_rooms"] = scaler.fit_transform(ds_test_knn["property_rooms"].to_frame())
ds_test_knn["longitud"] = scaler.fit_transform(ds_test_knn["longitud"].to_frame())
ds_test_knn["latitud"] = scaler.fit_transform(ds_test_knn["latitud"].to_frame())

#### One Hot Encoding

In [48]:
ds_train_knn = pd.get_dummies(ds_train_knn, columns=["property_type"], drop_first=True)
ds_test_knn = pd.get_dummies(ds_test_knn, columns=["property_type"], drop_first=True)

In [49]:
ds_train_knn.head()

Unnamed: 0,latitud,longitud,property_rooms,property_surface_total,property_price,property_type_Departamento,property_type_PH
0,0.856747,0.455186,0.051282,0.000337,179000.0,1,0
1,0.543607,0.477971,0.051282,0.000344,160000.0,1,0
2,0.506321,0.808948,0.0,0.000181,67900.0,1,0
3,0.488165,0.420215,0.0,0.000169,80000.0,1,0
4,0.572198,0.613199,0.025641,0.000163,68000.0,1,0


In [50]:
ds_test_knn.head()

Unnamed: 0,latitud,longitud,property_rooms,property_surface_total,property_price,property_type_Departamento,property_type_PH
0,0.748053,0.510308,0.333333,0.00054,140000.0,1,0
1,0.785849,0.272786,0.333333,0.000474,170000.0,1,0
2,0.640697,0.775623,0.5,0.001533,750000.0,1,0
3,0.403258,0.231545,0.333333,0.000467,130000.0,1,0
4,0.688966,0.58189,0.0,7.3e-05,55000.0,1,0


### Modelo

#### Optimizacion de parametros con Random search

In [51]:
x_train = ds_train_knn.drop(columns=["property_price"])
y_train = ds_train_knn["property_price"]

In [52]:
#Grilla de Parámetros
params_grid={ 'n_neighbors':range(1,30), 
              'weights':['distance','uniform'],
              'algorithm':['ball_tree', 'kd_tree', 'brute'],
              'metric':['euclidean','manhattan','chebyshev']
             }

#Metrica que quiero optimizar MSE
scorer_fn = make_scorer(sklearn.metrics.mean_squared_error)

#Clasificador KNN
knn=KNeighborsRegressor()

#Random Search con 10 Folds y 10 iteraciones
rand = RandomizedSearchCV(knn, params_grid, cv=10, scoring=scorer_fn, n_iter=10, random_state=5)

rand.fit(x_train, y_train)

In [53]:
#Mejores hiperparametros
print(rand.best_params_)

#Mejor métrica
mse = rand.best_score_
print("RMSE en datos de entrnamiento: " + str(np.sqrt(mse)))


{'weights': 'uniform', 'n_neighbors': 24, 'metric': 'chebyshev', 'algorithm': 'ball_tree'}
RMSE en datos de entrnamiento: 96879.1490542756


In [54]:
#Mejor estimador
best_knn=rand.best_estimator_

x_test_knn = ds_test_knn.drop(columns=["property_price"])
y_test_knn = ds_test_knn["property_price"]

y_pred_knn = best_knn.predict(x_test_knn)

In [55]:
#Metricas para evaluar modelos
from sklearn import metrics

#Mean Square Error
mse = metrics.mean_squared_error(
        y_true  = y_test_knn,
        y_pred  = y_pred_knn,
        squared = True
       )

print(f"El error (mse) de test es: {mse}")

#Root Mean Square Error
rmse = metrics.mean_squared_error(
        y_true  = y_test_knn,
        y_pred  = y_pred_knn,
        squared = False
       )

print(f"El error (rmse) de test es: {rmse}")

El error (mse) de test es: 91218624242.87418
El error (rmse) de test es: 302024.21135212685


## XGBoost

In [56]:
ds_prop_XGBoost_train = ds_regresion.copy()
ds_prop_XGBoost_test = test.copy()

ds_prop_XGBoost_train = ds_prop_XGBoost_train.dropna()
ds_prop_XGBoost_test = ds_prop_XGBoost_test.dropna()

Realizamos la ingeniería de caracteristicas para cada dataset

In [57]:
#One Hot Encoding para variables categoricas

variables_reemplazadas = ["property_type"]
ds_prop_XGBoost_train = pd.get_dummies(ds_prop_XGBoost_train, columns=variables_reemplazadas, drop_first=True)
ds_prop_XGBoost_test = pd.get_dummies(ds_prop_XGBoost_test, columns=variables_reemplazadas, drop_first=True)

#Escalamos las variables para que no tengan mayor peso
scaler = MinMaxScaler()

ds_prop_XGBoost_train["property_surface_total"] = scaler.fit_transform(ds_prop_XGBoost_train["property_surface_total"].to_frame())
ds_prop_XGBoost_train["longitud"] = scaler.fit_transform(ds_prop_XGBoost_train["longitud"].to_frame())
ds_prop_XGBoost_train["latitud"] = scaler.fit_transform(ds_prop_XGBoost_train["latitud"].to_frame())

ds_prop_XGBoost_test["property_surface_total"] = scaler.fit_transform(ds_prop_XGBoost_test["property_surface_total"].to_frame())
ds_prop_XGBoost_test["longitud"] = scaler.fit_transform(ds_prop_XGBoost_test["longitud"].to_frame())
ds_prop_XGBoost_test["latitud"] = scaler.fit_transform(ds_prop_XGBoost_test["latitud"].to_frame())




In [58]:
#Hacemos division Train-Test
features = ['property_rooms','property_surface_total','latitud','longitud', 'property_type_PH', 'property_type_Departamento']
target = ["property_price"]

x_train = ds_prop_XGBoost_train[features]
x_test = ds_prop_XGBoost_test[features]

y_train = ds_prop_XGBoost_train[target]
y_test = ds_prop_XGBoost_test[target]

Buscamos los mejores hiperparametros usando Random CV. Usamos Random CV para hallar los hiperparametros que optimizen el MSE, ya que es la medida que nos interesa a al hora de hacer regresion. Queremos calcular el error medio de las predicciones

In [59]:
##KFOLD CV Random Search para buscar el mejor arbol (los mejores atributos, hiperparametros,etc)

#Cantidad de combinaciones que quiero porbar
n=5

#Conjunto de parámetros que quiero usar
params_grid = {'criterion':['gini','entropy'],
               #'min_samples_leaf':list(range(1,10)),
               #'min_samples_split': list(range(2,20)),
               'ccp_alpha':np.linspace(0,0.05,n), 
               'max_depth':list(range(0, 15)),
               'random_state':list(range(0,6))}
                
#Cantidad de splits para el Cross Validation
folds=20

#Kfold estratificado
#kfoldcv = StratifiedKFold(n_splits=folds)

#Regresor
xgb_model_rd_search = xgb.XGBRegressor()

#Metrica que quiero optimizar MSE
scorer_fn = make_scorer(sklearn.metrics.mean_squared_error)

#Random Search Cross Validation
randomcv = RandomizedSearchCV(estimator=xgb_model_rd_search,
                              param_distributions = params_grid,
                              scoring=scorer_fn,
                              n_iter=n, cv=folds) 


#Busco los hiperparamtros que optimizan F1 Score
randomcv.fit(x_train,y_train)

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not used.

Parameters: { "ccp_alpha", "criterion" } are not

In [60]:
#Mejores hiperparametros
print(randomcv.best_params_)

#Mejor métrica
mse = randomcv.best_score_
print("RMSE en datos de entrnamiento: " + str(np.sqrt(mse)))

{'random_state': 4, 'max_depth': 2, 'criterion': 'entropy', 'ccp_alpha': 0.025}
RMSE en datos de entrnamiento: 88976.8209065025


Entrenamos el modelo

In [61]:
xgb_model = xgb.XGBRegressor().set_params(**randomcv.best_params_)
xgb_model.fit(x_train, y_train)

Parameters: { "ccp_alpha", "criterion" } are not used.



Medimos que tan precisas fueron las predicciones usando RMSE. La ventaja de usar RMSE en vez de MSE es que el valor de RMSE está en las mismas unidades que la variable target. De esta forma es más fácil dimensionar que tan preciso es el modelo creado

In [62]:
from sklearn.metrics import mean_squared_error


y_pred = xgb_model.predict(x_test)
#MSE 121489.55

rmse = mean_squared_error(y_test, y_pred, squared=False)
#print("MSE: %.2f" % (mse**(1/2.0)))
print("RMSE en datos de test: " + str(rmse))

RMSE en datos de test: 160612.63188743714


Vemos que la performance de entrenamiento y la del conjunto de evaluación es similar. Esto significa que no hay overfitting, es decir, el modelo no se aprende de 'memoria' los datos de entrenamiento y no pierde precision al realizar las predicciones en el conjunto de evaluación. 

In [63]:
#Atributos considerados y su importancia

features = ["latitud", "longitud", "property_rooms", "propperty_bedrooms", "property_surface_total", "property_surface_covered", "property_type_PH", "property_type_Departamento"]
feat_imps = xgb_model.feature_importances_

for feat_imp,feat in sorted(zip(feat_imps,features)):
  if feat_imp>0:
    print('{}: {}'.format(feat,feat_imp))


latitud: 0.02070087194442749
propperty_bedrooms: 0.031587474048137665
property_rooms: 0.04968720301985741
property_surface_covered: 0.17740604281425476
longitud: 0.19059129059314728
property_surface_total: 0.5300271511077881
