In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Widen the notebook's main container to 97% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:97% !important; }</style>"))

# Register the jupyternotify magics on the running shell so that the
# `%%notify` cell magic used by the long-running search cells is available.
import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)

import scoring as score # para hacer los reportes de puntajes
from time import time

from sklearn import preprocessing
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RandomizedSearchCV, ShuffleSplit

# Read the dataset (first CSV column becomes the index) and discard every row
# that contains a NaN; the original author saw a stray NaN of unknown origin.
# NOTE(review): `error_bad_lines=False` (skip malformed lines instead of
# raising) is deprecated in recent pandas in favour of `on_bad_lines='skip'`.
properati = (
    pd.read_csv('datos/caba_para_mapa.csv', index_col=0, error_bad_lines=False)
    .dropna()
)

<IPython.core.display.Javascript object>

## Preparo las columnas a usar

In [15]:
cant_buckets = 15

# Split the observed price-per-m2 range into `cant_buckets` equal-width bins.
# `rango` holds the cant_buckets + 1 bin edges, running from the minimum to the
# maximum observed price; `cantidad` holds the number of rows in each bin.
cantidad, rango = np.histogram(properati['price_usd_per_m2'], bins=cant_buckets)

# pd.cut builds right-closed intervals (a, b] by default, so the row whose
# price equals the lowest edge (the minimum price) falls outside every bin and
# comes back as NaN. include_lowest=True closes the first interval on the left
# so that row is bucketed instead of being silently dropped below.
properati['categories_by_price'] = pd.cut(properati['price_usd_per_m2'], rango,
                                          labels=np.arange(cant_buckets),
                                          include_lowest=True)
properati['price_range'] = pd.cut(properati['price_usd_per_m2'], rango,
                                  include_lowest=True)

# Cast the categorical bucket labels to float; the original author notes that
# leaving them as a categorical raised an error further on.
properati['categories_by_price'] = properati['categories_by_price'].astype(np.float64)

# Safety net: with include_lowest=True the cuts above should not introduce
# NaNs, but keep the frame NaN-free for the model-fitting cells regardless.
properati.dropna(inplace=True)


In [16]:
# Custom scorer: precision computed per class and then averaged with equal
# weight ("macro"), wrapped so it can be passed as a `scoring` entry.
from sklearn.metrics import make_scorer, precision_score

precision = make_scorer(score_func=precision_score, average="macro")

### Busco una aproximación de los hiperparámetros con Random Search

In [18]:
%%notify

#preparo set de datos
X = zip(properati['dist_a_subte'],properati['dist_a_univ'])
y = properati['categories_by_price']

perceptron = Perceptron(n_jobs=-1)

scoring={"accuracy":"accuracy", "precision":precision} # defino diccionario para varios scorings

# Utility function to report best scores
alpha=np.arange(0.000001,1,0.000001)
pen =['l2','l1','elasticnet']
param_grid = {"alpha": alpha, "penalty": pen}

custom_cv = ShuffleSplit(n_splits=5, test_size=0.2)

# run randomized search
random_search = RandomizedSearchCV(perceptron, param_distributions=param_grid,
n_iter=500,cv=custom_cv,scoring=scoring,refit=False) #refit=False es para poder usar multiscoring
start = time()
random_search.fit(X, y)

print("RandomizedSearchCV duro %.2f segundos para %d candidatos a hyper-parametros."
    % (time() - start, len(random_search.cv_results_['params'])))
print("")
score.report_multi(random_search.cv_results_,scoring.keys())

RandomizedSearchCV duro 352.20 segundos para 500 candidatos a hyper-parametros.

Puntajes usando precision
Puesto: 1
Promedio training score: 0.014 (std: 0.005)
Promedio validation score: 0.032 (std: 0.035)
Hyper-parametros: {'penalty': 'elasticnet', 'alpha': 0.18126499999999998}

Puesto: 2
Promedio training score: 0.022 (std: 0.011)
Promedio validation score: 0.028 (std: 0.022)
Hyper-parametros: {'penalty': 'l2', 'alpha': 0.057193000000000001}

Puesto: 3
Promedio training score: 0.019 (std: 0.004)
Promedio validation score: 0.023 (std: 0.009)
Hyper-parametros: {'penalty': 'elasticnet', 'alpha': 0.043998000000000002}

Puesto: 4
Promedio training score: 0.014 (std: 0.006)
Promedio validation score: 0.023 (std: 0.017)
Hyper-parametros: {'penalty': 'elasticnet', 'alpha': 0.16363999999999998}

Puesto: 4
Promedio training score: 0.014 (std: 0.006)
Promedio validation score: 0.023 (std: 0.017)
Hyper-parametros: {'penalty': 'l2', 'alpha': 0.163712}

Puesto: 6
Promedio training score: 0.016 (s

<IPython.core.display.Javascript object>

### Busco más detalladamente los hiperparámetros en el rango de los mejores resultados con Grid Search

In [13]:
%%notify

#preparo set de datos
X = zip(properati['dist_a_subte'],properati['dist_a_univ'])
y = properati['categories_by_price']

perceptron = Perceptron(n_jobs=-1)
        
alpha=np.arange(0.2,0.5,0.01)
pen =['l2','elasticnet']
param_grid = {"alpha": alpha, "penalty": pen}

custom_cv = ShuffleSplit(n_splits=5, test_size=0.2)

grid_search = GridSearchCV(perceptron,param_grid=param_grid,cv=custom_cv)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
    % (time() - start, len(grid_search.cv_results_['params'])))
score.report_single(grid_search.cv_results_)

GridSearchCV took 41.03 seconds for 60 candidate parameter settings.
Model with rank: 1
Mean training score: 0.180 (std: 0.067)
Mean validation score: 0.190 (std: 0.069)
Parameters: {'penalty': 'l2', 'alpha': 0.20000000000000001}

Model with rank: 1
Mean training score: 0.180 (std: 0.067)
Mean validation score: 0.190 (std: 0.069)
Parameters: {'penalty': 'elasticnet', 'alpha': 0.20000000000000001}

Model with rank: 3
Mean training score: 0.179 (std: 0.068)
Mean validation score: 0.187 (std: 0.073)
Parameters: {'penalty': 'l2', 'alpha': 0.21000000000000002}

Model with rank: 3
Mean training score: 0.179 (std: 0.068)
Mean validation score: 0.187 (std: 0.073)
Parameters: {'penalty': 'elasticnet', 'alpha': 0.21000000000000002}

Model with rank: 5
Mean training score: 0.177 (std: 0.070)
Mean validation score: 0.185 (std: 0.075)
Parameters: {'penalty': 'l2', 'alpha': 0.22000000000000003}

Model with rank: 5
Mean training score: 0.177 (std: 0.070)
Mean validation score: 0.185 (std: 0.075)
Para

<IPython.core.display.Javascript object>