In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:97% !important; }</style>"))

import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)

import scoring as score # para hacer los reportes de puntajes
from time import time

from sklearn import preprocessing
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RandomizedSearchCV, ShuffleSplit, train_test_split

# Load the CABA listings, skipping malformed rows (a warning is emitted for each one).
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='warn'` is its replacement
# with the same skip-and-warn behaviour. Older pandas rejects the new keyword with a
# TypeError, in which case we fall back to the legacy spelling.
try:
    properati = pd.read_csv('datos/caba_para_mapa.csv', on_bad_lines='warn')
except TypeError:
    properati = pd.read_csv('datos/caba_para_mapa.csv', error_bad_lines=False)

<IPython.core.display.Javascript object>

## Preparo las columnas a usar

In [2]:
cant_buckets = 500

# Split the observed price-per-m2 range into `cant_buckets` equal-width buckets.
# np.histogram returns the per-bucket counts and the cant_buckets + 1 bucket edges.
cantidad, rango = np.histogram(properati['price_usd_per_m2'], bins=cant_buckets)

# pd.cut builds right-closed intervals (a, b] by default, so the row holding the
# minimum price (== rango[0]) fell outside the first bucket and was labelled NaN.
# include_lowest=True closes the first interval on the left so that row is kept.
properati['categories_by_price'] = pd.cut(properati['price_usd_per_m2'], rango,
                                          labels=np.arange(cant_buckets),
                                          include_lowest=True)
properati['price_range'] = pd.cut(properati['price_usd_per_m2'], rango,
                                  include_lowest=True)
# Cast the categorical bucket index to float (the original author notes it errors otherwise).
properati['categories_by_price'] = properati['categories_by_price'].astype(np.float64)

# Drop every row that still has a missing value in any column.
properati.dropna(inplace=True)


### Busco una aproximación de hiperparámetros con random search

In [None]:
%%notify

#preparo set de datos
X = zip(properati['dist_a_subte'],properati['dist_a_univ'],properati['dist_a_tren'],properati['dist_a_villa'],\
        properati['dist_a_zona_anegada'],properati['surface_total_in_m2'],\
        properati['surface_covered_in_m2'],properati['lat'],properati['lon'])
y = properati['categories_by_price']

from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = StandardScaler()
scaler2= MinMaxScaler()

X=scaler.fit_transform(X,y)
X=scaler2.fit_transform(X,y)

perceptron = Perceptron(n_jobs=-1)

scoring={"accuracy":"accuracy"} # defino diccionario para varios scorings

# Utility function to report best scores
alpha=np.arange(0.000001,1,0.000001)
pen =['l2','l1','elasticnet']
param_grid = {"alpha": alpha, "penalty": pen}

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)

# run randomized search
random_search = RandomizedSearchCV(perceptron, param_distributions=param_grid,
n_iter=100,cv=5) #refit=False es para poder usar multiscoring
start = time()
random_search.fit(X_train, y_train)

print("RandomizedSearchCV duro %.2f segundos para %d candidatos a hiper-parametros."
    % (time() - start, len(random_search.cv_results_['params'])))
print("")
score.report_multi(random_search.cv_results_,scoring.keys())



### Busco más detalladamente los hiperparámetros en el rango de los mejores resultados con Grid Search

In [None]:
%%notify

#preparo set de datos
X = zip(properati['dist_a_subte'],properati['dist_a_univ'])
y = properati['categories_by_price']

perceptron = Perceptron(n_jobs=-1)
        
alpha=np.arange(0.2,0.5,0.01)
pen =['l2','elasticnet']
param_grid = {"alpha": alpha, "penalty": pen}

custom_cv = ShuffleSplit(n_splits=5, test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)

grid_search = GridSearchCV(perceptron,param_grid=param_grid,cv=custom_cv)
start = time()
grid_search.fit(X_train, y_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
    % (time() - start, len(grid_search.cv_results_['params'])))
score.report_single(grid_search.cv_results_)

In [None]:
# Best estimator found by the grid search (despite the `rf` suffix, the searched
# estimator is a Perceptron). Printed with the function-call form, which is valid
# on both Python 2 and Python 3; the bare `print x` statement is a SyntaxError on 3.
mejor_rf = grid_search.best_estimator_
print(mejor_rf)

In [None]:
# Signed error per held-out sample, in price buckets: predicted bucket - actual bucket.
errores = mejor_rf.predict(X_test) - y_test

# Take the absolute value once and reduce with the vectorised pandas methods
# instead of iterating the Series element by element with the built-in max/min.
errores_abs = errores.abs()
print("Error maximo:{0}\nError minimo:{1}".format(errores_abs.max(), errores_abs.min()))
print(errores)

In [None]:
# Collect the absolute errors that exceed the threshold, then count them.
max_error = 100
lista = [abs(error) for error in errores if abs(error) > max_error]
count_max = len(lista)
count_max

In [None]:
# Histogram of the prediction errors, 100 bins, with log-scaled counts.
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(errores, 100, facecolor='blue')
ax.set_xlabel('Errores')
ax.set_ylabel('Cantidad')
# ax.set_xlim(-1000, 1000)  # uncomment to change the zoom around 0
ax.set_yscale('log')
plt.show()