In [123]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

data = pd.read_csv("setTesteo.csv", low_memory = False)
data = data.drop('Unnamed: 0', axis = 1)
data['created_on'] = pd.to_datetime(data['created_on'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14166 entries, 0 to 14165
Data columns (total 20 columns):
created_on                 14166 non-null datetime64[ns]
property_type              14166 non-null object
lat                        14166 non-null float64
lon                        14166 non-null float64
barrio                     14166 non-null object
surface_total_in_m2        14166 non-null float64
surface_covered_in_m2      14166 non-null float64
zona                       14166 non-null object
pileta                     14166 non-null int64
cochera                    14166 non-null int64
aire acondicionado         14166 non-null int64
seguridad                  14166 non-null int64
patio                      14166 non-null int64
amenities                  14166 non-null int64
distanciaCentro            14166 non-null float64
distanciaAIndoamericano    14166 non-null float64
distanciaAFuerteApache     14166 non-null float64
distanciaACarlosGardel     14166 non-null float64

In [95]:
def fechaNumerica(x):
    return 10000*x.year + 100*x.month + x.day

In [96]:
dicc_tipos_prop = {"apartment" : 1, "house" : 2, "PH" : 3, "store" : 4}
dicc_tipos_zona = {"Bs.As. G.B.A. Zona Norte": 1, "Bs.As. G.B.A. Zona Oeste": 2, "Bs.As. G.B.A. Zona Sur": 3, "Capital Federal": 4 }
data.loc[:, 'property_type'] = data.property_type.apply(lambda x: dicc_tipos_prop.get(x))
data.loc[:, 'zona'] = data.zona.apply(lambda x: dicc_tipos_zona.get(x))
data.loc[:, 'created_on'] = data.created_on.apply(fechaNumerica)

In [97]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86846 entries, 0 to 86845
Data columns (total 22 columns):
created_on                 86846 non-null int64
property_type              86846 non-null int64
lat                        86846 non-null float64
lon                        86846 non-null float64
barrio                     86846 non-null object
price_aprox_usd            86846 non-null float64
price_usd_per_m2           86846 non-null float64
surface_total_in_m2        86846 non-null float64
surface_covered_in_m2      86846 non-null float64
zona                       86846 non-null int64
pileta                     86846 non-null int64
cochera                    86846 non-null int64
aire acondicionado         86846 non-null int64
seguridad                  86846 non-null int64
patio                      86846 non-null int64
amenities                  86846 non-null int64
distanciaCentro            86846 non-null float64
distanciaAIndoamericano    86846 non-null float64
distanciaA

In [90]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression

columnas = ['created_on', 'property_type','pileta', 'lat', 'lon' ,'cochera', 'surface_total_in_m2',\
            'surface_covered_in_m2', 'seguridad', 'amenities','zona' ,\
            'subtes_cercanos', 'trenes_cercanos']

x = np.array(data.loc[:,columnas])
y = np.array(data['price_aprox_usd'])
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [52]:
#Algoritmo de Linear Regression
from sklearn.metrics import mean_squared_error
def linearR(x_train, x_test, y_train, y_test):
    linreg = LinearRegression()
    linreg.fit(x_train, y_train)
    y_pred = linreg.predict(x_test)
    return mean_squared_error(y_test, y_pred)

In [91]:
#Creo todas las combinaciones posibles con estas columnas
import itertools

columnas = ['created_on', 'property_type','pileta', 'lat', 'lon' ,'cochera', 'surface_total_in_m2',\
            'surface_covered_in_m2', 'seguridad', 'amenities','zona' ,\
            'subtes_cercanos', 'trenes_cercanos']

list_columnas = []

for r in range(2, len(columnas)):

    res = itertools.combinations(columnas, r)
    for e in res:
        list_columnas.append(list(e))
list_columnas

[['created_on', 'property_type'],
 ['created_on', 'pileta'],
 ['created_on', 'lat'],
 ['created_on', 'lon'],
 ['created_on', 'cochera'],
 ['created_on', 'surface_total_in_m2'],
 ['created_on', 'surface_covered_in_m2'],
 ['created_on', 'seguridad'],
 ['created_on', 'amenities'],
 ['created_on', 'zona'],
 ['created_on', 'subtes_cercanos'],
 ['created_on', 'trenes_cercanos'],
 ['property_type', 'pileta'],
 ['property_type', 'lat'],
 ['property_type', 'lon'],
 ['property_type', 'cochera'],
 ['property_type', 'surface_total_in_m2'],
 ['property_type', 'surface_covered_in_m2'],
 ['property_type', 'seguridad'],
 ['property_type', 'amenities'],
 ['property_type', 'zona'],
 ['property_type', 'subtes_cercanos'],
 ['property_type', 'trenes_cercanos'],
 ['pileta', 'lat'],
 ['pileta', 'lon'],
 ['pileta', 'cochera'],
 ['pileta', 'surface_total_in_m2'],
 ['pileta', 'surface_covered_in_m2'],
 ['pileta', 'seguridad'],
 ['pileta', 'amenities'],
 ['pileta', 'zona'],
 ['pileta', 'subtes_cercanos'],
 ['pil

In [54]:
#prueba todas las combinaciones
resultados = []
y = np.array(data['price_aprox_usd'])

for j in list_columnas: 
    x = np.array(data.loc[:,j])
    x_train, x_test, y_train, y_test = train_test_split(x, y , test_size = 0.33, random_state = 42)
    resultados.append(linearR(x_train, x_test, y_train, y_test))

In [55]:
resultados.index(min(resultados))

8164

In [56]:
list_columnas[resultados.index(min(resultados))]

['created_on',
 'property_type',
 'pileta',
 'lat',
 'lon',
 'cochera',
 'surface_total_in_m2',
 'surface_covered_in_m2',
 'seguridad',
 'amenities',
 'zona',
 'subtes_cercanos']

In [57]:
min(resultados)

74589590721.942474

In [26]:
x = np.array(data.loc[:,list_columnas[resultados.index(min(resultados))]])
y = np.array(data['price_aprox_usd'])
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [27]:
linreg = LinearRegression()
linreg.fit(x_train, y_train)
y_pred = linreg.predict(x_test)

In [28]:
predicc = []
for i in range(len(y_test)):
    predicc.append((y_test[i], y_pred[i]))

predicc

[(2350000.0, 421480.61498134211),
 (215000.0, 96320.110379103571),
 (63500.0, 171782.84035146609),
 (325000.0, 210011.03291362524),
 (130000.0, 15054.471521213651),
 (88163.539999999994, 196804.53010691702),
 (75000.0, 69836.583356346935),
 (3500000.0, 690320.35952822492),
 (450000.0, 595818.69359013066),
 (90000.0, 186897.51226060838),
 (37000.0, 125273.84115623683),
 (195000.0, 458352.56042315438),
 (190000.0, 179651.92753851786),
 (105000.0, 156781.6922352165),
 (220000.0, 520569.48749127239),
 (22470.299999999999, 177261.2945933193),
 (450000.0, 712943.38543935493),
 (169000.0, 195037.13891832158),
 (116000.0, 166232.1424583979),
 (85000.0, 229372.21611203253),
 (110000.0, 273016.28153819963),
 (570000.0, 425126.74601012841),
 (387569.95000000001, 256423.00908441469),
 (92000.0, 187347.8122247979),
 (530000.0, 221680.65054818615),
 (152000.0, 375034.24704695866),
 (110000.0, 365214.73956322297),
 (79000.0, 143977.52561352775),
 (1100000.0, 465090.40424180403),
 (85000.0, 215345.824

In [29]:
#Implemento CART: Clasification and Regression Trees
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(max_depth = 2)
dt_fit = regressor.fit(x_train, y_train)
dt_scores = cross_val_score(dt_fit, x_train, y_train, cv = 5)
np.mean(dt_scores)


0.4111797559264323

In [40]:
x = np.array(data.loc[:,list_columnas[resultados.index(min(resultados))]])
y = np.array(data['price_aprox_usd'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

dt = DecisionTreeRegressor(max_depth = 10)
dt_fit = dt.fit(x_train, y_train)

dt_scores = cross_val_score(dt, x_train, y_train, cv = 10)
print("mean cross validation score: {}".format(np.mean(dt_scores)))
print("score without cv: {}".format(dt_fit.score(x_train, y_train)))

 #on the test or hold-out set
from sklearn.metrics import r2_score
print(r2_score(y_test, dt_fit.predict(x_test)))
print(dt_fit.score(x_test, y_test))

mean cross validation score: 0.5830482227253608
score without cv: 0.8232761051115904
0.687591790507
0.687591790507


In [41]:
y_pred = dt_fit.predict(x_test)
predicc = []
for i in range(len(y_test)):
    predicc.append((y_test[i], y_pred[i]))

predicc

[(2800000.0, 2780000.0),
 (110000.0, 113998.89464579898),
 (220003.47, 172296.9808983051),
 (270000.0, 330323.16666666669),
 (123000.0, 113998.52159999999),
 (340000.0, 494508.87204545457),
 (150000.0, 219348.49427835055),
 (190000.0, 225268.64304627251),
 (149000.0, 140165.44041074254),
 (75000.0, 91273.746615384662),
 (200000.0, 282966.98326829274),
 (120000.0, 105451.10200345423),
 (245000.0, 395154.81632653059),
 (55000.0, 81206.645291576686),
 (112000.0, 103887.89362911264),
 (1300000.0, 1424770.1149425288),
 (265000.0, 240605.95104587157),
 (145000.0, 160527.27554526745),
 (93994.0, 91361.104000000021),
 (298000.0, 397485.55035128805),
 (87530.0, 93733.095714285708),
 (140000.0, 164517.51894545453),
 (85512.910000000003, 111958.80381516588),
 (240000.0, 299751.90880872484),
 (92000.0, 72135.174609375012),
 (535000.0, 603085.9190556492),
 (280000.0, 331216.49484536081),
 (480000.0, 315968.50783783779),
 (405000.0, 397485.55035128805),
 (83000.0, 81876.381746133993),
 (650000.0, 24

In [98]:
#XGBOOST

from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

x = np.array(data.loc[:,list_columnas[8164]])
y = np.array(data['price_aprox_usd'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

clf = GradientBoostingRegressor(n_estimators= 50, max_depth = 5, min_samples_split =2, learning_rate = 0.1, loss = 'ls')


In [99]:
clf.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=50, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [100]:
clf.score(x_test, y_test)

0.75458843385902219

In [101]:
y_pred = clf.predict(x_test)
predicc = []
for i in range(len(y_test)):
    predicc.append((y_test[i], y_pred[i]))

predicc

[(2800000.0, 2043862.1623898097),
 (110000.0, 120860.49868337938),
 (220003.47, 178093.30470519903),
 (270000.0, 293371.19945349405),
 (123000.0, 134914.0052673493),
 (340000.0, 357486.95995410683),
 (150000.0, 210742.66614290583),
 (190000.0, 250469.9761033612),
 (149000.0, 158324.28359952781),
 (75000.0, 85290.990872472321),
 (200000.0, 273927.26822961547),
 (120000.0, 107175.85409091187),
 (245000.0, 431844.36628811195),
 (55000.0, 79395.179332325482),
 (112000.0, 104099.691664928),
 (1300000.0, 1534613.0631784948),
 (265000.0, 258736.63969918722),
 (145000.0, 139023.31687965139),
 (93994.0, 108442.36783646489),
 (298000.0, 374043.26521952829),
 (87530.0, 102118.56301255221),
 (140000.0, 153266.17387302735),
 (85512.910000000003, 103354.50005557785),
 (240000.0, 308249.05099750019),
 (92000.0, 88646.647470999029),
 (535000.0, 794943.05470878899),
 (280000.0, 422040.82029949513),
 (480000.0, 368829.039031878),
 (405000.0, 389466.75774801668),
 (83000.0, 92407.574827243836),
 (650000.

In [102]:
mean_squared_error(y_test, y_pred)

174709.94793525207

In [104]:
def GradientBoost(x_train, x_test, y_train, y_test, k):
    graBoost = GradientBoostingRegressor(n_estimators= k, max_depth = 5, min_samples_split =2, learning_rate = 0.1, loss = 'ls')
    graBoost.fit(x_train, y_train)
    y_pred = graBoost.predict(x_test)
    return mean_squared_error(y_test, y_pred)

In [112]:
#Buscando la mejor cantidad de arboles ( usar 150 arboles, despues no mejora mucho)
resultados = []
for k in range(100, 201, 10): 
    resultados.append(GradientBoost(x_train, x_test, y_train, y_test, k ))

In [120]:
#Buscando la mejor configuracion de columnas
resultados = []

y = np.array(data['price_aprox_usd'])

for col in list_columnas:
    x = np.array(data.loc[:,col])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=7)
    resultados.append(GradientBoost(x_train, x_test, y_train, y_test, 1))

In [121]:
resultados

[98323683088.513672,
 98323683088.513657,
 98324068309.421906,
 98298937165.544281,
 98298937165.544281,
 101112830978.08922,
 106020844298.07985,
 106038530054.57251,
 107576080716.69269,
 108382722387.97682,
 108760621106.44112]

In [119]:
columnas = ['created_on', 'property_type','pileta', 'lat', 'lon' ,'cochera', 'surface_total_in_m2',\
            'surface_covered_in_m2', 'seguridad', 'amenities','zona' ,\
            'subtes_cercanos', 'trenes_cercanos']

list_columnas = []

for i in range(1, len(columnas) - 1):
    list_columnas.append(columnas[:len(columnas) - i])
list_columnas

[['created_on',
  'property_type',
  'pileta',
  'lat',
  'lon',
  'cochera',
  'surface_total_in_m2',
  'surface_covered_in_m2',
  'seguridad',
  'amenities',
  'zona',
  'subtes_cercanos'],
 ['created_on',
  'property_type',
  'pileta',
  'lat',
  'lon',
  'cochera',
  'surface_total_in_m2',
  'surface_covered_in_m2',
  'seguridad',
  'amenities',
  'zona'],
 ['created_on',
  'property_type',
  'pileta',
  'lat',
  'lon',
  'cochera',
  'surface_total_in_m2',
  'surface_covered_in_m2',
  'seguridad',
  'amenities'],
 ['created_on',
  'property_type',
  'pileta',
  'lat',
  'lon',
  'cochera',
  'surface_total_in_m2',
  'surface_covered_in_m2',
  'seguridad'],
 ['created_on',
  'property_type',
  'pileta',
  'lat',
  'lon',
  'cochera',
  'surface_total_in_m2',
  'surface_covered_in_m2'],
 ['created_on',
  'property_type',
  'pileta',
  'lat',
  'lon',
  'cochera',
  'surface_total_in_m2'],
 ['created_on', 'property_type', 'pileta', 'lat', 'lon', 'cochera'],
 ['created_on', 'property_