In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

data = pd.read_csv("dataFiltrada.csv", low_memory = False)
data = data.loc[data['price'] < 15000000,:].drop('Unnamed: 0', axis = 1)
data['created_on'] = pd.to_datetime(data['created_on'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45997 entries, 0 to 46002
Data columns (total 23 columns):
index                      45997 non-null int64
created_on                 45997 non-null datetime64[ns]
operation                  45997 non-null object
property_type              45997 non-null object
state_name                 45997 non-null object
place_name                 45997 non-null object
lat                        45997 non-null float64
lon                        45997 non-null float64
price                      45997 non-null float64
surface_total_in_m2        45997 non-null float64
price_usd_per_m2           45997 non-null float64
media_por_barrio           45997 non-null float64
cant_prop                  45997 non-null float64
mediana_por_barrio         45997 non-null float64
geoPos                     45997 non-null object
distancia_al_centro        45997 non-null float64
distanciaAIndoamericano    45997 non-null float64
distanciaAFuerteApache     45997 non-null

In [6]:
def tipoDePropiedad(x):
    tipos = {
        "apartment" : 1,
        "house" : 2,
        "PH" : 3,
        "store" : 4
    }
    return tipos[x] if x in tipos else x


def fechaNumerica(x):
    return 10000*x.year + 100*x.month + x.day

In [7]:
data.loc[:, 'property_type'] = data.property_type.apply(tipoDePropiedad)
#data.loc[:, 'distancia_al_centro'] = data.distancia_al_centro.apply(lambda x: int(x))
data.loc[:, 'created_on'] = data.created_on.apply(fechaNumerica)
dataCapital = data.loc[data.state_name.str.contains('Capital Federal'), :]

In [8]:
dataCapital.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23913 entries, 3214 to 44445
Data columns (total 23 columns):
index                      23913 non-null int64
created_on                 23913 non-null int64
operation                  23913 non-null object
property_type              23913 non-null int64
state_name                 23913 non-null object
place_name                 23913 non-null object
lat                        23913 non-null float64
lon                        23913 non-null float64
price                      23913 non-null float64
surface_total_in_m2        23913 non-null float64
price_usd_per_m2           23913 non-null float64
media_por_barrio           23913 non-null float64
cant_prop                  23913 non-null float64
mediana_por_barrio         23913 non-null float64
geoPos                     23913 non-null object
distancia_al_centro        23913 non-null float64
distanciaAIndoamericano    23913 non-null float64
distanciaAFuerteApache     23913 non-null float6

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression

columnas = ['created_on', 'property_type', 'lat', 'lon', 'surface_total_in_m2',\
            'cant_prop', 'distancia_al_centro', 'distanciaAIndoamericano',\
            'cant_subtes_cercanos', 'cant_trenes_cercanos', 'distancia_delitoMenor', 'distancia_delitoGrave']

x = np.array(dataCapital.loc[:,columnas])
y = np.array(dataCapital['price_usd_per_m2'])
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [10]:
from sklearn.metrics import mean_squared_error
def linearR(x_train, x_test, y_train, y_test):
    linreg = LinearRegression()
    linreg.fit(x_train, y_train)
    y_pred = linreg.predict(x_test)
    #return y_pred
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
np.sqrt(mean_squared_error(y_test, pred))

In [14]:
import itertools


columnas = ['created_on', 'property_type', 'lat', 'lon', 'surface_total_in_m2',\
            'cant_prop', 'distancia_al_centro', 'distanciaAIndoamericano',\
            'cant_subtes_cercanos', 'cant_trenes_cercanos', 'distancia_delitoMenor', 'distancia_delitoGrave']

list_columnas = []

for r in range(2, len(columnas)):

    res = itertools.combinations(columnas, r)
    for e in res:
        list_columnas.append(list(e))
list_columnas

[['created_on', 'property_type'],
 ['created_on', 'lat'],
 ['created_on', 'lon'],
 ['created_on', 'surface_total_in_m2'],
 ['created_on', 'cant_prop'],
 ['created_on', 'distancia_al_centro'],
 ['created_on', 'distanciaAIndoamericano'],
 ['created_on', 'cant_subtes_cercanos'],
 ['created_on', 'cant_trenes_cercanos'],
 ['created_on', 'distancia_delitoMenor'],
 ['created_on', 'distancia_delitoGrave'],
 ['property_type', 'lat'],
 ['property_type', 'lon'],
 ['property_type', 'surface_total_in_m2'],
 ['property_type', 'cant_prop'],
 ['property_type', 'distancia_al_centro'],
 ['property_type', 'distanciaAIndoamericano'],
 ['property_type', 'cant_subtes_cercanos'],
 ['property_type', 'cant_trenes_cercanos'],
 ['property_type', 'distancia_delitoMenor'],
 ['property_type', 'distancia_delitoGrave'],
 ['lat', 'lon'],
 ['lat', 'surface_total_in_m2'],
 ['lat', 'cant_prop'],
 ['lat', 'distancia_al_centro'],
 ['lat', 'distanciaAIndoamericano'],
 ['lat', 'cant_subtes_cercanos'],
 ['lat', 'cant_trenes_c

In [15]:
resultados = []
y = np.array(dataCapital['price_usd_per_m2'])

for j in list_columnas: 
    x = np.array(dataCapital.loc[:,j])
    x_train, x_test, y_train, y_test = train_test_split(x, y , test_size = 0.33, random_state = 42)
    resultados.append(linearR(x_train, x_test, y_train, y_test))

In [18]:
resultados.index(min(resultados))

4074

In [19]:
list_columnas[4074]

['created_on',
 'property_type',
 'lat',
 'lon',
 'surface_total_in_m2',
 'cant_prop',
 'distancia_al_centro',
 'cant_subtes_cercanos',
 'cant_trenes_cercanos',
 'distancia_delitoMenor',
 'distancia_delitoGrave']

In [20]:
x = np.array(dataCapital.loc[:,list_columnas[resultados.index(min(resultados))]])
x_train, x_test, y_train, y_test = train_test_split(x, y , test_size = 0.33, random_state = 42)
linreg = LinearRegression()
linreg.fit(x_train, y_train)
y_pred = linreg.predict(x_test)

In [21]:

predicc = []
for i in range(len(y_test)):
    predicc.append((y_test[i], y_pred[i]))

predicc

[(3717.1764705882342, 2490.9897815473378),
 (3608.695652173913, 2456.5663683251478),
 (2836.7843137254908, 2700.7088723843917),
 (1567.5675675675675, 1579.7257890528999),
 (2720.36170212766, 2746.594809660688),
 (4584.6153846153848, 2330.7962207100354),
 (1477.7777777777774, 2102.4368956210092),
 (1952.1432692307687, 1540.3505474911071),
 (3110.1607142857142, 2728.0986815132201),
 (2838.427947598253, 3425.2023756620474),
 (4166.666666666667, 4200.8644737591967),
 (1818.1818181818185, 1819.4936991995201),
 (672.64573991031386, 1682.0687562874518),
 (2500.0, 1638.9434625916183),
 (2101.449275362319, 1705.1799636422656),
 (3420.0, 2851.1720466581173),
 (2365.853658536585, 2037.1735427728854),
 (1000.0, 1500.5707905725576),
 (3251.7482517482517, 4230.3705311901867),
 (1460.8433734939758, 2082.2203207896091),
 (1182.5726141078835, 1788.7104481002316),
 (1894.7368421052633, 3128.0227365880273),
 (2608.695652173913, 2363.625543102622),
 (2241.3793103448274, 2515.2953501665033),
 (3688.5245901

In [None]:
#Implemento CART: Clasification and Regression Trees

