In [1373]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

data = pd.read_csv("setEntrenamiento.csv", low_memory = False)
data = data.drop('Unnamed: 0', axis = 1)
data['created_on'] = pd.to_datetime(data['created_on'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86846 entries, 0 to 86845
Data columns (total 22 columns):
created_on                 86846 non-null datetime64[ns]
property_type              86846 non-null object
lat                        86846 non-null float64
lon                        86846 non-null float64
barrio                     86846 non-null object
price_aprox_usd            86846 non-null float64
price_usd_per_m2           86846 non-null float64
surface_total_in_m2        86846 non-null float64
surface_covered_in_m2      86846 non-null float64
zona                       86846 non-null object
pileta                     86846 non-null int64
cochera                    86846 non-null int64
aire acondicionado         86846 non-null int64
seguridad                  86846 non-null int64
patio                      86846 non-null int64
amenities                  86846 non-null int64
distanciaCentro            86846 non-null float64
distanciaAIndoamericano    86846 non-null float64

In [1374]:
def fechaNumerica(x):
    return 10000*x.year + 100*x.month + x.day

In [1375]:
dicc_tipos_prop = {"apartment" : 1, "house" : 2, "PH" : 3, "store" : 4}
dicc_tipos_zona = {"Bs.As. G.B.A. Zona Norte": 1, "Bs.As. G.B.A. Zona Oeste": 2, "Bs.As. G.B.A. Zona Sur": 3, "Capital Federal": 4, "Buenos Aires Interior":5 }

In [1376]:
data.loc[:, 'property_type'] = data.property_type.apply(lambda x: dicc_tipos_prop.get(x))
data.loc[:, 'zona'] = data.zona.apply(lambda x: dicc_tipos_zona.get(x))
data.loc[:, 'created_on'] = data.created_on.apply(fechaNumerica)
data['distanciaCentro'] = data['distanciaCentro'].apply(lambda x: x/1000)
data['distanciaACarlosGardel'] = data['distanciaACarlosGardel'].apply(lambda x: x/1000)
data['distanciaAFuerteApache'] = data['distanciaAFuerteApache'].apply(lambda x: x/1000)
data['distanciaAIndoamericano'] = data['distanciaAIndoamericano'].apply(lambda x: x/1000)

In [1378]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, r2_score, classification_report, mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import  f_regression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline


In [1400]:
columnas = data.drop(["price_aprox_usd", 'barrio', 'price_usd_per_m2', 'patio', 'distanciaACarlosGardel'], axis =1).columns.values
len(columnas)

17

In [1401]:
def func(x, y):
    return f_regression(x,y, center = False)


x = np.array(data.loc[:,columnas])
y = np.array(data['price_aprox_usd'])

featureSelector = SelectKBest(score_func= func, k=17)
featureSelector.fit(x,y)
bestCol = ([columnas[zero_based_index] for zero_based_index in list(featureSelector.get_support(indices=True))])
print(bestCol)

['created_on', 'property_type', 'lat', 'lon', 'surface_total_in_m2', 'surface_covered_in_m2', 'zona', 'pileta', 'cochera', 'aire acondicionado', 'seguridad', 'amenities', 'distanciaCentro', 'distanciaAIndoamericano', 'distanciaAFuerteApache', 'subtes_cercanos', 'trenes_cercanos']


In [1402]:
x = np.array(data.loc[:, bestCol])
y = np.array(data['price_aprox_usd'])
x.shape

In [1449]:
#XGBOOST (Mejor con max_depth= 10 columnas = 19, n_estimators = 300)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
clf =XGBRegressor( n_estimators= 300, max_depth = 10)

In [1405]:
clf.fit(x_train, y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=300, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [1406]:
y_pred1 = clf.predict(x_test)
print(r2_score(y_test, y_pred1))

0.814177421751


In [1450]:
np.sqrt(mean_squared_error(y_test, y_pred1))

149281.59278338263

In [1433]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
params = { 'n_estimators': [290,300], 'max_depth': [9,10] }
xgb = XGBRegressor() 
grid = GridSearchCV(xgb, params)
grid.fit(x, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [300], 'max_depth': [10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [1451]:
y_pred = grid.best_estimator_.predict(x_test)# ultimo
print(r2_score(y_test, grid.best_estimator_.predict(x_test))) 

0.966213097377


In [1435]:
avg_score = []
for i in range(len(y_test)):
    avg_score.append((y_pred[i] + y_pred1[i])/2)

In [1438]:
np.sqrt(mean_squared_error(y_test, avg_score))

98699.936728828281

In [1439]:
np.sqrt(mean_squared_error(y_test, y_pred))

63654.807556292559

In [1441]:
test = pd.read_csv("setTesteo.csv")
test = test.drop('Unnamed: 0', axis = 1)
test['created_on'] = pd.to_datetime(test['created_on'])
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14166 entries, 0 to 14165
Data columns (total 21 columns):
id                         14166 non-null int64
created_on                 14166 non-null datetime64[ns]
property_type              14166 non-null object
lat                        14166 non-null float64
lon                        14166 non-null float64
barrio                     14166 non-null object
surface_total_in_m2        14166 non-null float64
surface_covered_in_m2      14166 non-null float64
zona                       14166 non-null object
pileta                     14166 non-null int64
cochera                    14166 non-null int64
aire acondicionado         14166 non-null int64
seguridad                  14166 non-null int64
patio                      14166 non-null int64
amenities                  14166 non-null int64
distanciaCentro            14166 non-null float64
distanciaAIndoamericano    14166 non-null float64
distanciaAFuerteApache     14166 non-null float64
d

In [1442]:
test1 = test.drop(['barrio', 'id'], axis = 1)
test1.loc[:, 'property_type'] = test.property_type.apply(lambda x: dicc_tipos_prop.get(x))
test1.loc[:, 'zona'] = test.zona.apply(lambda x: dicc_tipos_zona.get(x))
test1.loc[:, 'created_on'] = test.created_on.apply(fechaNumerica)
test1['distanciaCentro'] = test['distanciaCentro'].apply(lambda x: x/1000)
test1['distanciaACarlosGardel'] = test['distanciaACarlosGardel'].apply(lambda x: x/1000)
test1['distanciaAFuerteApache'] = test['distanciaAFuerteApache'].apply(lambda x: x/1000)
test1['distanciaAIndoamericano'] = test['distanciaAIndoamericano'].apply(lambda x: x/1000)

In [1443]:
test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14166 entries, 0 to 14165
Data columns (total 19 columns):
created_on                 14166 non-null int64
property_type              14166 non-null int64
lat                        14166 non-null float64
lon                        14166 non-null float64
surface_total_in_m2        14166 non-null float64
surface_covered_in_m2      14166 non-null float64
zona                       14166 non-null int64
pileta                     14166 non-null int64
cochera                    14166 non-null int64
aire acondicionado         14166 non-null int64
seguridad                  14166 non-null int64
patio                      14166 non-null int64
amenities                  14166 non-null int64
distanciaCentro            14166 non-null float64
distanciaAIndoamericano    14166 non-null float64
distanciaAFuerteApache     14166 non-null float64
distanciaACarlosGardel     14166 non-null float64
subtes_cercanos            14166 non-null int64
trenes_cerc

In [1444]:
testeo = np.array(test1[bestCol])
testing = grid.best_estimator_.predict(testeo)

In [1446]:
test['price_usd'] = testing

In [1447]:
testFinal = test.loc[:,['id', 'price_usd']]
testFinal.to_csv('pricePrediction.csv', sep=',', index = False)

In [1198]:
x = np.array(data.loc[:, bestCol])
y = np.array(data['price_aprox_usd'])

In [None]:
#Random Forest
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)
params = { 'n_estimators': [300], 'max_depth': [10] }
rf = RandomForestRegressor() 
grid = GridSearchCV(rf, params)
grid.fit(x, y)

In [None]:
y_pred2 = rf.predict(x_test)
print(r2_score(y_test, y_pred2))

In [1367]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)
rf = RandomForestRegressor( n_estimators= 300, max_depth = 10)

In [1368]:
rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [1369]:
y_pred2 = rf.predict(x_test)
print(r2_score(y_test, y_pred2))

0.756796384064


In [895]:
y_pred2 = grid.best_estimator_.predict(x_test)
print(r2_score(y_test, y_pred2))

0.963983789151


In [897]:
y_pred2 = grid.best_estimator_.predict(x_test)
print(r2_score(y_test, y_pred2))

0.964116919622


In [292]:
#Algoritmo de Linear Regression
from sklearn.metrics import mean_squared_error
def linearR(x_train, x_test, y_train, y_test):
    linreg = LinearRegression()
    linreg.fit(x_train, y_train)
    y_pred = linreg.predict(x_test)
    return mean_squared_error(y_test, y_pred)

In [None]:
#prueba todas las combinaciones
resultados = []
y = np.array(data['price_aprox_usd'])

for j in list_columnas: 
    x = np.array(data.loc[:,j])
    x_train, x_test, y_train, y_test = train_test_split(x, y , test_size = 0.33, random_state = 42)
    resultados.append(linearR(x_train, x_test, y_train, y_test))

In [26]:
x = np.array(data.loc[:,list_columnas[resultados.index(min(resultados))]])
y = np.array(data['price_aprox_usd'])
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [27]:
linreg = LinearRegression()
linreg.fit(x_train, y_train)
y_pred = linreg.predict(x_test)

In [None]:
predicc = []
for i in range(len(y_test)):
    predicc.append((y_test[i], y_pred[i]))

In [181]:
#Implemento CART: Clasification and Regression Trees
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(max_depth = 2)
dt_fit = regressor.fit(x_train, y_train)
dt_scores = cross_val_score(dt_fit, x_train, y_train, cv = 5)
np.mean(dt_scores)


0.049949933277721417

In [185]:
x = np.array(data.loc[:,bestCol])
y = np.array(data['price_aprox_usd'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

dt = DecisionTreeRegressor(max_depth = 10)
dt_fit = dt.fit(x_train, y_train)

dt_scores = cross_val_score(dt_fit, x_train, y_train, cv = 10)
print("mean cross validation score: {}".format(np.mean(dt_scores)))
print("score without cv: {}".format(dt_fit.score(x_train, y_train)))

 #on the test or hold-out set
from sklearn.metrics import r2_score
print(r2_score(y_test, dt_fit.predict(x_test)))
print(dt_fit.score(x_test, y_test))

mean cross validation score: 0.5874589287405033
score without cv: 0.8232761051115904
0.683320358604
0.683320358604


In [214]:
def GradientBoost(x_train, x_test, y_train, y_test, k):
    graBoost = GradientBoostingRegressor(n_estimators= k, max_depth = 2, min_samples_split =2)
    graBoost.fit(x_train, y_train)
    y_pred = graBoost.predict(x_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [112]:
#Buscando la mejor cantidad de arboles ( usar 150 arboles, despues no mejora mucho)
resultados = []
for k in range(100, 201, 10): 
    resultados.append(GradientBoost(x_train, x_test, y_train, y_test, k ))