In [1]:
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

In [2]:
# MODEL SELECTION AND TRAINING

# Load the dataset.
# The ids are not used for training, so that column is dropped right after reading.
# `columns=` is used instead of a positional axis argument, which pandas 2.0 no
# longer accepts for DataFrame.drop.
dataset = pd.read_csv('dataset.csv').drop(columns='id')

dataset.head()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2013.333,17.6,1805.665,2,24.98672,121.52091,31.1
1,2013.083,8.1,104.8101,5,24.96674,121.54067,51.6
2,2013.25,2.3,184.3302,6,24.96581,121.54086,45.4
3,2013.333,7.1,379.5575,10,24.98343,121.53762,49.8
4,2013.0,13.0,750.0704,2,24.97371,121.54951,37.0


In [3]:
# Preprocessing

# Separate the column to predict: the last column is the target,
# every other column is a feature.
y = dataset.iloc[:, -1]
X = dataset.iloc[:, :-1]

# In this example the features are rescaled with a MinMaxScaler.
# The fitted scaler is kept in `sc` so it can be applied to other data later.
# NOTE(review): the scaler is fitted on the full dataset, before any train/test
# split, so held-out rows influence the scaling — consider fitting on the
# training rows only.
sc = MinMaxScaler()
X = sc.fit_transform(X)  # fit the scaler and rescale in one step

In [4]:
# Split the dataset into train and test (33% of the rows are held out for test).
# A fixed random_state makes the shuffle deterministic, so the split — and any
# score computed from it — is the same after a kernel restart.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
# HYPERPARAMETER SELECTION BY CROSS-VALIDATION

# Model: Bayesian Ridge

# Candidate values per hyperparameter; the grid search evaluates every combination.
# NOTE(review): newer scikit-learn releases rename BayesianRidge's `n_iter` to
# `max_iter` — confirm against the installed version before upgrading.
parameters = {
    'n_iter': [100, 200, 300, 500],
    'alpha_1': [1e-6, 1e-4, 1e-1, 1],
    'alpha_2': [1e-4, 1e-10, 1e-1, 1],
    'lambda_1': [1e-6, 1e-10, 1e-1, 1],
    'lambda_2': [1e-6, 1e-4],
}

bay_ridge = BayesianRidge()

# Candidates are scored by R2; refit=True retrains the best one so that
# cv.best_estimator_ is available afterwards. verbose=3 logs every fit.
cv = GridSearchCV(
    estimator=bay_ridge,
    param_grid=parameters,
    scoring='r2',
    refit=True,
    verbose=3,
)
cv.fit(X_train, y_train)

Fitting 5 folds for each of 512 candidates, totalling 2560 fits
[CV] alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-06, lambda_2=1e-06, n_iter=100 
[CV]  alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-06, lambda_2=1e-06, n_iter=100, score=0.256, total=   0.0s
[CV] alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-06, lambda_2=1e-06, n_iter=100 
[CV]  alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-06, lambda_2=1e-06, n_iter=100, score=0.484, total=   0.0s
[CV] alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-06, lambda_2=1e-06, n_iter=100 
[CV]  alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-06, lambda_2=1e-06, n_iter=100, score=0.639, total=   0.0s
[CV] alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-06, lambda_2=1e-06, n_iter=100 
[CV]  alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-06, lambda_2=1e-06, n_iter=100, score=0.651, total=   0.0s
[CV] alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-06, lambda_2=1e-06, n_iter=100 
[CV]  alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-06, lambda_2=1e-06, n_iter=100, score=0.595, total=   0

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-10, lambda_2=1e-06, n_iter=100, score=0.595, total=   0.0s
[CV] alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-10, lambda_2=1e-06, n_iter=200 
[CV]  alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-10, lambda_2=1e-06, n_iter=200, score=0.256, total=   0.0s
[CV] alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-10, lambda_2=1e-06, n_iter=200 
[CV]  alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-10, lambda_2=1e-06, n_iter=200, score=0.484, total=   0.0s
[CV] alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-10, lambda_2=1e-06, n_iter=200 
[CV]  alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-10, lambda_2=1e-06, n_iter=200, score=0.639, total=   0.0s
[CV] alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-10, lambda_2=1e-06, n_iter=200 
[CV]  alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-10, lambda_2=1e-06, n_iter=200, score=0.651, total=   0.0s
[CV] alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-10, lambda_2=1e-06, n_iter=200 
[CV]  alpha_1=1e-06, alpha_2=0.0001, lambda_1=1e-10, lambda_

[Parallel(n_jobs=1)]: Done 2560 out of 2560 | elapsed:   12.0s finished


GridSearchCV(estimator=BayesianRidge(),
             param_grid={'alpha_1': [1e-06, 0.0001, 0.1, 1],
                         'alpha_2': [0.0001, 1e-10, 0.1, 1],
                         'lambda_1': [1e-06, 1e-10, 0.1, 1],
                         'lambda_2': [1e-06, 0.0001],
                         'n_iter': [100, 200, 300, 500]},
             scoring='r2', verbose=3)

In [None]:
# Model: Neural network

def neural_network(neuronas_capa_1 = 25, neuronas_capa_2 = 15):
    """Build and compile a feed-forward regression network with two hidden layers.

    Parameters
    ----------
    neuronas_capa_1 : int
        Number of units in the first hidden layer.
    neuronas_capa_2 : int
        Number of units in the second hidden layer.

    Returns
    -------
    A compiled Keras ``Sequential`` model with a single linear output unit,
    trained against mean squared error with the Adam optimizer and reporting
    mean absolute error as an extra metric.

    Notes
    -----
    The input width is read from the notebook-level variable ``X``
    (``X.shape[1]``), so ``X`` must exist when this function is called.
    """
    layers = [
        Dense(neuronas_capa_1, input_dim=X.shape[1], activation='sigmoid'),
        Dense(neuronas_capa_2, activation='sigmoid'),
        Dense(1, activation='linear'),  # linear output for regression
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    return model

# Wrap the Keras builder so it can be used as a scikit-learn regressor;
# every fit trains for 500 epochs.
neu_net = KerasRegressor(build_fn=neural_network, epochs=500)

# Layer sizes are forwarded to neural_network(); batch_size is searched as well.
# Note: this rebinds `parameters` and `cv`, replacing any earlier grid search
# stored under those names.
parameters = {
    'neuronas_capa_1': [25, 50, 10],
    'neuronas_capa_2': [25, 50, 10],
    'batch_size': [20, 30],
}

cv = GridSearchCV(
    estimator=neu_net,
    param_grid=parameters,
    scoring='r2',
    refit=True,
    verbose=3,
)
cv.fit(X_train, y_train)

In [6]:
# Show the ranking: the first rows of the cross-validation results,
# ordered by rank (rank 1 is the best mean test score).

(
    pd.DataFrame(cv.cv_results_)
    .sort_values(by='rank_test_score')
    .head()
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha_1,param_alpha_2,param_lambda_1,param_lambda_2,param_n_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
120,0.003748,0.000817,0.000555,3.7e-05,1e-06,1,1,1e-06,100,"{'alpha_1': 1e-06, 'alpha_2': 1, 'lambda_1': 1...",0.2532,0.488828,0.636298,0.657784,0.596447,0.526511,0.148523,1
122,0.003227,0.001106,0.000502,8.9e-05,1e-06,1,1,1e-06,300,"{'alpha_1': 1e-06, 'alpha_2': 1, 'lambda_1': 1...",0.2532,0.488828,0.636298,0.657784,0.596447,0.526511,0.148523,1
123,0.003581,0.000443,0.000556,1.7e-05,1e-06,1,1,1e-06,500,"{'alpha_1': 1e-06, 'alpha_2': 1, 'lambda_1': 1...",0.2532,0.488828,0.636298,0.657784,0.596447,0.526511,0.148523,1
121,0.003742,0.000885,0.000574,1.6e-05,1e-06,1,1,1e-06,200,"{'alpha_1': 1e-06, 'alpha_2': 1, 'lambda_1': 1...",0.2532,0.488828,0.636298,0.657784,0.596447,0.526511,0.148523,1
124,0.003671,0.000883,0.000516,6.4e-05,1e-06,1,1,0.0001,100,"{'alpha_1': 1e-06, 'alpha_2': 1, 'lambda_1': 1...",0.2532,0.488828,0.636298,0.657784,0.596447,0.526511,0.148523,5


In [7]:
# Get the best model: the estimator exposed by the grid search stored in `cv`

ml_model = cv.best_estimator_

In [8]:
# Predict on the test split

y_pred = ml_model.predict(X_test)

# Evaluate on test: R2 between the true targets and the predictions

test_r2 = r2_score(y_test, y_pred)
print('R2 obtenido en test: ' + str(test_r2))

R2 obtenido en test: 0.6332188357848467


In [9]:
# GENERATING PREDICTIONS FOR THE EVALUATION FILE

# Load the evaluation data from 'eval.csv'
evalu = pd.read_csv('eval.csv')

# Preview the first rows
evalu.head()

Unnamed: 0,id,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
0,12,2013.333,6.3,90.45606,9,24.97433,121.5431
1,255,2012.667,1.1,329.9747,5,24.98254,121.54395
2,219,2013.417,13.6,492.2313,5,24.96515,121.53737
3,132,2013.5,4.0,2147.376,3,24.96299,121.51284
4,102,2012.833,12.7,170.1289,1,24.97371,121.52984


In [10]:
# Set the id column aside: it is kept in `ids` and removed from the feature frame.
# `columns=` is used instead of a positional axis argument, which pandas 2.0 no
# longer accepts for DataFrame.drop.

ids = evalu['id']
X_eval = evalu.drop(columns='id')

X_eval.head()

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
0,2013.333,6.3,90.45606,9,24.97433,121.5431
1,2012.667,1.1,329.9747,5,24.98254,121.54395
2,2013.417,13.6,492.2313,5,24.96515,121.53737
3,2013.5,4.0,2147.376,3,24.96299,121.51284
4,2012.833,12.7,170.1289,1,24.97371,121.52984


In [11]:
# Apply the same preprocessing as for the original dataset, using the already-fitted scaler `sc`.
# Note: X_eval is overwritten with its transformed version, so running this cell
# a second time would apply the transform to already-transformed data.

X_eval = sc.transform(X_eval)

In [12]:
# Compute the predictions for the evaluation data with the selected model

y_pred = ml_model.predict(X_eval)

In [13]:
# Build the new csv with the predictions

# The target column gets the same name it has in the original dataset
# (its last column).
target_name = dataset.columns.values[-1]

# Pair each id with its prediction, column by column. Stacking both into one
# array would coerce the integer ids to float; building the frame this way keeps
# them integer from the start. np.ravel flattens the predictions in case the
# model returns them as a column vector.
df = pd.DataFrame({
    'id': np.asarray(ids).astype('int32'),
    target_name: np.ravel(y_pred),
})

# Column names double as the csv header; the row index is not written.
df.to_csv('prediccion.csv', index=False)