In [13]:
% matplotlib inline

from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = 10, 10

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [14]:
df = pd.read_csv('properati_caballito_train.csv')
cols = ['Unnamed: 0', 'Unnamed: 0.1','place_name', 'precio_m2_usd', 'expenses', 'description', 'title', 'property_type']
df.drop(cols, axis=1, inplace=True)

In [15]:
# DF
df = df.dropna()
print(df.columns) 
print(df.shape)
df.head(5)

Index(['lat', 'lon', 'price_aprox_usd', 'surface_total_in_m2',
       'price_usd_per_m2', 'dummy_property_type__apartment',
       'dummy_property_type__house', 'dummy_property_type__store', 'distSubte',
       'distParque', 'dummy_pileta', 'dummy_balcon', 'dummy_patio',
       'dummy_lavadero', 'dummy_cochera', 'dummy_luminoso', 'dummy_terraza',
       'dummy_quincho', 'dummy_baulera', 'dummy_parrilla', 'dummy_premium',
       'dummy_piscina', 'dummy_ascensor', 'dummy_profesional', 'dummy_alarma',
       'dummy_amenities', 'dummy_calefaccion', 'dummy_pozo', 'dummy_gimnasio',
       'dummy_aire acondicionado', 'dummy_spa', 'dummy_jacuzzi', 'dummy_cine'],
      dtype='object')
(1410, 33)


Unnamed: 0,lat,lon,price_aprox_usd,surface_total_in_m2,price_usd_per_m2,dummy_property_type__apartment,dummy_property_type__house,dummy_property_type__store,distSubte,distParque,...,dummy_profesional,dummy_alarma,dummy_amenities,dummy_calefaccion,dummy_pozo,dummy_gimnasio,dummy_aire acondicionado,dummy_spa,dummy_jacuzzi,dummy_cine
0,-34.622211,-58.439128,110000,50,2200.0,1,0,0,0.274837,0.257798,...,0,0,0,0,0,0,0,0,0,0
1,-34.623894,-58.446861,170500,100,1705.0,1,0,0,0.168539,0.471023,...,0,0,0,0,0,0,0,0,0,0
2,-34.620928,-58.445874,220410,81,2721.111111,1,0,0,0.384946,0.137809,...,0,0,1,0,0,0,0,0,0,0
3,-34.620928,-58.445874,222780,81,2750.37037,1,0,0,0.384946,0.137809,...,0,0,1,0,0,0,0,0,0,0
4,-34.628786,-58.427783,108000,70,1542.857143,1,0,0,0.216992,1.185256,...,0,0,0,0,0,0,0,0,0,0


In [16]:
dfX = df[['lat', 'lon', 'surface_total_in_m2', 'dummy_property_type__apartment',
       'dummy_property_type__house', 'dummy_property_type__store', 'distSubte',
       'distParque', 'dummy_pileta', 'dummy_balcon', 'dummy_patio',
       'dummy_lavadero', 'dummy_cochera', 'dummy_luminoso', 'dummy_terraza',
       'dummy_quincho', 'dummy_baulera', 'dummy_parrilla', 'dummy_premium',
       'dummy_piscina', 'dummy_ascensor', 'dummy_profesional', 'dummy_alarma',
       'dummy_amenities', 'dummy_calefaccion', 'dummy_pozo', 'dummy_gimnasio',
       'dummy_aire acondicionado', 'dummy_spa', 'dummy_jacuzzi', 'dummy_cine']]

In [17]:
y = df['price_usd_per_m2']

In [18]:
X = StandardScaler().fit_transform(dfX)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=53)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(987, 31) (987,)
(423, 31) (423,)


In [20]:
# Generamos un grid de $\alpha$ para probar e instanciamos un particionador del Training Set 
# en K partes para realizar la validación cruzada

al_ridge = np.linspace(0.001, 2, 300)
al_lasso = np.linspace(0.001, 2, 300)
kf = KFold(n_splits=5, shuffle=True, random_state=12)

# Instanciamos los modelos

lm = LinearRegression()
lmRidgeCV = RidgeCV(alphas=[0.1], cv=kf, normalize=False)
lmLassoCV = LassoCV(alphas=al_lasso, cv=kf, normalize=False)

In [9]:
#lmRidgeCV = RidgeCV(fit_intercept=False, alphas=[0.1], cv=kf, normalize=False)

In [21]:
# Hacemos los fits respectivos

lm.fit(X_train, y_train)
lmRidgeCV.fit(X_train, y_train)
lmLassoCV.fit(X_train, y_train)

LassoCV(alphas=array([1.00000e-03, 7.68562e-03, ..., 1.99331e+00, 2.00000e+00]),
    copy_X=True, cv=KFold(n_splits=5, random_state=12, shuffle=True),
    eps=0.001, fit_intercept=True, max_iter=1000, n_alphas=100, n_jobs=1,
    normalize=False, positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

In [22]:
print('Alpha Ridge:',lmRidgeCV.alpha_,'\n'
      'Alpha LASSO:',lmLassoCV.alpha_,'\n')

Alpha Ridge: 0.1 
Alpha LASSO: 2.0 



In [23]:
# Calculamos el R2

print("Score Train Lineal:", lm.score(X_train, y_train),"\n"
      "Score Train Ridge:",  lmRidgeCV.score(X_train, y_train),"\n"
      "Score Train Lasso:",  lmLassoCV.score(X_train, y_train))

# Calculamos el MSE

lmpred_Tr = lm.predict(X_train)
lmRidgepred_Tr = lmRidgeCV.predict(X_train)
lmLassoepred_Tr = lmLassoCV.predict(X_train)

print("Train MSE lineal=", mean_squared_error(y_train,lmpred_Tr), "\n"
      "Train MSE Ridge=",  mean_squared_error(y_train,lmRidgepred_Tr), "\n"
      "Train MSE Lasso=",  mean_squared_error(y_train,lmLassoepred_Tr))

Score Train Lineal: 0.11577989307396487 
Score Train Ridge: 0.1157798426901787 
Score Train Lasso: 0.11551583266697651
Train MSE lineal= 1564594.25235679 
Train MSE Ridge= 1564594.3415090065 
Train MSE Lasso= 1565061.497324206
