In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import missingno as msngo

# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error, median_absolute_error

from cv_error import *

# Session-wide configuration: silence warnings and set plotting defaults.
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (10, 6)
# matplotlib 3.6 renamed the bundled seaborn style sheets with a 'seaborn-v0_8-'
# prefix and later releases dropped the old names, so the bare 'seaborn-darkgrid'
# fails there. Use the new name when available and fall back to the old one.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)

In [None]:
# Read Datasets/kc_house_data.csv into df and preview its first five rows.
df = pd.read_csv(filepath_or_buffer = 'Datasets/kc_house_data.csv')
df.head(5)

In [None]:
# Remove the 'zipcode', 'id' and 'date' columns from df in place, then preview the result.
# NOTE(review): because the drop is in place, re-running this cell raises KeyError.
df.drop(columns = ['zipcode', 'id', 'date'], inplace = True)
df.head(5)

In [None]:
# Print a short summary with the number of rows (df.shape[0]) and columns (df.shape[1]) of df.
print(f'''La base de datos se compone de las siguientes observaciones
Numero de filas: {df.shape[0]}
Numero de columnas: {df.shape[1]}
''')

In [None]:
# Display the DataFrame.info() summary of df.
df.info()

In [None]:
# Draw missingno's matrix plot of df to inspect missing values visually.
msngo.matrix(df)

In [None]:
# Standardise every column of df with StandardScaler and rebuild a DataFrame that
# keeps the original column names. The 'price' column is then overwritten with
# the natural logarithm of the raw (unscaled) price.
# NOTE(review): the scaler is fit on all rows, before the train/test split that
# happens later in the notebook — confirm this is intended.
scaler = StandardScaler()
df_scaled = (
    pd.DataFrame(data = scaler.fit_transform(df), columns = df.columns)
    .assign(price = np.log(df['price']))
)

In [None]:
# Side-by-side distributions: raw price from df (left) and the log price stored
# in df_scaled['price'] (right).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot once the environment's seaborn version is confirmed.
fig, ax = plt.subplots(1, 2)
sns.distplot(df['price'], bins = 'fd', ax = ax[0])
ax[0].set_title('Histograma del Precio')
sns.distplot(df_scaled['price'], bins = 'fd', ax = ax[1])
# Title typo fixed ('Presio' -> 'Precio') so it matches the left-hand title.
ax[1].set_title('Histograma Logaritmo del Precio')

#### Elección de hiperparámetros

In [None]:
# Build the design matrix and target, then hold out 33% of the rows for testing.
# Features are every column after the first one.
# NOTE(review): this assumes 'price' is column 0 of df_scaled — confirm.
X = df_scaled.iloc[:, 1:]
N = len(X)
# Append a constant column of ones named 'intercept' as the last column.
X.insert(loc = len(X.columns), column = 'intercept', value = np.ones(N))
y = df_scaled['price']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = .33,
    random_state = 15820,
)

In [None]:
# Candidate penalty values: 50 log-spaced points from 10**0 to 10**7
# (num = 50 is np.logspace's default, written out explicitly here).
alphas = np.logspace(start = 0, stop = 7, num = 50, base = 10)
alphas

In [None]:
# Keep the column (regressor) names of the training design matrix.
names_regressors = X_train.columns

In [None]:
# Sweep the ridge penalty over `alphas`. For each value, record the coefficients
# fitted on the training split and the mean of the error estimates returned by
# cv_error (10 folds requested via k = 10).
#
# Fix: the original `coefs_ridge = cv_err_ridge = [], []` is a chained assignment
# that binds BOTH names to the same tuple ([], []), so the first `.append` raised
# AttributeError. Tuple unpacking gives each name its own independent list.
coefs_ridge, cv_err_ridge = [], []
model_ridge = Ridge(fit_intercept = True)

for a in alphas:
        # Reuse one estimator and only change its penalty between fits.
        model_ridge.set_params(alpha = a)
        model_ridge.fit(X_train, y_train)
        coefs_ridge.append(model_ridge.coef_)
        # Only the second value returned by cv_error is used; the first is discarded.
        dummy, cv_err_estimates = cv_error(X_train, y_train, k = 10, method = 'ridge', alpha = a)
        cv_err_ridge.append(np.mean(cv_err_estimates))