# Import Libraries

In [None]:
# perform standard imports
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, scale
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from matplotlib import pyplot as plt
from operator import itemgetter
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot') # emulate R's pretty plotting

# print numpy arrays with precision 4
#np.set_printoptions(precision=4)

# Load Data

In [None]:
# Read the Ames training set; lines pandas flags as bad do not raise
# (error_bad_lines=False), and every NaN is replaced by 0.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 in favour of
# on_bad_lines -- revisit when the pandas version is upgraded.
housingData = pd.read_csv("Amestrain.csv", error_bad_lines=False)
housingData = housingData.fillna(0)

# Partition the columns into int64/float64 ones and everything else
numericalData = housingData.select_dtypes(include=['int64', 'float64'])
numericalHeaders = list(numericalData)
nonNumericalData = housingData.drop(columns=numericalHeaders)
nonNumericalDataHeaders = list(nonNumericalData)

# One-hot encode the remaining columns and swap them in for the originals
dummies = pd.get_dummies(nonNumericalData)
housingData = pd.concat(
    [housingData.drop(columns=nonNumericalDataHeaders), dummies],
    axis=1,
)
housingData

# Correlation

In [None]:
# Pairwise correlations between all columns of the encoded frame
correlationMatrix = housingData.corr()
priceCorrelation = correlationMatrix['SalePrice']
# Keep the columns whose correlation with SalePrice is strictly above 0.4
# (the sign matters: strongly negative correlations are not selected)
filtCorr = priceCorrelation.gt(.4)
filtData = priceCorrelation.loc[filtCorr]

In [None]:
# Labels of the columns that passed the correlation filter
corrData = list(filtData.index)

# Feature frame restricted to those columns, with the target itself removed
Xcorr = housingData[corrData].drop(columns=['SalePrice'])

# Build Model

In [None]:
# Separate the prediction target from the features
X = housingData.drop(columns=['SalePrice'])
Y = housingData['SalePrice']

# Grid of 200 ridge penalties, log-spaced from 1e-4 to 1e4
alphas = 10**np.linspace(-4,4,200)

# Fit one ridge model per penalty on the correlated feature subset (Xcorr).
# A list comprehension replaces the repeated np.append, which re-copied the
# whole array on every iteration; `regression` is now a plain list of fitted
# estimators in the same order as `alphas`.
# NOTE: `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older release.
regression = [
    Ridge(alpha=alpha, normalize=True, fit_intercept=True).fit(Xcorr, Y)
    for alpha in alphas
]

# Plotting Regression Coefficients vs. Ridge Penalty 

In [None]:
# One row of coefficients per fitted model, in the order of `regression`
ridge_coef = np.vstack([fitted.coef_ for fitted in regression])

# Coefficient paths: one line per feature against the penalty (log x-axis)
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(alphas, ridge_coef, linewidth=1.5)
ax.set_xscale('log')
ax.set_xlabel('Alpha (log-scale)', fontsize=13)
ax.set_ylabel('Ridge Coefficient', fontsize=13)
ax.set_title('Correlated Features', fontsize=18)

# Cross Validation to determine best alpha

In [None]:
# Seed the global RNG so the random split below is reproducible
np.random.seed(0)
# Boolean mask with one entry per row, drawn from {True, False}; this gives
# a train/validation split of roughly (not exactly) equal sizes
train = np.random.choice([True, False], size=len(housingData))

X_train, X_test = Xcorr[train], Xcorr[~train]
Y_train, Y_test = Y[train], Y[~train]

In [None]:
# Build a 10-fold cross-validation splitter (all other KFold options are
# left at their defaults); the folds themselves are generated later by
# kf.split(...)
kf = KFold(n_splits=10)
# Cell output: the number of splits the splitter will produce
kf.get_n_splits(Xcorr)

In [None]:
# 10-fold cross-validated mean squared error for every penalty in `alphas`.
# Results: `cvs` (ndarray, one MSE per alpha), `min_index` (position of the
# smallest MSE) and `min_cvs` (that MSE).

# Convert to plain arrays once instead of on every fold
X_values = Xcorr.values
Y_values = Y.values

cvs = []

for alpha in alphas:
    fold_errors = []

    # Split the same matrix that is being fitted (Xcorr)
    for train, test in kf.split(Xcorr):
        X_train, Y_train = X_values[train], Y_values[train]
        X_test, Y_test = X_values[test], Y_values[test]

        model = Ridge(alpha=alpha, normalize=True).fit(X_train, Y_train)

        # Residuals on the held-out fold
        fold_errors.append(model.predict(X_test) - Y_test)

    # Pool the residuals of all folds, then average their squares
    error = np.concatenate(fold_errors)
    cvs.append(np.mean(error**2))

cvs = np.asarray(cvs)

# Locate the penalty with the smallest CV error (first one on ties)
min_index = int(np.argmin(cvs))
min_cvs = cvs[min_index]

In [None]:
# CV error against the penalty grid, with the minimum marked in red
fig, ax = plt.subplots(figsize=(8, 6))

best_alpha = alphas[min_index]
ax.plot(alphas, cvs, color="b")
ax.plot(best_alpha, min_cvs, marker="o", color='r', markersize=12)

ax.set_xscale('log')
ax.set_ylabel('CV Error', fontsize=13)
ax.set_xlabel('alpha', fontsize=13)

# Find Coefficient Matrix

In [None]:
# Refit ridge on all rows of Xcorr at the penalty with the lowest CV error
regression = Ridge(alpha=alphas[min_index], normalize=True)
regression.fit(Xcorr, Y)

# Labelled vector: the intercept first, then one coefficient per feature
coef_labels = ['Intercept'] + Xcorr.columns.tolist()
coef_values = np.hstack([regression.intercept_, regression.coef_])
coefficients = pd.Series(data=coef_values, index=coef_labels)
coefficients

In [None]:
# Grid of 100 lasso penalties, log-spaced from 1e-4 to 1e2
grid = np.power(10.0, np.linspace(-4, 2, 100))

# Reproducible random boolean mask, one entry per row of the data
np.random.seed(0)
train = np.random.choice([True, False], size=len(housingData))

# Only the *_Train halves are used below; the *_Test halves are kept as-is
X_Train, X_Test = X[train], X[~train]
Y_Train, Y_Test = Y[train], Y[~train]

# Row i holds the lasso coefficients fitted with penalty grid[i]
coeffecients = np.empty((len(grid), X.shape[1]))

for index, alpha in enumerate(grid):
    lasso = Lasso(alpha=alpha, normalize=True, max_iter=50000)
    # Single-step pipeline; the coefficients are read from `lasso` afterwards
    pipeline = Pipeline([('lasso', lasso)])
    pipeline.fit(X_Train, Y_Train)
    coeffecients[index] = lasso.coef_
coeffecients

In [None]:
# Lasso coefficient paths: one line per feature against the penalty
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(grid, coeffecients, linewidth=2)

# Log-scaled x-axis to match the log-spaced penalty grid
ax.set_xscale('log')
ax.set_xlabel('alpha (log-scale)', fontsize=14)
ax.set_ylabel('Lasso Coefficients', fontsize=14)
