In [None]:
#Import packages to be used
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the housing data from CSV, using the 'Id' column as the row index
data_path = 'trimmed_data.csv'
housing_import = pd.read_csv(data_path, index_col='Id')

In [None]:
# Remove the remaining NaN values: keep only rows where MasVnrType is present,
# then drop the MiscFeature column entirely
has_masvnr_type = housing_import.MasVnrType.notnull()
housing_import = housing_import[has_masvnr_type]
housing = housing_import.drop(columns='MiscFeature')

In [None]:
# Preview the dummy-encoded frame; the result is only displayed here, not stored
pd.get_dummies(housing)

My interpretation of the `get_dummies` function is that it takes each categorical variable, adds one new column to the dataframe for each of its categories, and fills that column with a binary indicator marking whether each datapoint belongs to that category.

In [None]:
# Store the dummy-encoded version of the cleaned frame for modelling
housing_dummies = pd.get_dummies(housing)

In [None]:
# Separate the target column (SalePrice) from the feature columns.
# .values converts each pandas object into a plain numpy array.
target_name = 'SalePrice'
y = housing_dummies[target_name].values
X = housing_dummies.drop(columns=[target_name]).values



In [None]:
# Show both shapes. A bare expression is only echoed when it is the last line
# of a cell, so print them explicitly.
print('X shape: {}'.format(X.shape))
print('y shape: {}'.format(y.shape))

# Store the target as a single column of shape (n_samples, 1).
# NOTE(review): scikit-learn regressors also accept a 1-D target. With a 2-D
# target, some estimators return 2-D predictions and others 1-D, so flatten
# both sides before doing element-wise arithmetic on them.
y = y.reshape(-1, 1)

In [None]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the split repeatable
split = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
# Display the shape of the training feature matrix
X_train.shape

In [None]:
# Empty results table: one row per model/split will be added later,
# with the row label (index) named 'Model'
metric_names = ('R_Squared', 'RMSE', 'MAPE')
metrics = pd.DataFrame(data={name: [] for name in metric_names})
metrics = metrics.rename_axis('Model')
metrics

In [None]:
# Create LinearRegression Object (default settings; fitted in a later cell)
reg = LinearRegression()

In [None]:
# Fit the linear model on the training split only
reg.fit(X_train, y_train)

In [None]:
# Predict with the fitted linear model on both the training and the test split
y_pred_train = reg.predict(X_train)
y_pred_test = reg.predict(X_test)

In [None]:
# Performance metrics for the linear model on the training set
r_squared = reg.score(X_train, y_train)
rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
# Flatten actuals and predictions so the percent error is element-wise
# whether the arrays are shaped (n,) or (n, 1)
actual = np.ravel(y_train)
error = (actual - np.ravel(y_pred_train)) / actual * 100
mape = np.abs(error).mean()
print("R^2: {}".format(r_squared))
print("Root Mean Squared Error: {}".format(rmse))
print('Mean Absolute Percent Error: {}'.format(mape))

# Record the scores in the metrics table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any existing row with this label is dropped first, so re-running the cell
# replaces the row rather than duplicating it.
d = {'R_Squared': [r_squared], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Linear_Train'])
temp.index.name = 'Model'
metrics = pd.concat([metrics.drop(index=temp.index, errors='ignore'), temp])

In [None]:
# Performance metrics for the linear model on the test set
r_squared = reg.score(X_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
# Flatten actuals and predictions so the percent error is element-wise
# whether the arrays are shaped (n,) or (n, 1)
actual = np.ravel(y_test)
error = (actual - np.ravel(y_pred_test)) / actual * 100
mape = np.abs(error).mean()
print("R^2: {}".format(r_squared))
print("Root Mean Squared Error: {}".format(rmse))
print('Mean Absolute Percent Error: {}'.format(mape))

# Record the scores in the metrics table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any existing row with this label is dropped first, so re-running the cell
# replaces the row rather than duplicating it.
d = {'R_Squared': [r_squared], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Linear_Test'])
temp.index.name = 'Model'
metrics = pd.concat([metrics.drop(index=temp.index, errors='ignore'), temp])

In [None]:
# Display the metrics table collected so far
metrics

In [None]:
# Ridge regression with alpha = 0.1

# Re-create the split; the same random_state (42) as the earlier split should reproduce it
split = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split

# Build and fit the model in one step, then predict on both splits.
# normalize=True is intended to put all variables on the same scale.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm which scikit-learn version this notebook is pinned to.
ridge = Ridge(alpha=0.1, normalize=True).fit(X_train, y_train)
ridge_pred_train = ridge.predict(X_train)
ridge_pred_test = ridge.predict(X_test)


In [None]:
# Performance metrics for the ridge model on the training set
r_squared = ridge.score(X_train, y_train)
rmse = np.sqrt(mean_squared_error(y_train, ridge_pred_train))
# Flatten actuals and predictions so the percent error is element-wise
# whether the arrays are shaped (n,) or (n, 1)
actual = np.ravel(y_train)
error = (actual - np.ravel(ridge_pred_train)) / actual * 100
mape = np.abs(error).mean()
print("R^2: {}".format(r_squared))
print("Root Mean Squared Error: {}".format(rmse))
print('Mean Absolute Percent Error: {}'.format(mape))

# Record the scores in the metrics table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any existing row with this label is dropped first, so re-running the cell
# replaces the row rather than duplicating it.
d = {'R_Squared': [r_squared], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Ridge_Train'])
temp.index.name = 'Model'
metrics = pd.concat([metrics.drop(index=temp.index, errors='ignore'), temp])

In [None]:
# Performance metrics for the ridge model on the test set
r_squared = ridge.score(X_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred_test))
# Flatten actuals and predictions so the percent error is element-wise
# whether the arrays are shaped (n,) or (n, 1)
actual = np.ravel(y_test)
error = (actual - np.ravel(ridge_pred_test)) / actual * 100
mape = np.abs(error).mean()
print("R^2: {}".format(r_squared))
print("Root Mean Squared Error: {}".format(rmse))
print('Mean Absolute Percent Error: {}'.format(mape))

# Record the scores in the metrics table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any existing row with this label is dropped first, so re-running the cell
# replaces the row rather than duplicating it.
d = {'R_Squared': [r_squared], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Ridge_Test'])
temp.index.name = 'Model'
metrics = pd.concat([metrics.drop(index=temp.index, errors='ignore'), temp])

In [None]:
# Display the metrics table collected so far
metrics

In [None]:
# Lasso regression with alpha = 0.1

# Re-create the split; the same random_state (42) as the earlier split should reproduce it
split = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split

# Build and fit the model in one step, then predict on both splits.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm which scikit-learn version this notebook is pinned to.
lasso = Lasso(alpha=0.1, normalize=True).fit(X_train, y_train)
lasso_pred_train = lasso.predict(X_train)
lasso_pred_test = lasso.predict(X_test)

In [None]:
# Performance metrics for the lasso model on the training set
r_squared = lasso.score(X_train, y_train)
rmse = np.sqrt(mean_squared_error(y_train, lasso_pred_train))
# y_train is a column of shape (n, 1) after the earlier reshape, while the
# lasso predictions are 1-D. Subtracting them directly broadcasts to an
# (n, n) matrix and yields a meaningless MAPE, so flatten both sides first.
actual = np.ravel(y_train)
error = (actual - np.ravel(lasso_pred_train)) / actual * 100
mape = np.abs(error).mean()
print("R^2: {}".format(r_squared))
print("Root Mean Squared Error: {}".format(rmse))
print('Mean Absolute Percent Error: {}'.format(mape))

# Record the scores in the metrics table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any existing row with this label is dropped first, so re-running the cell
# replaces the row rather than duplicating it.
d = {'R_Squared': [r_squared], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Lasso_Train'])
temp.index.name = 'Model'
metrics = pd.concat([metrics.drop(index=temp.index, errors='ignore'), temp])

In [None]:
# Performance metrics for the lasso model on the test set
r_squared = lasso.score(X_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, lasso_pred_test))
# y_test is a column of shape (n, 1) after the earlier reshape, while the
# lasso predictions are 1-D. Subtracting them directly broadcasts to an
# (n, n) matrix and yields a meaningless MAPE, so flatten both sides first.
actual = np.ravel(y_test)
error = (actual - np.ravel(lasso_pred_test)) / actual * 100
mape = np.abs(error).mean()
print("R^2: {}".format(r_squared))
print("Root Mean Squared Error: {}".format(rmse))
print('Mean Absolute Percent Error: {}'.format(mape))

# Record the scores in the metrics table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any existing row with this label is dropped first, so re-running the cell
# replaces the row rather than duplicating it.
d = {'R_Squared': [r_squared], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Lasso_Test'])
temp.index.name = 'Model'
metrics = pd.concat([metrics.drop(index=temp.index, errors='ignore'), temp])

In [None]:
# Display the metrics table collected so far
metrics

In [None]:
# Predicted vs actual scatterplot (ridge predictions on the test set)
plt.scatter(ridge_pred_test, y_test, alpha=0.5, s=4)
plt.title('Predicted vs. Actual Sale price')
plt.xlabel('Predicted Sale Price')
plt.ylabel('Actual Sale Price')
# Red y = x reference line: points on it are predicted exactly.
# A straight line only needs its two endpoints, so there is no need to build
# a list holding every integer up to the maximum price.
price_max = y_test.max()
plt.plot([0, price_max], [0, price_max], c='r')
plt.show()

In [None]:
# Predicted vs residuals (residual = predicted - actual, on the test set)
residuals = ridge_pred_test-y_test
plt.scatter(ridge_pred_test, residuals, alpha = 0.5, s=4)
# Red zero-error reference line from 0 to the maximum actual price;
# two endpoints are enough to draw it
plt.plot([0, y_test.max()], [0, 0], c='r')
plt.title('Residuals vs. Predicted Sale Price')
plt.xlabel('Predicted Sale Price')
plt.ylabel('Residual (Predicted - Actual)')
plt.show()

In [None]:
# Histogram of residuals, marked with the mean and +/- 1, 2 and 3 standard deviations
mean = np.mean(residuals)
std_dev = np.std(residuals)

plt.hist(residuals, bins=20)
plt.axvline(mean, color='r')  # solid line: mean residual
# One line style per band: dashed = 1 sd, dash-dot = 2 sd, dotted = 3 sd
for n_sd, band_style in ((1, '--'), (2, '-.'), (3, ':')):
    for sign in (1, -1):
        plt.axvline(mean + sign * n_sd * std_dev, color='r', linestyle=band_style)
plt.title('Residuals')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Residuals more than 3 standard deviations above the mean residual
residuals[residuals > mean + 3* std_dev]

In [None]:
# Residuals more than 3 standard deviations below the mean residual
residuals[residuals < mean - 3* std_dev]

In [None]:
# Setup the hyperparameter grid: alpha = 0.0, 0.1, ..., 1.0
alpha_space = list(np.arange(11)/10)
param_grid = {'alpha': alpha_space}

# Instantiate a ridge regressor: ridge
# NOTE(review): this estimator is not normalized, unlike the
# Ridge(normalize=True) models fitted elsewhere in this notebook, so the alpha
# selected here may not carry over to them -- confirm.
ridge = Ridge()

# Instantiate the GridSearchCV object (5-fold cross-validation): ridge_cv
ridge_cv = GridSearchCV(ridge, param_grid, cv=5)

# Fit on the training split only, so the held-out test rows play no part in
# choosing alpha
ridge_cv.fit(X_train, y_train)

# Print the tuned parameters and score
print("Tuned Ridge Regression Parameters: {}".format(ridge_cv.best_params_))
print("Best score is {}".format(ridge_cv.best_score_))

In [None]:
# Inspect the full cross-validation results recorded by the grid search
ridge_cv.cv_results_

In [None]:
# Ridge regression again, this time with alpha = 1 chosen from the grid search CV results

# Re-create the split; the same random_state (42) as the earlier split should reproduce it
split = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split

# Build and fit the model in one step, then predict on both splits.
# normalize=True is intended to put all variables on the same scale.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm which scikit-learn version this notebook is pinned to.
ridge = Ridge(alpha=1, normalize=True).fit(X_train, y_train)
ridge_pred_train = ridge.predict(X_train)
ridge_pred_test = ridge.predict(X_test)


In [None]:
# Performance metrics for the alpha = 1 ridge model on the training set
r_squared = ridge.score(X_train, y_train)
rmse = np.sqrt(mean_squared_error(y_train, ridge_pred_train))
# Flatten actuals and predictions so the percent error is element-wise
# whether the arrays are shaped (n,) or (n, 1)
actual = np.ravel(y_train)
error = (actual - np.ravel(ridge_pred_train)) / actual * 100
mape = np.abs(error).mean()
print("R^2: {}".format(r_squared))
print("Root Mean Squared Error: {}".format(rmse))
print('Mean Absolute Percent Error: {}'.format(mape))

# Record the scores in the metrics table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any existing row with this label is dropped first, so re-running the cell
# replaces the row rather than duplicating it.
d = {'R_Squared': [r_squared], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Ridge_Train_a=1'])
temp.index.name = 'Model'
metrics = pd.concat([metrics.drop(index=temp.index, errors='ignore'), temp])

In [None]:
# Performance metrics for the alpha = 1 ridge model on the test set
r_squared = ridge.score(X_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, ridge_pred_test))
# Flatten actuals and predictions so the percent error is element-wise
# whether the arrays are shaped (n,) or (n, 1)
actual = np.ravel(y_test)
error = (actual - np.ravel(ridge_pred_test)) / actual * 100
mape = np.abs(error).mean()
print("R^2: {}".format(r_squared))
print("Root Mean Squared Error: {}".format(rmse))
print('Mean Absolute Percent Error: {}'.format(mape))

# Record the scores in the metrics table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any existing row with this label is dropped first, so re-running the cell
# replaces the row rather than duplicating it.
d = {'R_Squared': [r_squared], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Ridge_Test_a=1'])
temp.index.name = 'Model'
metrics = pd.concat([metrics.drop(index=temp.index, errors='ignore'), temp])

In [None]:
# Display the metrics table collected so far
metrics

In [None]:
# Rerunning graphics with different ridge model

# Predicted vs actual scatterplot (ridge predictions on the test set)
plt.scatter(ridge_pred_test, y_test, alpha=0.5, s=4)
plt.title('Predicted vs. Actual Sale price')
plt.xlabel('Predicted Sale Price')
plt.ylabel('Actual Sale Price')
# Red y = x reference line: points on it are predicted exactly.
# A straight line only needs its two endpoints, so there is no need to build
# a list holding every integer up to the maximum price.
price_max = y_test.max()
plt.plot([0, price_max], [0, price_max], c='r')
plt.show()

In [None]:
# Predicted vs residuals (residual = predicted - actual).
# Training-set residuals are computed here too; only the test-set ones are plotted.
residuals = ridge_pred_test-y_test
residuals_train = ridge_pred_train-y_train
plt.scatter(ridge_pred_test, residuals, alpha = 0.5, s=4)
# Red zero-error reference line from 0 to the maximum actual price;
# two endpoints are enough to draw it
plt.plot([0, y_test.max()], [0, 0], c='r')
plt.title('Residuals vs. Predicted Sale Price')
plt.xlabel('Predicted Sale Price')
plt.ylabel('Residual (Predicted - Actual)')
plt.show()

In [None]:
# Histogram of test-set residuals, marked with the mean and +/- 1, 2 and 3 standard deviations
mean = np.mean(residuals)
std_dev = np.std(residuals)

# Same summary statistics for the training-set residuals (not plotted in this cell)
mean_train = np.mean(residuals_train)
std_dev_train = np.std(residuals_train)

plt.hist(residuals, bins=20)
plt.axvline(mean, color='r')  # solid line: mean residual
# One line style per band: dashed = 1 sd, dash-dot = 2 sd, dotted = 3 sd
for n_sd, band_style in ((1, '--'), (2, '-.'), (3, ':')):
    for sign in (1, -1):
        plt.axvline(mean + sign * n_sd * std_dev, color='r', linestyle=band_style)
plt.title('Residuals')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Test-set residuals more than 3 standard deviations above the mean residual
residuals[residuals > mean + 3* std_dev]

In [None]:
# Test-set residuals more than 3 standard deviations below the mean residual
residuals[residuals < mean - 3* std_dev]

In [None]:
# Row indices of the outlier points: residuals lying more than 3 standard
# deviations from the mean residual, in either direction.
# The distance is measured from the mean (|r - mean|) so both tails share the
# same cutoff; comparing |r| against mean + 3*std shifts the cutoff whenever
# the mean residual is not zero.
high_residuals = np.where(np.abs(residuals - mean) > 3 * std_dev)
high_residuals_train = np.where(np.abs(residuals_train - mean_train) > 3 * std_dev_train)

In [None]:
# Open question from the author: should outliers be found and removed from the
# train set, the test set, or both?
# NOTE(review): outliers are normally removed from the training data only. Also
# dropping test rows selected by the model's own residuals will likely make the
# test scores look better than they really are -- confirm this is intended.

# Dropping Outliers: np.where returned a tuple, whose first entry holds the row indices
test_rows_to_drop = high_residuals[0]
train_rows_to_drop = high_residuals_train[0]

X_train_no_outliers = np.delete(X_train, train_rows_to_drop, axis=0)
y_train_no_outliers = np.delete(y_train, train_rows_to_drop, axis=0)
X_test_no_outliers = np.delete(X_test, test_rows_to_drop, axis=0)
y_test_no_outliers = np.delete(y_test, test_rows_to_drop, axis=0)

In [None]:
# Sanity check for the outlier removal: array shapes before -> after, plus the
# row indices that were dropped from each split. This replaces a run of
# single-expression cells (one of them duplicated) with one summary.
print('X_test:  {} -> {}'.format(X_test.shape, X_test_no_outliers.shape))
print('y_test:  {} -> {}'.format(y_test.shape, y_test_no_outliers.shape))
print('X_train: {} -> {}'.format(X_train.shape, X_train_no_outliers.shape))
print('y_train: {} -> {}'.format(y_train.shape, y_train_no_outliers.shape))
print('Dropped test rows:  {}'.format(high_residuals[0]))
print('Dropped train rows: {}'.format(high_residuals_train[0]))

In [None]:
# Ridge regression (alpha = 1) refitted on the training data with outliers removed

# Build and fit the model in one step, then predict on both outlier-free splits.
# normalize=True is intended to put all variables on the same scale.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm which scikit-learn version this notebook is pinned to.
ridge = Ridge(alpha=1, normalize=True).fit(X_train_no_outliers, y_train_no_outliers)
ridge_pred_no_train = ridge.predict(X_train_no_outliers)
ridge_pred_no_test = ridge.predict(X_test_no_outliers)


In [None]:
# Performance metrics for the outlier-free ridge model on the training set
r_squared = ridge.score(X_train_no_outliers, y_train_no_outliers)
rmse = np.sqrt(mean_squared_error(y_train_no_outliers, ridge_pred_no_train))
# Flatten actuals and predictions so the percent error is element-wise
# whether the arrays are shaped (n,) or (n, 1)
actual = np.ravel(y_train_no_outliers)
error = (actual - np.ravel(ridge_pred_no_train)) / actual * 100
mape = np.abs(error).mean()
print("R^2: {}".format(r_squared))
print("Root Mean Squared Error: {}".format(rmse))
print('Mean Absolute Percent Error: {}'.format(mape))

# Record the scores in the metrics table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any existing row with this label is dropped first, so re-running the cell
# replaces the row rather than duplicating it.
d = {'R_Squared': [r_squared], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Ridge_Train_No_Outliers'])
temp.index.name = 'Model'
metrics = pd.concat([metrics.drop(index=temp.index, errors='ignore'), temp])

In [None]:
# Performance metrics for the outlier-free ridge model on the test set
r_squared = ridge.score(X_test_no_outliers, y_test_no_outliers)
rmse = np.sqrt(mean_squared_error(y_test_no_outliers, ridge_pred_no_test))
# Flatten actuals and predictions so the percent error is element-wise
# whether the arrays are shaped (n,) or (n, 1)
actual = np.ravel(y_test_no_outliers)
error = (actual - np.ravel(ridge_pred_no_test)) / actual * 100
mape = np.abs(error).mean()
print("R^2: {}".format(r_squared))
print("Root Mean Squared Error: {}".format(rmse))
print('Mean Absolute Percent Error: {}'.format(mape))

# Record the scores in the metrics table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Any existing row with this label is dropped first, so re-running the cell
# replaces the row rather than duplicating it.
d = {'R_Squared': [r_squared], 'RMSE': [rmse], 'MAPE': [mape]}
temp = pd.DataFrame(data=d, index=['Ridge_Test_No_Outliers'])
temp.index.name = 'Model'
metrics = pd.concat([metrics.drop(index=temp.index, errors='ignore'), temp])

In [None]:
# Display the metrics table with every model/split recorded so far
metrics

In [None]:
# Rerunning graphics with No_outliers model

# Predicted vs actual scatterplot (outlier-free test set)
plt.scatter(ridge_pred_no_test, y_test_no_outliers, alpha=0.5, s=4)
plt.title('Predicted vs. Actual Sale price')
plt.xlabel('Predicted Sale Price')
plt.ylabel('Actual Sale Price')
# Red y = x reference line: points on it are predicted exactly.
# A straight line only needs its two endpoints, so there is no need to build
# a list holding every integer up to the maximum price.
price_max = y_test_no_outliers.max()
plt.plot([0, price_max], [0, price_max], c='r')
plt.show()

In [None]:
# Predicted vs residuals (residual = predicted - actual) for the outlier-free data.
# Training-set residuals are computed here too; only the test-set ones are plotted.
residuals = ridge_pred_no_test-y_test_no_outliers
residuals_train = ridge_pred_no_train-y_train_no_outliers
plt.scatter(ridge_pred_no_test, residuals, alpha = 0.5, s=4)
# Red zero-error reference line from 0 to the maximum actual price;
# two endpoints are enough to draw it
plt.plot([0, y_test_no_outliers.max()], [0, 0], c='r')
plt.title('Residuals vs. Predicted Sale Price')
plt.xlabel('Predicted Sale Price')
plt.ylabel('Residual (Predicted - Actual)')
plt.show()

In [None]:

# Histogram of test-set residuals, marked with the mean and +/- 1, 2 and 3 standard deviations

mean = np.mean(residuals)
std_dev = np.std(residuals)

# Same summary statistics for the training-set residuals (not plotted in this cell)
mean_train = np.mean(residuals_train)
std_dev_train = np.std(residuals_train)

plt.hist(residuals, bins=20)
plt.axvline(mean, color='r')  # solid line: mean residual
# One line style per band: dashed = 1 sd, dash-dot = 2 sd, dotted = 3 sd
for n_sd, band_style in ((1, '--'), (2, '-.'), (3, ':')):
    for sign in (1, -1):
        plt.axvline(mean + sign * n_sd * std_dev, color='r', linestyle=band_style)
plt.title('Residuals')
plt.ylabel('Frequency')
plt.show()