In [1]:
## imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

In [5]:
## loading training and testing dataframes
## train_df must come from the training file: it is the frame that supplies the
## 'saleprice' target further down, so each name reads the file that matches it
train_df = pd.read_csv('../data/complete_train_df')
test_df = pd.read_csv('../data/complete_test_df')

In [None]:
## use the Id column as the index for both the train and test frames
## the guard keeps the cell safe to re-run: once Id is already the index, calling
## set_index("Id") again would raise KeyError; a frame that never had an Id column
## still raises, so a genuinely missing column is not hidden
train_df = train_df if train_df.index.name == "Id" else train_df.set_index("Id")
test_df = test_df if test_df.index.name == "Id" else test_df.set_index("Id")

In [None]:
## preview the first five rows of the training frame
train_df.head(n=5)

### First Model: Linear Regression

In [None]:
## separate the target column from the predictors
y = train_df.loc[:, 'saleprice']
X = train_df.drop(columns='saleprice')

## hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2021,
)

In [None]:
## scaling the data for LASSO and Ridge models
sc = StandardScaler()

## learn the mean/std from the training split only ...
Z_train = sc.fit_transform(X_train)
## ... and reuse those statistics for the test split; re-fitting on X_test would
## leak test information and put the two splits on different scales
Z_test = sc.transform(X_test)

In [None]:
## plain least-squares linear model fitted on the unscaled training features
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
## score of the linear model on the rows it was fitted on
print(f'Training score: {lr.score(X=X_train, y=y_train)}')

In [None]:
## score of the linear model on the held-out rows
print(f'Testing score: {lr.score(X=X_test, y=y_test)}')

In [None]:
## linear-model predictions for both halves of the split taken from the training DF
train_preds, test_preds = lr.predict(X=X_train), lr.predict(X=X_test)

In [None]:
## calculating RMSE for the training split of the training DF
## np.sqrt of the MSE replaces the `squared=False` flag, which newer scikit-learn
## releases deprecate; the value is the same and works on old and new versions
print(f'RMSE for training: {np.sqrt(mean_squared_error(y_train, train_preds))}')

In [None]:
## RMSE of the linear model on the held-out split
## np.sqrt of the MSE replaces the `squared=False` flag, which newer scikit-learn
## releases deprecate; the value is the same and works on old and new versions
print(f'RMSE for testing: {np.sqrt(mean_squared_error(y_test, test_preds))}')

In [None]:
## mean cross-validated score of the linear model on the full (unsplit) X, y,
## using cross_val_score's default folds and the estimator's default scorer
cross_val_score(estimator=lr, X=X, y=y).mean()

In [None]:
## raw fitted coefficients of the linear model (lr was fitted on X_train)
lr.coef_

In [None]:
## per-feature table of the fitted coefficients and their magnitudes

lr_coef = pd.DataFrame(
    dict(
        column_name=X.columns,
        coef=lr.coef_,
        abs_coef=np.abs(lr.coef_),
    )
)

lr_coef

In [None]:
# Plot top 30 features (sorted by absolute regression coefficient)
plt.figure(figsize=(12,10))

# keep the 30 largest-magnitude coefficients, then order them by signed value for display
data = lr_coef.sort_values(by='abs_coef', ascending=False).head(30)[['column_name', 'coef']] \
               .sort_values(by='coef', ascending=False).reset_index(drop=True)

# plot the filtered frame (not the full lr_coef table) so the chart matches its title,
# and take the tick labels from the same frame so labels and bars stay aligned
ax = sns.barplot(data=data, y='column_name', x='coef', orient='h', palette='Spectral')
ax.set_ylabel('')
ax.set_yticklabels(data['column_name'], size=11)
ax.set_xlabel('Linear Regression Coefficient', fontsize=14)
plt.title('Top 30 Housing Features', fontsize=20);

In [None]:
## plotting residuals (actual - predicted, i.e. y - y_hat) against the predicted values
## the subtraction order follows that definition, so a positive residual means the
## model under-predicted the sale price
plt.scatter(train_preds, y_train - train_preds, c = "green", marker = "o", label = "Training data", alpha=0.6)
plt.scatter(test_preds, y_test - test_preds, c = "orange", marker = "o", label = "Testing data", alpha=0.6)
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
## zero-error reference line
plt.hlines(y = 0, xmin = 0, xmax = 500_000, color = "red");

In [None]:
## plotting predicted values vs actual values for the held-out split
## x and y are passed as vectors, so no `data=` frame is given; the training
## feature frame previously passed there is unrelated to these vectors

sns.regplot(x=test_preds, y=y_test, marker='o', color='orange', line_kws={'color':'red'})
plt.xlabel('Predicted Sale Price', fontsize=14)
plt.ylabel('Actual Sale Price', fontsize=14)
plt.title('Predictions of Sale Price vs Actual Sale Price', fontsize=15);

### Ridge Model

In [None]:
## candidate regularisation strengths: 100 log-spaced values from 10**0 to 10**3
ridge_alpha = np.logspace(start=0, stop=3, num=100)

## RidgeCV scores every candidate alpha with 5-fold cross-validation on r2
ridge_cv = RidgeCV(
    alphas=ridge_alpha,
    scoring='r2',
    cv=5,
)

## run the alpha search on the scaled training split
ridge_cv.fit(X=Z_train, y=y_train)

In [None]:
## alpha selected by the RidgeCV search over the candidate alphas
ridge_cv.alpha_

In [None]:
## r2 of the ridge model on the scaled training split
ridge_cv.score(X=Z_train, y=y_train)

In [None]:
## r2 of the ridge model on the scaled testing split
ridge_cv.score(X=Z_test, y=y_test)

In [None]:
## cross-validation score recorded for the selected alpha (RidgeCV was built with scoring='r2')
ridge_cv.best_score_

In [None]:
## ridge predictions on the scaled training and testing splits
z_preds_train = ridge_cv.predict(X=Z_train)
z_preds_test = ridge_cv.predict(X=Z_test)

In [None]:
## RMSE of the current z_preds_train against the training targets
## np.sqrt of the MSE replaces the `squared=False` flag, which newer scikit-learn
## releases deprecate; the value is the same and works on old and new versions
print(f'RMSE for training: {np.sqrt(mean_squared_error(y_train, z_preds_train))}')

In [6]:
## RMSE of the current z_preds_test against the held-out targets
## (needs y_test and z_preds_test from the cells above, so run the notebook top to bottom)
## np.sqrt of the MSE replaces the `squared=False` flag, which newer scikit-learn
## releases deprecate; the value is the same and works on old and new versions
print(f'RMSE for testing: {np.sqrt(mean_squared_error(y_test, z_preds_test))}')

NameError: name 'y_test' is not defined

In [None]:
## plotting residuals (actual - predicted, i.e. y - y_hat) against the predicted values
## the subtraction order follows that definition, so a positive residual means the
## model under-predicted the sale price
plt.figure(figsize=(8,8))
plt.scatter(z_preds_train, y_train - z_preds_train, c = "green", marker = "o", label = "Training data", alpha=0.6)
plt.scatter(z_preds_test, y_test - z_preds_test, c = "orange", marker = "o", label = "Testing data", alpha=0.6)
plt.title("Linear regression with Ridge")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
## zero-error reference line
plt.hlines(y = 0, xmin = 0, xmax = 700_000,color = "red");

In [None]:
## plotting predicted values vs actual values for the held-out split
## x and y are passed as vectors, so no `data=` frame is given; the full feature
## frame previously passed there is unrelated to these vectors

plt.figure(figsize=(8,8))
sns.regplot(x=z_preds_test, y=y_test, marker='o', color='orange', line_kws={'color':'red'})
plt.xlabel('Predicted Sale Price', fontsize=14)
plt.ylabel('Actual Sale Price', fontsize=14)
plt.title('Predictions of Sale Price vs Actual Sale Price', fontsize=20);

### LASSO Model

In [None]:
## candidate regularisation strengths: 100 log-spaced values from 10**-3 to 10**0
lasso_alphas = np.logspace(start=-3, stop=0, num=100)

## 5-fold LassoCV with a very high iteration cap and a very tight tolerance
lasso_cv = LassoCV(
    alphas=lasso_alphas,
    cv=5,
    max_iter=10_000_000,
    tol=1e-12,
)

## run the alpha search on the scaled training split
lasso_cv.fit(X=Z_train, y=y_train)

In [None]:
## alpha selected by the LassoCV search over the candidate alphas
lasso_cv.alpha_

In [None]:
## score of the lasso model on the scaled training split
lasso_cv.score(X=Z_train, y=y_train)

In [None]:
## score of the lasso model on the scaled testing split
lasso_cv.score(X=Z_test, y=y_test)

In [None]:
## lasso predictions on the scaled training and testing splits
## (these rebind the z_preds_* names that held the ridge predictions)
z_preds_train = lasso_cv.predict(X=Z_train)
z_preds_test = lasso_cv.predict(X=Z_test)

In [None]:
## RMSE of the current z_preds_train against the training targets
## np.sqrt of the MSE replaces the `squared=False` flag, which newer scikit-learn
## releases deprecate; the value is the same and works on old and new versions
print(f'RMSE for training: {np.sqrt(mean_squared_error(y_train, z_preds_train))}')

In [None]:
## RMSE of the current z_preds_test against the held-out targets
## np.sqrt of the MSE replaces the `squared=False` flag, which newer scikit-learn
## releases deprecate; the value is the same and works on old and new versions
print(f'RMSE for testing: {np.sqrt(mean_squared_error(y_test, z_preds_test))}')

In [None]:
## plotting residuals (actual - predicted, i.e. y - y_hat) against the predicted values
## the subtraction order follows that definition, so a positive residual means the
## model under-predicted the sale price
plt.figure(figsize=(8,8))
plt.scatter(z_preds_train, y_train - z_preds_train, c = "green", marker = "o", label = "Training data", alpha=0.6)
plt.scatter(z_preds_test, y_test - z_preds_test, c = "orange", marker = "o", label = "Testing data", alpha=0.6)
plt.title("Linear regression with LASSO")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
## zero-error reference line
plt.hlines(y = 0, xmin = 0, xmax = 700_000,color = "red");

In [None]:
## plotting predicted values vs actual values for the held-out split
## x and y are passed as vectors, so no `data=` frame is given; the full feature
## frame previously passed there is unrelated to these vectors

plt.figure(figsize=(8,8))
sns.regplot(x=z_preds_test, y=y_test, marker='o', color='orange', line_kws={'color':'red'})
plt.xlabel('Predicted Sale Price', fontsize=14)
plt.ylabel('Actual Sale Price', fontsize=14)
plt.title('Predictions of Sale Price vs Actual Sale Price', fontsize=20);

### Baseline model

In [None]:
## baseline model: predict the mean training target for every row of each split
baseline_train_preds = [y_train.mean()] * y_train.shape[0]
baseline_test_preds  = [y_train.mean()] * y_test.shape[0]

In [None]:
## baseline RMSE on the training split (constant prediction = mean of y_train)
y_train_baseline = [y_train.mean()] *len(y_train)
## np.sqrt of the MSE replaces the `squared=False` flag, which newer scikit-learn
## releases deprecate; the value is the same and works on old and new versions
print(f'Baseline RMSE Train: {np.sqrt(mean_squared_error(y_train, y_train_baseline))}')

In [None]:
## baseline RMSE on the testing split; the constant prediction is still the
## mean of y_train, so nothing from the test targets feeds the baseline
y_test_baseline = [y_train.mean()]*len(y_test)
## np.sqrt of the MSE replaces the `squared=False` flag, which newer scikit-learn
## releases deprecate; the value is the same and works on old and new versions
print(f'Baseline RMSE Test: {np.sqrt(mean_squared_error(y_test, y_test_baseline))}')