# E5-1 Predicting House Prices by Regression
This example demonstrates machine learning by regression.
We will be using three regression algorithms:
- linear regression
- multiple linear regression
- polynomial regression

The dataset comes from King County, USA (kaggle.com).

## Step 1: Imports

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics as sm

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from mpl_toolkits.mplot3d import Axes3D

## Step 2: Data

In [None]:
# read the data
df = pd.read_csv("../../data/kc_house_data.csv")

In [None]:
df

In [None]:
# get idea of the look
df.head()

In [None]:
# convert sqft to sqm
df['sqm'] = df['sqft_living']/10.764

In [None]:
df.describe()

In [None]:
# see which are the attribute labels
list(df)

In [None]:
df.info()

In [None]:
# get idea of columns and types
df.info()

In [None]:
# get idea of basic statistical parameters for each column
df.describe()

In [None]:
# to avoid scientific notation, e.g. e+04
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [None]:
df.describe()

In [None]:
# to check null values in data
df.isnull().sum()

The output above shows that the data does not contain any null values.

In [None]:
# same visualized
sns.heatmap(df.isnull(), yticklabels= False, cbar=False, cmap= 'viridis')

In [None]:
df['bedrooms'].value_counts().plot(kind='bar')

In [None]:
# same counts, drawn with seaborn instead of the pandas plotting API
# (the column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as `x`)
sns.countplot(x='bedrooms', data=df, order=df['bedrooms'].value_counts().index)

In [None]:
# Did the size matter?
ranges = [0,100,200,300,400,500,600,700,800, 900, 1000, 1100, 1200, 1300] 
df['sqm'].groupby([(pd.cut(df.sqm, ranges))]).count()

In [None]:
# Distribution of the living area in square metres, normalised to a density.
# NOTE(review): seaborn has deprecated `distplot` in favour of `histplot` /
# `displot` - consider migrating once the installed seaborn version is confirmed.
sns.distplot(df['sqm'],  label='sqm_living', norm_hist=True)  

In [None]:
df['waterfront'].value_counts()

163 houses have a view of a waterfront.

In [None]:
df['view'].value_counts()

19489 flats have not been viewed at all, whereas 319 flats have been viewed 4 times

### Investigate the inter-dependencies of the columns
Create a correlation matrix to see which features influence the output the most, and whether some of the features are correlated with each other. <br>
If two features are strongly correlated, one of them is enough to represent both.

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only.
# Any non-numeric column is excluded up front: older pandas dropped such
# columns silently inside DataFrame.corr(), recent releases raise instead.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
corr_matrix

In [None]:
# plot the matrix as a heat map
plt.subplots(figsize = (16, 12))
sns.heatmap(corr_matrix, annot=True)

## Step 3: Train a Model

## 3.1 Linear Regression

In [None]:
# split the set into subsets for training and testing
train_data, test_data = train_test_split(df, train_size =0.8, random_state = 3)

In [None]:
# We chose the living space in sqm as an input and price as an output
# reshape(-1,1) reshapes the column in one-dimensional array (1 column) and unknown number of rows (-1)
X_train = np.array(train_data['sqm']).reshape(-1,1)
y_train = np.array(train_data['price']).reshape(-1, 1)
X_test = np.array(test_data['sqm']).reshape(-1,1)
y_test = np.array(test_data['price']).reshape(-1, 1)

In [None]:
y_train

In [None]:
# plot the train set
plt.xlabel('Living Space')
plt.ylabel('price')

plt.scatter(X_train, y_train, color='brown')

plt.show()

In [None]:
# plot the test set
plt.xlabel('Living Space')
plt.ylabel('price')

plt.scatter(X_test, y_test, color='green')

plt.show()

In [None]:
# Create regressor
regressor = linear_model.LinearRegression()

In [None]:
# Train model
model = regressor.fit(X_train, y_train)

## Step 4: Test the Model

In [None]:
# make prediction
y_predicted = regressor.predict(X_test)

In [None]:
y_predicted

In [None]:
y_test

In [None]:
# plot the test data together with the fitted regression line
plt.xlabel('Living Space')
plt.ylabel('price')

plt.scatter(X_test, y_test, color='green', label = 'test data')
plt.plot(X_test, y_predicted, color='red', linewidth=2, label= 'predicted regression line')

# the legend is built from the labelled artists already on the axes,
# so it must be requested after scatter/plot, not before
plt.legend()

plt.show()

In [None]:
# Read the fitted coefficient a (slope) and intercept b from the regressor,
# so the formula is y = a*x + b
a = regressor.coef_
b = regressor.intercept_
print('Coefficient a: ', a)
print('Intercept b: ', b)

In [None]:
# Compute performance metrics
print("Linear Regressor Performance")

# The mean squared error
print("Mean squared error: %.2f" % sm.mean_squared_error(y_test, y_predicted))

# Explained variance score: 1 is perfect prediction
print('Explained variance score ', round(sm.explained_variance_score(y_test, y_predicted), 2))
print('R-squared (training) ', round(regressor.score(X_train, y_train), 2))
print('R-squared (testing) ',  round(regressor.score(X_test, y_test), 2))
print('R2 score: %.2f' % sm.r2_score(y_test, y_predicted))

In [None]:
plt.scatter(X_test, y_test, color= 'darkgreen', label = 'data')
plt.plot(X_train, regressor.predict(X_train), color='red', label= ' Predicted Regression line')
plt.xlabel('Living Space (sqm)')
plt.ylabel('price')
plt.legend()

## 3.2 Multiple Regression
This is a linear regression, where more input features participate.<br>
_y = c + a1 x X1 + a2 x X2 + a3 x X3 + a4 x X4 + a5 x X5 + a6 x X6_

In [None]:
# select features
features1 = ['bedrooms','bathrooms','grade']
regressor1 = linear_model.LinearRegression()

In [None]:
regressor1.fit(train_data[features1],train_data['price'])

In [None]:
y_prediction = regressor1.predict(test_data[features1])

In [None]:
# Compute performance metrics
print("Multiple Regression Performance")

# The coefficients
print('Coefficients ai (one for each input variable/feature): ', regressor1.coef_)
print('Intercept c: ', regressor1.intercept_)

In [None]:
# The mean squared error on the test split
# (y_test is the test-split price array built in the linear regression section)
print("Mean squared error: %.2f" % sm.mean_squared_error(y_test, y_prediction))

# R-squared on the training and the test split, followed by the explained
# variance and R2 score on the test split; for all of them 1 is a perfect prediction
print('R-squared (training) ', round(regressor1.score(train_data[features1], train_data['price']), 3))
print('R-squared (testing) ', round(regressor1.score(test_data[features1], test_data['price']), 3))
print('Explained variance score ', round(sm.explained_variance_score(y_test, y_prediction), 2))
print('R2 score: %.2f' % sm.r2_score(y_test, y_prediction))

__Exercise__: Try another combination of features

## 3.3 Polynomial Regression 
It is a form of regression, in which the relationship between the independent variable __X__ and the dependent variable __y__ is modelled as an __n-th degree__ polynomial in x.<br>
_y = a + b1 x X + b2 x X^2 + ... + bn x X^n_

In [None]:
y_train_poly = np.array(train_data['price']).reshape(-1, 1)

In [None]:
# We chose the living space in sqm as an input and price as an output
# reshape(-1,1) reshapes the column in one-dimensional array (1 column) and unknown number of rows (-1)
X_train = np.array(train_data['sqm']).reshape(-1,1)
y_train = np.array(train_data['price']).reshape(-1, 1)
X_test = np.array(test_data['sqm']).reshape(-1,1)
y_test = np.array(test_data['price']).reshape(-1, 1)

In [None]:
# polynomial regression transforms the features of X, e.g. from [x_1, x_2]
# to [1, x_1, x_2, x_1^2, x_1 x_2, x_2^2] for degree 2,
# so that they can be used in a linear regression model
# (only the single feature 'sqm' is selected here; a multi-feature alternative
# is kept below for reference)
# features2 = ['sqm','waterfront','yr_built']
features2 = 'sqm'
# reshape(-1,1) turns the selected column into a 2-D array with one column
train_data_feature = train_data[features2].values.reshape(-1,1)
test_data_feature = test_data[features2].values.reshape(-1,1)
#y_train_data_reshaped = y_train.values.reshape(-1,1)
# train_data['price']

In [None]:
from sklearn.preprocessing import PolynomialFeatures
def polynomial_regression_model(degree):
    """Fit a polynomial regression of the given degree and predict on both splits.

    Uses the notebook-level arrays ``X_train``, ``y_train`` and ``X_test``:
    the inputs are expanded into polynomial features, a linear regression is
    fitted on the expanded training features, and predictions are computed
    for the training and the test inputs.

    Parameters
    ----------
    degree : int
        Degree of the polynomial feature expansion (passed to
        ``PolynomialFeatures``).

    Returns
    -------
    tuple
        ``(poly_model, y_train_predicted, y_test_predicted)`` - the fitted
        ``LinearRegression`` and its predictions for ``X_train`` and ``X_test``.
        Earlier versions computed these values but discarded them.
    """
    poly_features = PolynomialFeatures(degree=degree)

    # Fit the feature expansion on the training inputs, then only apply it to
    # the test inputs - the test split must never be used for fitting.
    X_train_poly = poly_features.fit_transform(X_train)
    X_test_poly = poly_features.transform(X_test)

    # Ordinary linear regression on the expanded features
    poly_model = linear_model.LinearRegression().fit(X_train_poly, y_train)

    # Predictions for the training and the test split
    y_train_predicted = poly_model.predict(X_train_poly)
    y_test_predicted = poly_model.predict(X_test_poly)

    return poly_model, y_train_predicted, y_test_predicted

In [None]:
# execute
polynomial_regression_model(2)

In [None]:
# Expand the single input column into degree-2 polynomial features
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(train_data_feature)
# reuse the transformer fitted on the training split; the test split is only transformed
X_test_poly = poly.transform(test_data_feature)
# targets as column vectors, matching the shape of the inputs
y_train_poly = np.array(train_data['price']).reshape(-1, 1)
y_test_poly = np.array(test_data['price']).reshape(-1, 1)

In [None]:
# use linear regression
model = linear_model.LinearRegression()
model.fit(X_train_poly, y_train)

In [None]:
# linear prediction with poly fit data
y_train_predicted = model.predict(X_train_poly)
y_test_predicted = model.predict(X_test_poly)  

In [None]:
# y_train_predicted
y_test_predicted

In [None]:
# RMSE (Root mean squared error) answers the question: "How similar, on average, are the numbers in list1 to list2?"  
# evaluating the model on training dataset
rmse_train = np.sqrt(sm.mean_squared_error(y_train, y_train_predicted))
r2_train = sm.r2_score(y_train, y_train_predicted)
  
# evaluating the model on test dataset
rmse_test = np.sqrt(sm.mean_squared_error(y_test, y_test_predicted))
r2_test = sm.r2_score(y_test, y_test_predicted)

In [None]:
# print the measures - positive?
print('RMSE (training) ',rmse_train)
print('RMSE (testing) ',rmse_test)

In [None]:
# Compute other performance metrics
print("Polynomial Regression Performance")

# The coefficients of the linear model fitted on the polynomial features
print('Coefficient: ', model.coef_)
print('Intercept: ', model.intercept_)

# R-squared: 1 is perfect prediction
print('R-squared (training) ', round(r2_train, 3))
print('R-squared (testing) ', round(r2_test, 3))

In [None]:
# compare 
print('Explained variance score ', round(sm.explained_variance_score(y_train, y_train_predicted), 3))
print('Explained variance score ', round(sm.explained_variance_score(y_test, y_test_predicted), 3))

In [None]:
# Plot the test data and the fitted polynomial curve
plt.title('Polynomial Regression')
plt.xlabel('feature')
plt.ylabel('price')
plt.scatter(test_data_feature, y_test, color='green', label='test data')

# A line plot joins the points in the order given, so sort by the feature
# value first; on the unsorted rows the curve is drawn as a zig-zag.
# (The true test prices are shown by the scatter only - drawing them as a
# connected line as well just covered the plot.)
order = np.argsort(test_data_feature, axis=0).ravel()
plt.plot(test_data_feature[order], y_test_predicted[order],
         color='red', linewidth=1, label='polynomial fit')
plt.legend()
plt.show()

## Reference
https://www.kaggle.com/aniketg11/predicting-house-prices-through-regression/notebook <br>
https://www.dataquest.io/blog/understanding-regression-error-metrics/ <br>