# Housing data - Linear Regression, Random Forest and SVR

Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


Check out the data

In [None]:
# Load the two CSV splits from the data directory.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Stack both splits so cleaning and encoding are applied to all rows at once.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the default row labels of the two files repeat, which makes any label-based
# alignment or lookup on the combined frame ambiguous.
USAhousing = pd.concat([train, test], ignore_index=True)

In [None]:
# Preview the first five rows of the combined frame.
USAhousing.head(n=5)

In [None]:
# Summary statistics of the combined frame (rendered as the cell output).
USAhousing.describe()

In [None]:
# Column dtypes and non-null counts for the combined frame.
USAhousing.info()

In [None]:
# Number of missing values per column; row truncation is disabled while
# printing so every column's count is visible.
null_counts = USAhousing.isnull().sum()
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(null_counts)

In [None]:
# Heatmap of the null mask: one row per record, one column per feature.
missing_mask = USAhousing.isnull()

plt.figure(figsize=(20, 20))
sns.heatmap(
    missing_mask,
    cmap='viridis',
    cbar=False,
    annot=False,
    square=False,
    yticklabels=False,
)
plt.title('Features with missing values');

Correlation between variables:

In [None]:
# Pairwise correlation of the numeric features only. String-valued (object)
# columns are excluded explicitly: DataFrame.corr() on pandas >= 2.0 raises on
# them instead of silently skipping them as older versions did.
numeric_features = USAhousing.select_dtypes(include=[np.number])
corr = numeric_features.corr()

plt.figure(figsize=(20, 20))

# vmax caps the colour scale at 0.8; each cell is annotated with its coefficient.
sns.heatmap(corr,
            vmax=.8,
            linewidths=0.01,
            square=True,
            annot=True,
            cmap='YlGnBu',
            linecolor="white")

plt.title('Correlation between features');

Train a linear model

Clean Data

In [None]:
# Fill missing values column by column. The frame is rebound rather than
# mutated in place so that re-running this cell is safe.
USAhousing = USAhousing.fillna(value={
    # Categorical columns: a missing entry becomes the explicit category 'NA'
    # (presumably "feature not present" -- confirm against the data dictionary).
    'Alley': 'NA',
    'Fence': 'NA',
    'FireplaceQu': 'NA',
    'MiscFeature': 'NA',
    'PoolQC': 'NA',
    'BsmtCond': 'NA',
    'BsmtExposure': 'NA',
    'BsmtFinType1': 'NA',
    'BsmtFinType2': 'NA',
    'BsmtQual': 'NA',
    'GarageCond': 'NA',
    'GarageFinish': 'NA',
    'GarageQual': 'NA',
    'GarageType': 'NA',
    # Numeric columns: a missing entry becomes 0.
    'BsmtFinSF1': 0,
    'BsmtFinSF2': 0,
    'BsmtFullBath': 0,
    'BsmtHalfBath': 0,
    'BsmtUnfSF': 0,
    'TotalBsmtSF': 0,
    'MasVnrArea': 0,
    'GarageArea': 0,
    'GarageCars': 0,
    # Remaining categoricals: filled with one fixed category each
    # (presumably the most frequent value -- TODO confirm).
    'MasVnrType': 'None',
    'Electrical': 'SBrkr',
    'Functional': 'Typ',
    'MSZoning': 'RL',
    'Utilities': 'AllPub',
    'Exterior1st': 'VinylSd',
    'Exterior2nd': 'VinylSd',
    'KitchenQual': 'TA',
    'SaleType': 'WD'
})

# A missing garage year falls back to the row's YearBuilt value. This is a
# per-row fill, so it is done with Series.fillna instead of being nested in the
# dict above, whose entries are otherwise per-column scalars.
USAhousing['GarageYrBlt'] = USAhousing['GarageYrBlt'].fillna(USAhousing['YearBuilt'])

# LotFrontage is dropped rather than imputed. errors='ignore' keeps the cell
# re-runnable once the column is already gone.
USAhousing = USAhousing.drop('LotFrontage', axis=1, errors='ignore')

Create dummy variables


In [None]:
# Categorical features are the columns stored with dtype object (strings).
objColumns = [col for col in USAhousing.columns if USAhousing[col].dtype == object]

# With columns=..., get_dummies returns every non-listed column unchanged plus
# the indicator columns, so its result is already the complete encoded frame.
# drop_first=True omits one level per feature.
dummies = pd.get_dummies(USAhousing, columns=objColumns, drop_first=True)

# Later cells build the train/test sets from USAhousing, so it must carry the
# indicator columns. Previously the categorical columns were dropped from
# USAhousing and the encoded data only went into `result`, which was never
# used (and which duplicated every numeric column).
USAhousing = dummies
result = USAhousing  # name kept for any cell that still refers to `result`

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(result[:5])

In [None]:
# Rows with a known SalePrice form the training set; the rows without one form
# the test set, which therefore does not keep the (all-null) SalePrice column.
has_price = USAhousing['SalePrice'].notnull()

train = USAhousing[has_price]
test = USAhousing[~has_price].drop('SalePrice', axis=1)

In [None]:
# Target is the sale price; every other column of `train` is used as a feature.
y = train['SalePrice']
X = train.drop(['SalePrice'], axis=1)


Train Test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the labelled rows for evaluation. random_state pins the
# shuffle so the split, and every metric computed from it, is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

Create and train the model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator with scikit-learn's default settings.
lm = LinearRegression()

In [None]:
# Fit the linear model on the training split.
lm.fit(X=X_train, y=y_train)

# Predictions

In [None]:
# Linear-model predictions for the held-out rows.
predictions = lm.predict(X=X_test)

In [None]:
# Actual vs. predicted sale price on the hold-out split. The axes are labelled
# so the figure can be read on its own.
plt.scatter(y_test, predictions)
plt.xlabel('Actual SalePrice')
plt.ylabel('Predicted SalePrice')
plt.title('Linear regression: actual vs. predicted');

In [None]:
# Distribution of the linear model's residuals (actual minus predicted).
lm_residuals = y_test - predictions
sns.distplot(lm_residuals)

In [None]:
from sklearn import metrics

In [None]:
# Mean absolute error of the linear model on the hold-out split.
metrics.mean_absolute_error(y_true=y_test, y_pred=predictions)

In [None]:
# Mean squared error of the linear model on the hold-out split.
metrics.mean_squared_error(y_true=y_test, y_pred=predictions)

In [None]:
# Root mean squared error: square root of the hold-out MSE.
lm_mse = metrics.mean_squared_error(y_test, predictions)
np.sqrt(lm_mse)

# Random Forest Regressor model


In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest of 100 trees. random_state fixes the bootstrap samples and
# feature sub-sampling so the fitted model and its metrics are reproducible.
rfc = RandomForestRegressor(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

# Predictions for the held-out rows.
rfc_pred = rfc.predict(X_test)

In [None]:
# Actual vs. predicted sale price for the random forest. The axes are labelled
# so the figure can be read on its own.
plt.scatter(y_test, rfc_pred)
plt.xlabel('Actual SalePrice')
plt.ylabel('Predicted SalePrice')
plt.title('Random forest: actual vs. predicted');


In [None]:
# Distribution of the random forest's residuals (actual minus predicted).
rfc_residuals = y_test - rfc_pred
sns.distplot(rfc_residuals)

In [None]:
# Hold-out error metrics for the random forest. print() is called as a
# function with one pre-formatted string, so the cell runs on Python 3 and
# still prints the same text on Python 2.
rfc_mae = metrics.mean_absolute_error(y_test, rfc_pred)
rfc_mse = metrics.mean_squared_error(y_test, rfc_pred)

print('MAE:   {}'.format(rfc_mae))
print('MSE:   {}'.format(rfc_mse))
print('RMSE:  {}'.format(np.sqrt(rfc_mse)))

# SVR model

In [None]:
from sklearn.svm import SVR

In [None]:
# Support vector regressor with a linear kernel and C = 1000.
# NOTE(review): the variable name says "rbf" but the kernel is linear -- confirm
# which one was intended.
svr_rbf = SVR(C=1e3, kernel='linear')

In [None]:
# Fit the SVR on the training split and predict the held-out rows
# (fit() returns the estimator itself, so the two calls can be chained).
svr_rbf_pred = svr_rbf.fit(X_train, y_train).predict(X_test)

In [None]:
# Actual vs. predicted sale price for the SVR model. The axes are labelled so
# the figure can be read on its own.
plt.scatter(y_test, svr_rbf_pred)
plt.xlabel('Actual SalePrice')
plt.ylabel('Predicted SalePrice')
plt.title('SVR: actual vs. predicted');

In [None]:
# Distribution of the SVR model's residuals (actual minus predicted).
svr_residuals = y_test - svr_rbf_pred
sns.distplot(svr_residuals)

In [None]:
# Hold-out RMSE for the SVR model. print() is called as a function so the cell
# runs on Python 3, and the label typo "RSME" is corrected to "RMSE".
svr_mse = metrics.mean_squared_error(y_test, svr_rbf_pred)
print('RMSE:  {}'.format(np.sqrt(svr_mse)))