# House Price Prediction
* Name: Bhavik Jikadara

### Import libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# read the CSV file
house_data = pd.read_csv('data.csv')

In [3]:
# head -> top five rows in datasets
house_data.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [4]:
house_data.shape

(4600, 18)

In [5]:
house_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [6]:
house_data.describe

<bound method NDFrame.describe of                      date         price  bedrooms  bathrooms  sqft_living  \
0     2014-05-02 00:00:00  3.130000e+05       3.0       1.50         1340   
1     2014-05-02 00:00:00  2.384000e+06       5.0       2.50         3650   
2     2014-05-02 00:00:00  3.420000e+05       3.0       2.00         1930   
3     2014-05-02 00:00:00  4.200000e+05       3.0       2.25         2000   
4     2014-05-02 00:00:00  5.500000e+05       4.0       2.50         1940   
...                   ...           ...       ...        ...          ...   
4595  2014-07-09 00:00:00  3.081667e+05       3.0       1.75         1510   
4596  2014-07-09 00:00:00  5.343333e+05       3.0       2.50         1460   
4597  2014-07-09 00:00:00  4.169042e+05       3.0       2.50         3010   
4598  2014-07-10 00:00:00  2.034000e+05       4.0       2.00         2090   
4599  2014-07-10 00:00:00  2.206000e+05       3.0       2.50         1490   

      sqft_lot  floors  waterfront  view 

In [7]:
house_data.columns

Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city',
       'statezip', 'country'],
      dtype='object')

### Exploratory Data Analysis (EDA)

In [8]:
# sns.pairplot(house_data)

In [9]:
house_data_corr = house_data.corr()

In [10]:
plt.figure(figsize=(12,6))
sns.heatmap(house_data_corr, annot = True)

<AxesSubplot:>

### Training and Testing data

In [11]:
X = house_data[['bedrooms', 'bathrooms', 'floors', 'sqft_living', 'sqft_lot', 'condition', 'sqft_above']]
y = house_data['price']

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, test_size=0.2,random_state=42)

In [13]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape
X_test

Unnamed: 0,bedrooms,bathrooms,floors,sqft_living,sqft_lot,condition,sqft_above
3683,3.0,2.50,2.0,1460,1613,3,1180
4411,5.0,2.25,1.0,2000,7900,4,1300
2584,3.0,3.25,3.0,2940,5432,4,2440
69,3.0,2.50,1.0,2200,7350,5,1570
1844,3.0,2.50,1.0,1720,8755,3,1000
...,...,...,...,...,...,...,...
1612,3.0,1.75,1.0,1700,8400,3,1460
1068,5.0,2.00,1.5,1930,6120,3,1930
4350,3.0,2.00,1.0,1180,7793,4,1180
3027,4.0,2.50,2.0,2370,10083,5,2370


In [14]:
# cross value score
from sklearn.model_selection import cross_val_score
from sklearn import metrics

def cross_value(model):
    predict = cross_val_score(model, X, y, cv=10)
    return predict.mean()


def print_evaluate(true, predicted):
    mean_abs_error = metrics.mean_absolute_error(true, predicted)
    mean_squ_error = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squ_error)
    r2_square = metrics.r2_score(true, predicted)
    print('MAE: ', mean_abs_error)
    print('MSE: ', mean_squ_error)
    print('RMSE: ', rmse)
    print('R2 Square: ', r2_square)
    print('__________________________________')

# return mae, mse, rmse, r2_square
def evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square

### Preparing Data For Linear Regression

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('std_scalar', StandardScaler())
])

X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

### Model Select : Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression

lin_model = LinearRegression(normalize=True)
lin_model.fit(X_train, y_train)

LinearRegression(normalize=True)

### Model Evaluation

In [17]:
# print the intercept and co
print('intercept: ', lin_model.intercept_)
coeff_df = pd.DataFrame(lin_model.coef_, X.columns, columns=['Coefficient'])
coeff_df

intercept:  544848.268832546


Unnamed: 0,Coefficient
bedrooms,-62066.635264
bathrooms,-9832.147772
floors,27749.007336
sqft_living,334423.514134
sqft_lot,-19384.112779
condition,38727.915216
sqft_above,-42237.6073


###  Predictions from our Model

In [18]:
pred = lin_model.predict(X_test)

In [19]:
sns.scatterplot(y_test, pred)



<AxesSubplot:xlabel='price'>

In [20]:
test_pred = lin_model.predict(X_test)
train_pred = lin_model.predict(X_train)

print('Test set evaluation:\n------------------------------------------')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n-------------------------------------------')
print_evaluate(y_train, train_pred)

Test set evaluation:
------------------------------------------
MAE:  219833.01885136354
MSE:  989737379734.7007
RMSE:  994855.4567044907
R2 Square:  0.029523036136685832
__________________________________
Train set evaluation:
-------------------------------------------
MAE:  170682.52362261032
MSE:  72089747651.23639
RMSE:  268495.34009221906
R2 Square:  0.4926326247021382
__________________________________


In [24]:
output = pd.DataFrame({'SalePrice': train_pred})
output.to_csv('submission.csv', index=False)