In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRFRegressor

In [19]:
models = {
    "LR" : LinearRegression(),
    "KNN" : KNeighborsRegressor(),
    "DT" : DecisionTreeRegressor(),
    "RF" : RandomForestRegressor(),
    "SVM" : SVR(),
    "XGBR" : XGBRFRegressor()
}

In [45]:
df = pd.read_csv("Assignment Datasets/House prices.csv")
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [46]:
df.shape

(4600, 18)

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [48]:
df.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,551963.0,3.40087,2.160815,2139.346957,14852.52,1.512065,0.007174,0.240652,3.451739,1827.265435,312.081522,1970.786304,808.608261
std,563834.7,0.908848,0.783781,963.206916,35884.44,0.538288,0.084404,0.778405,0.67723,862.168977,464.137228,29.731848,979.414536
min,0.0,0.0,0.0,370.0,638.0,1.0,0.0,0.0,1.0,370.0,0.0,1900.0,0.0
25%,322875.0,3.0,1.75,1460.0,5000.75,1.0,0.0,0.0,3.0,1190.0,0.0,1951.0,0.0
50%,460943.5,3.0,2.25,1980.0,7683.0,1.5,0.0,0.0,3.0,1590.0,0.0,1976.0,0.0
75%,654962.5,4.0,2.5,2620.0,11001.25,2.0,0.0,0.0,4.0,2300.0,610.0,1997.0,1999.0
max,26590000.0,9.0,8.0,13540.0,1074218.0,3.5,1.0,4.0,5.0,9410.0,4820.0,2014.0,2014.0


In [49]:
df.loc[0]

date              2014-05-02 00:00:00
price                        313000.0
bedrooms                          3.0
bathrooms                         1.5
sqft_living                      1340
sqft_lot                         7912
floors                            1.5
waterfront                          0
view                                0
condition                           3
sqft_above                       1340
sqft_basement                       0
yr_built                         1955
yr_renovated                     2005
street           18810 Densmore Ave N
city                        Shoreline
statezip                     WA 98133
country                           USA
Name: 0, dtype: object

In [50]:
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [51]:
df.drop(["country", "statezip", "street", "date", "yr_built", "yr_renovated", "city"], axis=1, inplace=True)
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement
0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0
1,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0
3,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000
4,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800


In [52]:
x = df.drop(["price"], axis=1)
y = df["price"]

In [53]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [54]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train

array([[-0.44357657, -0.84565379, -0.54003388, ...,  0.80405687,
        -0.24007504, -0.67680932],
       [-0.44357657, -0.52676805,  0.00915808, ..., -0.67466841,
        -0.72215268,  1.38397752],
       [-0.44357657, -0.52676805, -0.45713698, ..., -0.67466841,
        -0.42372367, -0.15618949],
       ...,
       [-1.54429253, -1.48342526, -1.31719233, ..., -0.67466841,
        -1.10092797, -0.67680932],
       [ 0.6571394 ,  1.38654639,  2.37171974, ..., -0.67466841,
         2.98525395, -0.67680932],
       [-1.54429253, -0.84565379, -0.59184445, ...,  0.80405687,
        -0.58441621, -0.13449699]])

In [55]:
for name, model in models.items():
    print(f"Using : {name} Algorithm: ")
    model.fit(x_train,y_train)
    print(f'Train Accuracy: {model.score(x_train,y_train)}')
    print(f'Test Accuracy: {model.score(x_test,y_test)}')
    print('-'*50,'\n')

Using : LR Algorithm: 
Train Accuracy: 0.17501840652574285
Test Accuracy: 0.5440705464893371
-------------------------------------------------- 

Using : KNN Algorithm: 
Train Accuracy: 0.4062189716922616
Test Accuracy: 0.15804293801256564
-------------------------------------------------- 

Using : DT Algorithm: 
Train Accuracy: 0.999998598675492
Test Accuracy: 0.011614624304492893
-------------------------------------------------- 

Using : RF Algorithm: 
Train Accuracy: 0.8855721008594598
Test Accuracy: 0.48231958982697754
-------------------------------------------------- 

Using : SVM Algorithm: 
Train Accuracy: -0.023997866643338917
Test Accuracy: -0.050635068198662925
-------------------------------------------------- 

Using : XGBR Algorithm: 
Train Accuracy: 0.7304274965039916
Test Accuracy: 0.4654905658049582
-------------------------------------------------- 

