In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import r2_score

In [2]:
from sklearn.preprocessing import MinMaxScaler

## PREPRARING

In [3]:
df = pd.read_csv("./kc_house_data.csv")

In [4]:
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [5]:
df.drop(['id','date','sqft_above'] , axis='columns' , inplace=True)

In [6]:
df['bathrooms'] = np.round(df['bathrooms'])
df['floors'] = np.round(df['floors'])

In [7]:
df.isnull().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [8]:
df.isna().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [9]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000.0,3,2.0,2570,7242,2.0,0,0,3,7,400,1951,1991,98125,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,0,1987,0,98074,47.6168,-122.045,1800,7503


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
features = ['bedrooms','bathrooms','floors','sqft_living','zipcode','sqft_basement','waterfront']

In [12]:
x = df.loc[:,features]

In [13]:
y = df['price']

In [14]:
from sklearn.preprocessing import MinMaxScaler ,  StandardScaler

### Scaling Datasets with MinMaxScaler

In [15]:
min_max_scaler = MinMaxScaler()

In [16]:
x[features] = min_max_scaler.fit_transform(x[features])

In [17]:
x.head()

Unnamed: 0,bedrooms,bathrooms,floors,sqft_living,zipcode,sqft_basement,waterfront
0,0.090909,0.125,0.0,0.06717,0.893939,0.0,0.0
1,0.090909,0.25,0.333333,0.172075,0.626263,0.082988,0.0
2,0.060606,0.125,0.0,0.036226,0.136364,0.0,0.0
3,0.121212,0.375,0.0,0.126038,0.681818,0.188797,0.0
4,0.090909,0.25,0.0,0.104906,0.368687,0.0,0.0


In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

## POLYNOMIAL REGRESSION

In [19]:
from sklearn.linear_model import LinearRegression

In [20]:
polyRegression = LinearRegression()

In [21]:
x_train.head()

Unnamed: 0,bedrooms,bathrooms,floors,sqft_living,zipcode,sqft_basement,waterfront
13393,0.060606,0.125,0.0,0.064151,0.590909,0.0,0.0
17634,0.121212,0.125,0.333333,0.059623,0.515152,0.0,0.0
20385,0.090909,0.25,0.0,0.101887,0.186869,0.0,0.0
6613,0.121212,0.375,0.0,0.309132,0.459596,0.421162,0.0
15298,0.090909,0.375,0.0,0.086792,0.287879,0.082988,0.0


In [22]:
y_train.head()

13393    455000.0
17634    425000.0
20385    342000.0
6613     545000.0
15298    330490.0
Name: price, dtype: float64

In [23]:
from sklearn.preprocessing import PolynomialFeatures

In [24]:
poly = PolynomialFeatures(degree=6)

In [25]:
X_train_poly = poly.fit_transform(x_train)

In [26]:
x_train

Unnamed: 0,bedrooms,bathrooms,floors,sqft_living,zipcode,sqft_basement,waterfront
13393,0.060606,0.125,0.000000,0.064151,0.590909,0.000000,0.0
17634,0.121212,0.125,0.333333,0.059623,0.515152,0.000000,0.0
20385,0.090909,0.250,0.000000,0.101887,0.186869,0.000000,0.0
6613,0.121212,0.375,0.000000,0.309132,0.459596,0.421162,0.0
15298,0.090909,0.375,0.000000,0.086792,0.287879,0.082988,0.0
...,...,...,...,...,...,...,...
13411,0.151515,0.625,0.333333,0.572830,0.888889,0.000000,0.0
6334,0.090909,0.250,0.000000,0.115472,0.540404,0.145228,0.0
4739,0.090909,0.250,0.333333,0.107170,0.146465,0.000000,0.0
19075,0.090909,0.125,0.000000,0.067170,0.010101,0.000000,0.0


In [27]:
X_train_poly

array([[1.        , 0.06060606, 0.125     , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.12121212, 0.125     , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.09090909, 0.25      , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.09090909, 0.25      , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.09090909, 0.125     , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.09090909, 0.25      , ..., 0.        , 0.        ,
        0.        ]])

In [28]:
X_test_poly = poly.fit_transform(x_test)

In [29]:
polyRegression.fit(X_train_poly,y_train)

LinearRegression()

In [30]:
y_pred = polyRegression.predict(X_test_poly)

In [31]:
X_test_poly

array([[1.        , 0.12121212, 0.25      , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.12121212, 0.375     , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.09090909, 0.25      , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.06060606, 0.125     , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.06060606, 0.25      , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.09090909, 0.375     , ..., 0.        , 0.        ,
        0.        ]])

In [32]:
y_pred

array([ 914623.5   , 1002126.5625,  444450.875 , ...,  400105.3125,
        319291.5   ,  571343.8125])

In [33]:
from sklearn.metrics import mean_squared_error , mean_absolute_error

In [34]:
mse = mean_squared_error(y_test , y_pred)
print('(Polynomial Regrssion) Mean Square Error Score : %s' % mse )

(Polynomial Regrssion) Mean Square Error Score : 1.2500116492821807e+26


In [35]:
score_poly = polyRegression.score(X_train_poly,y_train)
print("(Polynomial Regression) R Square score : %.2f" % score_poly)

(Polynomial Regression) R Square score : 0.74


## LINEAR REGRESSION SCALING

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [37]:
x_train

Unnamed: 0,bedrooms,bathrooms,floors,sqft_living,zipcode,sqft_basement,waterfront
9817,0.090909,0.250,0.000000,0.084528,0.631313,0.000000,0.0
12455,0.151515,0.500,0.333333,0.279245,0.292929,0.000000,0.0
4060,0.121212,0.250,0.000000,0.108679,0.035354,0.176349,0.0
12798,0.121212,0.375,0.333333,0.172830,0.287879,0.000000,0.0
14838,0.151515,0.250,0.000000,0.132830,0.272727,0.109959,0.0
...,...,...,...,...,...,...,...
8238,0.090909,0.250,0.000000,0.146415,0.681818,0.201245,0.0
19158,0.090909,0.250,0.000000,0.166792,0.535354,0.151452,0.0
13351,0.090909,0.250,0.000000,0.092830,0.045455,0.000000,0.0
10067,0.090909,0.250,0.000000,0.116981,0.146465,0.082988,0.0


In [38]:
linear = LinearRegression()

In [39]:
linear.fit(x_train,y_train)

LinearRegression()

In [40]:
y_pred = linear.predict(x_test)

In [41]:
mse = mean_squared_error(y_test , y_pred)
print('(Linear Regrssion) Mean Square Error Score : %s' % mse )

(Linear Regrssion) Mean Square Error Score : 65168018025.87217


In [42]:
score_linear = linear.score(x_train,y_train)
print("(Linear Regression) R Square Score : %.2f" % score_linear)

(Linear Regression) R Square Score : 0.55


## LINEAR REGRESSION no scaling

In [43]:
x = df.loc[:,features]

In [44]:
y = df['price']

In [45]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [46]:
x_train.head()

Unnamed: 0,bedrooms,bathrooms,floors,sqft_living,zipcode,sqft_basement,waterfront
1635,4,2.0,1.0,2020,98034,1010,0
12396,3,2.0,2.0,2370,98004,0,0
18589,3,2.0,2.0,2470,98052,0,0
1077,2,1.0,1.0,1090,98126,360,0
21458,5,3.0,1.0,2360,98103,970,0


In [47]:
y_train.head()

1635     495000.0
12396    943500.0
18589    732500.0
1077     356700.0
21458    504600.0
Name: price, dtype: float64

In [48]:
from sklearn.linear_model import LinearRegression

In [49]:
linear = LinearRegression()

In [50]:
x_train.head()

Unnamed: 0,bedrooms,bathrooms,floors,sqft_living,zipcode,sqft_basement,waterfront
1635,4,2.0,1.0,2020,98034,1010,0
12396,3,2.0,2.0,2370,98004,0,0
18589,3,2.0,2.0,2470,98052,0,0
1077,2,1.0,1.0,1090,98126,360,0
21458,5,3.0,1.0,2360,98103,970,0


In [51]:
y_train.head()

1635     495000.0
12396    943500.0
18589    732500.0
1077     356700.0
21458    504600.0
Name: price, dtype: float64

In [52]:
linear.fit(x_train,y_train)

LinearRegression()

In [53]:
y_pred = linear.predict(x_test)

In [54]:
mse = mean_squared_error(y_test , y_pred)
print('(Linear Regrssion) Mean Square Error Score : %s' % mse )

(Linear Regrssion) Mean Square Error Score : 67060431189.23465


In [55]:
score_linear = linear.score(x_train,y_train)
print("(Linear Regression) R Square Score : %.2f" % score_linear)

(Linear Regression) R Square Score : 0.55


## API

In [56]:
import json

In [57]:
REQUEST = json.dumps({
'path' : {},
'args' : {}
})

In [58]:
# GET /home

print('home')

home


In [59]:
x_train = x_train.iloc[[0]]

In [60]:
x_train['bedrooms'] = 1

In [61]:
x_train

Unnamed: 0,bedrooms,bathrooms,floors,sqft_living,zipcode,sqft_basement,waterfront
1635,1,2.0,1.0,2020,98034,1010,0


In [62]:
# GET /calculatePrice/:bedrooms/:bathrooms/:floors/:sqft_living/:zipcode/:sqft_basement/:water
request = json.loads(REQUEST) 
bedrooms_var = request['path'].get('bedrooms') 
bathrooms_var = request['path'].get('bathrooms') 
floors_var = request['path'].get('floors') 
sqft_living_var = request['path'].get('sqft_living')
zipcode_var = request['path'].get('zipcode')
sqft_basement_var = request['path'].get('sqft_basement')
water = request['path'].get('water')

price = linear.predict(np.array([[bedrooms_var,bathrooms_var,floors_var,sqft_living_var,zipcode_var,sqft_basement_var,water]]))[0]

# print(np.array([[bedrooms_var,bathrooms_var,floors_var,sqft_living_var,zipcode_var,sqft_basement_var,water]]))
# print(bedrooms_var + bathrooms_var + floors_var + sqft_living_var + zipcode_var + sqft_basement_var + water + '')

print('{"price" : "%d"}' % price)



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [63]:
import json

In [64]:
# GET /zipcode



zipcode = ['98001', '98002', '98003', '98004', '98005', '98006', '98007', '98008', '98009', '98010', '98011', '98013', '98014',
          '98015', '98019', '98022', '98023', '98024', '98025', '98027', '98028', '98029', '98030', '98031', '98035', '98038', 
          '98039', '98040', '98041', '98042', '98045', '98047', '98050', '98051', '98052', '98053', '98054', '90055', '98056', 
          '98057', '98058', '98059', '98062', '98063', '98064', '98065', '98068', '98070', '98071', '98072', '98073', '98074',
          '98075', '98083', '98092', '98093', '98101', '98102', '98103', '98104', '98105', '90106', '98107', '98108', '98109',
          '98111', '98112', '98114', '98115', '98116', '98117', '98118', '98119', '90121', '98122', '98124', '98125', '98126',
          '98131', '98132', '98133', '98134', '98136', '98138', '98144', '90145', '98146', '98148', '98154', '98155', '98158', 
          '98160', '98161', '98161', '98164', '98166', '98168', '98171', '90174', '98177', '98178', '98188', '98198', '98199', 
          '98224', '98288']


print('{"zipcode" : %s}' % json.dumps(zipcode))

{"zipcode" : ["98001", "98002", "98003", "98004", "98005", "98006", "98007", "98008", "98009", "98010", "98011", "98013", "98014", "98015", "98019", "98022", "98023", "98024", "98025", "98027", "98028", "98029", "98030", "98031", "98035", "98038", "98039", "98040", "98041", "98042", "98045", "98047", "98050", "98051", "98052", "98053", "98054", "90055", "98056", "98057", "98058", "98059", "98062", "98063", "98064", "98065", "98068", "98070", "98071", "98072", "98073", "98074", "98075", "98083", "98092", "98093", "98101", "98102", "98103", "98104", "98105", "90106", "98107", "98108", "98109", "98111", "98112", "98114", "98115", "98116", "98117", "98118", "98119", "90121", "98122", "98124", "98125", "98126", "98131", "98132", "98133", "98134", "98136", "98138", "98144", "90145", "98146", "98148", "98154", "98155", "98158", "98160", "98161", "98161", "98164", "98166", "98168", "98171", "90174", "98177", "98178", "98188", "98198", "98199", "98224", "98288"]}
