In [1]:
import numpy as np                                                                              # type: ignore
import pandas as pd                                                                             # type: ignore

from sklearn.model_selection import train_test_split                                            # type: ignore
from sklearn.metrics import mean_absolute_error , mean_squared_error , median_absolute_error    # type: ignore
from sklearn.impute import SimpleImputer                                                        # type: ignore
from sklearn.preprocessing import StandardScaler                                                # type: ignore
from sklearn.linear_model import SGDRegressor                                                   # type: ignore

dataset = pd.read_csv('../datasets/houses.csv')
dataset.head(20)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,price
0,3.0,1.0,1180.0,5650.0,1.0,0.0,0.0,3.0,7.0,1180.0,0.0,1955.0,0.0,98178.0,47.5112,-122.257,1340.0,22.19
1,3.0,2.25,2570.0,7242.0,2.0,0.0,0.0,3.0,7.0,2170.0,400.0,1951.0,1991.0,98125.0,47.721,-122.319,1690.0,53.8
2,2.0,1.0,770.0,10000.0,1.0,0.0,0.0,3.0,6.0,770.0,0.0,1933.0,0.0,98028.0,47.7379,-122.233,2720.0,18.0
3,4.0,3.0,1960.0,5000.0,1.0,0.0,0.0,5.0,7.0,1050.0,910.0,1965.0,0.0,98136.0,47.5208,-122.393,1360.0,60.4
4,3.0,2.0,1680.0,8080.0,1.0,0.0,0.0,3.0,8.0,1680.0,0.0,1987.0,0.0,98074.0,47.6168,-122.045,1800.0,51.0
5,4.0,4.5,5420.0,101930.0,1.0,0.0,0.0,3.0,11.0,3890.0,1530.0,2001.0,0.0,98053.0,47.6561,-122.005,4760.0,123.0
6,3.0,2.25,1715.0,6819.0,2.0,0.0,0.0,3.0,7.0,1715.0,0.0,1995.0,0.0,98003.0,47.3097,-122.327,2238.0,25.75
7,3.0,1.5,1060.0,9711.0,1.0,0.0,0.0,3.0,7.0,1060.0,0.0,1963.0,0.0,98198.0,47.4095,-122.315,1650.0,29.185
8,3.0,1.0,1780.0,7470.0,1.0,0.0,0.0,3.0,7.0,1050.0,730.0,1960.0,0.0,98146.0,47.5123,-122.337,1780.0,22.95
9,3.0,2.5,1890.0,6560.0,2.0,0.0,0.0,3.0,7.0,1890.0,0.0,2003.0,0.0,98038.0,47.3684,-122.031,2390.0,32.3


In [2]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(dataset)
dataset = imp.transform(dataset)

X = dataset[:, :-1]
y = dataset[:, -1]

print(f"X >>{X.shape}>> \n{X[:2]}\n")
print(f"y >>{y.shape}>> \n{y[:2]}")

X >>(1000, 17)>> 
[[ 3.00000e+00  1.00000e+00  1.18000e+03  5.65000e+03  1.00000e+00
   0.00000e+00  0.00000e+00  3.00000e+00  7.00000e+00  1.18000e+03
   0.00000e+00  1.95500e+03  0.00000e+00  9.81780e+04  4.75112e+01
  -1.22257e+02  1.34000e+03]
 [ 3.00000e+00  2.25000e+00  2.57000e+03  7.24200e+03  2.00000e+00
   0.00000e+00  0.00000e+00  3.00000e+00  7.00000e+00  2.17000e+03
   4.00000e+02  1.95100e+03  1.99100e+03  9.81250e+04  4.77210e+01
  -1.22319e+02  1.69000e+03]]

y >>(1000,)>> 
[22.19 53.8 ]


In [3]:
scaling = StandardScaler()
X = scaling.fit_transform(X)

# Splitting data:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=44, shuffle =True)
print(f"X_train shape : {X_train.shape}")
print(f"X_test  shape : {X_test.shape}")
print(f"y_train shape : {y_train.shape}")
print(f"y_test  shape : {y_test.shape}")

X_train shape : (670, 17)
X_test  shape : (330, 17)
y_train shape : (670,)
y_test  shape : (330,)


In [4]:
print(f"X_train : \n{X_train[:2]}\n")
print(f"y_train : {y_train[:2]}\n")
print(f"X_test  : \n{X_test[:2]}\n")
print(f"y_test  : {y_test[:2]}\n")

X_train : 
[[-0.4102682  -0.41010927 -0.5649788  -0.34544365 -0.86466394 -0.08989291
  -0.31023339  0.77745534 -0.52226623 -1.1273398   0.8629824  -0.71103123
  -0.20697177  0.77264317  0.89225716 -0.56370144 -0.15941113]
 [-0.4102682  -0.06349407 -0.09171927 -0.36416812  1.0699369  -0.08989291
   3.61284458  2.22918972  0.34012049 -0.60813955  0.8851764  -1.42099645
  -0.20697177  0.98219388  0.15542773 -1.20204137  0.60172335]]

y_train : [46. 55.]

X_test  : 
[[ 7.64109819e-01 -6.34940745e-02 -5.42442634e-01 -1.76439811e-01
  -8.64663940e-01 -8.98929064e-02 -3.10233393e-01 -6.74279028e-01
  -1.38465294e+00 -2.28236927e-01 -6.68403855e-01 -6.75532973e-01
  -2.06971765e-01  1.78229660e+00 -3.76334005e-01 -6.85631543e-01
  -1.59411132e-01]
 [-4.10268195e-01 -1.44995485e+00 -9.03021324e-01 -1.70428636e-01
  -8.64663940e-01 -8.98929064e-02 -3.10233393e-01  7.77455345e-01
  -5.22266227e-01 -6.33466391e-01 -6.68403855e-01 -1.06601384e-03
  -2.06971765e-01 -1.36096409e+00 -1.52329882e+00 -7

In [5]:
sgd = SGDRegressor( penalty = 'l2' , max_iter=1000, tol=1e-3 , loss = 'squared_error')
sgd.fit(X_train, y_train)

sgd.score(X_train,y_train)
sgd.score(X_test,y_test)
print(f'SGD Regression Train Score : {sgd.score(X_train, y_train)}')
print(f'SGD Regression Test  Score : {sgd.score(X_test, y_test)}\n')

## Calculating Prediction:
y_pred = sgd.predict(X_test)
print(f'Predicted Value : \n{ y_pred[:5]}')
print(f'Actual    Value : \n{ y_test[:5]}')

SGD Regression Train Score : 0.7389495946784967
SGD Regression Test  Score : 0.6908575557980555

Predicted Value : 
[27.11480996 19.76279958 65.35231433 23.44132424 27.70256387]
Actual    Value : 
[17.18  23.    77.5   24.5   27.995]


In [7]:
# Calculating Mean Absolute Error:
MAEValue = mean_absolute_error(y_test, y_pred, multioutput='uniform_average')   # it can be raw_values
print(f'Mean   Absolute Error Value : {MAEValue}')
##########################################################
#Calculating Mean Squared Error
MSEValue = mean_squared_error(y_test, y_pred, multioutput='uniform_average')    # it can be raw_values
print(f'Mean   Squared  Error Value : {MSEValue}')
##########################################################
#Calculating Median Squared Error
MdSEValue = median_absolute_error(y_test, y_pred)
print(f'Median Squared  Error Value : {MdSEValue}')

Mean   Absolute Error Value : 12.695910849857981
Mean   Squared  Error Value : 376.4889113991007
Median Squared  Error Value : 8.93703253331697
