In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [23]:
df = pd.read_csv("housing.csv")
# Numeric features are every column except the last two
# (median_house_value and ocean_proximity). Take an explicit copy so that the
# column assignment below writes to an independent frame rather than to a
# slice of `df`, which is what triggers pandas' SettingWithCopyWarning.
v = df.iloc[:, :-2].copy()
# Re-attach the categorical column as the last feature column.
v["ocean proximity"] = df["ocean_proximity"]
X = v.values
# Target vector: the second-to-last column of the file (median_house_value).
y = df.iloc[:, -2].values
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


About this file
1. longitude: A measure of how far west a house is; values in this dataset are negative (degrees west of the prime meridian), so a more negative value is farther west

2. latitude: A measure of how far north a house is; a higher value is farther north

3. housingMedianAge: Median age of a house within a block; a lower number is a newer building

4. totalRooms: Total number of rooms within a block

5. totalBedrooms: Total number of bedrooms within a block

6. population: Total number of people residing within a block

7. households: Total number of households, a group of people residing within a home unit, for a block

8. medianIncome: Median income for households within a block of houses (measured in tens of thousands of US Dollars)

9. medianHouseValue: Median house value for households within a block (measured in US Dollars)

10. oceanProximity: Location of the house with respect to the ocean/sea

In [24]:
# Inspect the raw feature matrix: numeric columns followed by the
# ocean-proximity string in the last column.
print(X)

[[-122.23 37.88 41.0 ... 126.0 8.3252 'NEAR BAY']
 [-122.22 37.86 21.0 ... 1138.0 8.3014 'NEAR BAY']
 [-122.24 37.85 52.0 ... 177.0 7.2574 'NEAR BAY']
 ...
 [-121.22 39.43 17.0 ... 433.0 1.7 'INLAND']
 [-121.32 39.43 18.0 ... 349.0 1.8672 'INLAND']
 [-121.24 39.37 16.0 ... 530.0 2.3886 'INLAND']]


In [25]:
# Inspect the target vector (second-to-last column of the CSV).
print(y)

[452600. 358500. 352100. ...  92300.  84700.  89400.]


In [26]:
from sklearn.impute import SimpleImputer
# Replace NaNs in every column except the last (the categorical string
# column) with that column's mean.
# NOTE(review): the imputer is fitted on all rows, i.e. before the
# train/test split performed further down, so the imputed means also
# reflect rows that later end up in the test set.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X[:, :-1] = imputer.fit_transform(X[:, :-1])

In [27]:
# Inspect X again after mean-imputation of the numeric columns.
print(X)

[[-122.23 37.88 41.0 ... 126.0 8.3252 'NEAR BAY']
 [-122.22 37.86 21.0 ... 1138.0 8.3014 'NEAR BAY']
 [-122.24 37.85 52.0 ... 177.0 7.2574 'NEAR BAY']
 ...
 [-121.22 39.43 17.0 ... 433.0 1.7 'INLAND']
 [-121.32 39.43 18.0 ... 349.0 1.8672 'INLAND']
 [-121.24 39.37 16.0 ... 530.0 2.3886 'INLAND']]


In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 8 (the ocean-proximity strings) and pass the other
# columns through unchanged; the encoded columns come first in the result.
# NOTE: X is overwritten, so re-running this cell on its own would apply the
# encoder to whatever column sits at index 8 of the already-encoded matrix.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [8])],
    remainder="passthrough",
)
X = np.array(ct.fit_transform(X))

In [29]:
# Inspect X after one-hot encoding; the string column is gone and the
# indicator columns now lead each row.
print(X)

[[0.0 0.0 0.0 ... 322.0 126.0 8.3252]
 [0.0 0.0 0.0 ... 2401.0 1138.0 8.3014]
 [0.0 0.0 0.0 ... 496.0 177.0 7.2574]
 ...
 [0.0 1.0 0.0 ... 1007.0 433.0 1.7]
 [0.0 1.0 0.0 ... 741.0 349.0 1.8672]
 [0.0 1.0 0.0 ... 1387.0 530.0 2.3886]]


Multiple Linear Regression

In [30]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=0)

In [31]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares fit on the training rows.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [32]:
# Predict on the held-out rows and print predictions next to the true
# values: column 0 = predicted, column 1 = actual.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

[[216580.43 136900.  ]
 [287452.11 241300.  ]
 [179457.58 200700.  ]
 ...
 [ 89138.64 128600.  ]
 [255304.51 259500.  ]
 [213286.4  167600.  ]]


In [33]:
from sklearn.metrics import r2_score
# Coefficient of determination of the linear model on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.6381617983930308

Polynomial Regression

In [34]:
# Same 80/20 split and seed as the linear-regression section, under separate
# names for the polynomial model.
(X_train1, X_test1,
 y_train1, y_test1) = train_test_split(X, y, test_size=0.2, random_state=0)

In [35]:
from sklearn.preprocessing import PolynomialFeatures
# Expand the training features into all polynomial terms up to degree 4 and
# fit a linear model on the expanded matrix.
# NOTE(review): the test-set R^2 recorded for this model further down is
# negative, i.e. worse than predicting the mean. Presumably degree 4 on
# unscaled features overfits or is ill-conditioned; consider a lower degree
# and/or feature scaling. Confirm by re-running.
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit(X_train1).transform(X_train1)
regressor = LinearRegression()
regressor.fit(X=X_poly, y=y_train1)

In [36]:
# Apply the already-fitted polynomial expansion to the test rows, predict,
# and print predicted vs. actual values side by side.
y_pred1 = regressor.predict(poly_reg.transform(X_test1))
np.set_printoptions(precision=2)
print(np.column_stack((y_pred1, y_test1)))

[[131158.65 136900.  ]
 [268901.27 241300.  ]
 [154949.36 200700.  ]
 ...
 [130194.97 128600.  ]
 [274992.26 259500.  ]
 [174977.43 167600.  ]]


In [37]:
# Test-set R^2 of the polynomial model.
r2_score(y_true=y_test1, y_pred=y_pred1)

-0.4378297481253566

Random Forest Regression

In [38]:
from sklearn.model_selection import train_test_split
# Same 80/20 split and seed again, under separate names for the random
# forest.
(X_train2, X_test2,
 y_train2, y_test2) = train_test_split(X, y, test_size=0.2, random_state=0)


In [39]:
from sklearn.ensemble import RandomForestRegressor
# Ten-tree random forest; the fixed seed makes the fitted trees reproducible.
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
regressor.fit(X=X_train2, y=y_train2)

In [40]:
# Random-forest predictions on the test rows, printed next to the true
# values: column 0 = predicted, column 1 = actual.
y_pred2 = regressor.predict(X_test2)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred2, y_test2)))

[[148350. 136900.]
 [223320. 241300.]
 [146480. 200700.]
 ...
 [147190. 128600.]
 [216660. 259500.]
 [186650. 167600.]]


In [41]:
from sklearn.metrics import r2_score
# Test-set R^2 of the random forest.
r2_score(y_true=y_test2, y_pred=y_pred2)

0.8000450493606437

Support Vector Regression

In [43]:
(X_train3, X_test3,
 y_train3, y_test3) = train_test_split(X, y, test_size=0.2, random_state=0)
from sklearn.preprocessing import StandardScaler
# Separate scalers for features and target, both fitted on the training
# split only. The test features are scaled later, at prediction time.
sc_X, sc_y = StandardScaler(), StandardScaler()
X_train3 = sc_X.fit(X_train3).transform(X_train3)
# StandardScaler needs 2-D input, so the target is scaled as a single column.
y_train3 = sc_y.fit_transform(y_train3[:, np.newaxis])


In [45]:
from sklearn.svm import SVR
# RBF-kernel support vector regressor trained on the standardized data.
regressor = SVR(kernel = 'rbf')
# y_train3 is an (n, 1) column after scaling; SVR expects a 1-D target, and
# passing the column is what raises the `column_or_1d` DataConversionWarning.
# Flatten it here so the fit is warning-free (the fitted model is unchanged).
regressor.fit(X_train3, y_train3.ravel())

  y = column_or_1d(y, warn=True)


In [46]:
# Scale the test features with the training-fitted scaler, predict in scaled
# target space, then map the predictions back to the original units.
y_pred3 = sc_y.inverse_transform(
    regressor.predict(sc_X.transform(X_test3)).reshape(-1, 1)
)
np.set_printoptions(precision=2)
# Column 0 = predicted, column 1 = actual.
print(np.column_stack((y_pred3, y_test3)))

[[131247.8  136900.  ]
 [260900.5  241300.  ]
 [150296.82 200700.  ]
 ...
 [121468.77 128600.  ]
 [260980.52 259500.  ]
 [188015.08 167600.  ]]


In [47]:
# Test-set R^2 of the SVR model, computed in original target units.
r2_score(y_true=y_test3, y_pred=y_pred3)

0.7563026066784135