In [None]:
import sklearn
import pandas as pd

# Load the housing dataset (path is relative to the notebook's working directory).
data = pd.read_csv('./sample_data/housing.csv')

# Integer codes for the categorical `ocean_proximity` column.
# NOTE(review): these codes impose an arbitrary ordering on a nominal feature;
# one-hot encoding may be a better fit for linear / distance-based models.
mapping = {'NEAR BAY': 0, 'INLAND': 1, 'NEAR OCEAN': 2, 'ISLAND': 3, '<1H OCEAN': 4}

# Fail loudly on categories the mapping does not know about: `map` would turn
# them into NaN, and the mean-imputation below would then hide them silently.
unknown_categories = set(data['ocean_proximity'].dropna().unique()) - set(mapping)
if unknown_categories:
    raise ValueError(f"Unmapped ocean_proximity values: {sorted(unknown_categories)}")

# `Series.map` produces a numeric column directly; `replace` on an object column
# relied on implicit object->int downcasting, which pandas 2.2 deprecates.
data['ocean_proximity'] = data['ocean_proximity'].map(mapping)

# Fill the remaining missing values of every column with that column's mean.
# NOTE(review): the means are computed on the full dataset, before the
# train/validation/test split, so held-out rows influence the imputed values.
data = data.fillna(data.mean())

# Optional experiment: drop the raw count features.
# data = data.drop(columns=['total_rooms', 'total_bedrooms', 'population'])

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffled split -- and every metric computed from it -- is
# reproducible across kernel restarts (same seed value the model cells use).
SPLIT_SEED = 42

# 70% train, then the remaining 30% is halved into validation and test (15% each).
data_train, data_rest = train_test_split(
    data, test_size=0.3, shuffle=True, random_state=SPLIT_SEED
)
data_valid, data_test = train_test_split(
    data_rest, test_size=0.5, shuffle=True, random_state=SPLIT_SEED
)

data_train.shape, data_valid.shape, data_test.shape

((14448, 10), (3096, 10), (3096, 10))

In [None]:
# Split the training frame into the feature matrix and the regression target
# (`median_house_value`).
x_train = data_train.drop('median_house_value', axis='columns')
y_train = data_train['median_house_value']

# Sanity check: features keep every column but the target; target is 1-D.
x_train.shape, y_train.shape

((14448, 9), (14448,))

In [None]:
from sklearn.linear_model import LinearRegression

# Linear-regression baseline fitted on the training features and target.
# (A stray bare `x_train` expression that sat mid-cell was removed: only a
# cell's last expression is displayed, so it had no effect.)
model = LinearRegression()

model.fit(x_train, y_train)

In [None]:
# Validation features (everything except the target) and the true target values.
x_valid = data_valid.drop(columns=['median_house_value'])
y_valid = data_valid['median_house_value']

# Predictions of the fitted `model` on the validation features.
y_pred = model.predict(x_valid)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# RMSE of the current predictions on the validation set: the square root of the
# mean squared error, i.e. an error expressed in the target's own units.
validation_mse = mean_squared_error(y_valid, y_pred)
root_mean_squared_error = np.sqrt(validation_mse)

print(root_mean_squared_error)

67557.49025576607


In [None]:
# Naive baseline: predict the training-set mean for every validation row.
# Note: `np.full_like` takes its shape and dtype from `y_valid`.
train_target_mean = y_train.mean()
mean_model = np.full_like(y_valid, fill_value=train_target_mean)

# RMSE of the constant baseline, for comparison with the fitted models.
baseline_mse = mean_squared_error(y_valid, mean_model)
np.sqrt(baseline_mse)

113633.9145992542

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the most recently computed `y_pred` against
# the validation target (the linear model's predictions when cells run in order).
r2_score(y_true=y_valid, y_pred=y_pred)

0.6464606075681094

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Bagged linear regression: average the validation predictions of several
# models, each fitted on a bootstrap resample (rows drawn with replacement)
# of the training set.
#
# LinearRegression is deterministic, so refitting it on the *same* data every
# iteration yields identical predictions and the average equals a single fit.
# Resampling is what makes the ensemble members differ.
iterations = 10
bootstrap_rng = np.random.default_rng(42)  # seeded so the cell is reproducible

# Loop-invariant: the validation feature matrix is the same for every member.
x_valid = data_valid.drop(columns=['median_house_value'])
n_train = len(x_train)

pred_sum = np.zeros(y_valid.size)

for i in range(iterations):
    # Positional bootstrap indices; `iloc` keeps features and target aligned
    # regardless of the frames' index labels.
    boot_idx = bootstrap_rng.integers(0, n_train, size=n_train)

    model = LinearRegression()
    model.fit(x_train.iloc[boot_idx], y_train.iloc[boot_idx])
    y_pred = model.predict(x_valid)
    pred_sum += y_pred

# After the loop, `model` / `y_pred` refer to the last bootstrap member only.
pred_sum_mean = pred_sum / iterations

r2_score(y_valid, pred_sum_mean)

0.6464606075681094

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest: 500 trees, 3 candidate features per split, unlimited depth,
# fixed seed for reproducibility.
forest_params = dict(n_estimators=500, max_features=3, max_depth=None, random_state=42)
rf_regressor = RandomForestRegressor(**forest_params)
rf_regressor.fit(x_train, y_train)

# Evaluate on the validation split (features = everything except the target).
x_valid = data_valid.drop(columns=['median_house_value'])
y_pred = rf_regressor.predict(x_valid)

forest_mse = mean_squared_error(y_valid, y_pred)
root_mean_squared_error = np.sqrt(forest_mse)

r2_score(y_valid, y_pred)

0.8234805853642465

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# k-NN is distance based, so features with large numeric ranges dominate the
# neighbour search when left unscaled. Standardising inside a pipeline puts all
# features on a comparable scale; the scaler is fitted on the training data only
# and the same transform is re-applied at predict time.
# NOTE(review): n_neighbors=1000 is kept from the original; it averages over a
# very large neighbourhood and is worth tuning on the validation set.
knn_regressor = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=1000),
)

knn_regressor.fit(x_train, y_train)

# Validation predictions and metrics (RMSE stored, R² displayed).
y_pred = knn_regressor.predict(data_valid.drop(columns=['median_house_value']))

root_mean_squared_error = np.sqrt(mean_squared_error(y_valid, y_pred))

r2_score(y_valid, y_pred)

0.18706780058255246

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree capped at depth 9, seeded for reproducibility.
decision_tree_regressor = DecisionTreeRegressor(random_state=42, max_depth=9)
decision_tree_regressor.fit(x_train, y_train)

# Evaluate on the validation split (features = everything except the target).
x_valid = data_valid.drop(columns=['median_house_value'])
y_pred = decision_tree_regressor.predict(x_valid)

tree_mse = mean_squared_error(y_valid, y_pred)
root_mean_squared_error = np.sqrt(tree_mse)

r2_score(y_valid, y_pred)

0.7016370215738841