# 2.5. Prepare the Data for Machine Learning Algorithms

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build the working frame in one chained expression:
#   1. read the raw CSV,
#   2. drop the `ocean_proximity` column so only numerical attributes remain,
#   3. append three ratio features derived from the existing columns.
housing = (
    pd.read_csv("housing.csv")
    .drop(columns="ocean_proximity")
    .assign(
        rooms_per_household=lambda df: df["total_rooms"] / df["households"],
        bedrooms_per_room=lambda df: df["total_bedrooms"] / df["total_rooms"],
        population_per_household=lambda df: df["population"] / df["households"],
    )
)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=77)

# Discard every training row that contains at least one missing value.
train_set = train_set.dropna()

# Separate the target column from the predictors.
# NOTE: `housing` is rebound here to the training predictors only, so it no
# longer contains `median_house_value`. Re-running this cell without first
# re-running the data-loading cell therefore fails on the drop below.
housing_labels = train_set["median_house_value"].copy()
housing = train_set.drop(columns="median_house_value")

In [4]:
# Summary statistics of the training predictors (the label column has
# already been split off, and rows with missing values have been dropped).
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
count,16356.0,16356.0,16356.0,16356.0,16356.0,16356.0,16356.0,16356.0,16356.0,16356.0,16356.0
mean,-119.573795,35.639796,28.631756,2631.18617,536.586023,1419.991807,498.207447,3.866145,5.430706,0.213129,3.090379
std,2.008811,2.13821,12.596261,2185.318541,419.035109,1111.483289,379.745509,1.897953,2.573172,0.057546,11.625191
min,-124.35,32.55,1.0,2.0,1.0,3.0,1.0,0.4999,0.846154,0.1,0.692308
25%,-121.81,33.93,18.0,1449.0,296.0,786.0,280.0,2.56,4.433867,0.175552,2.428532
50%,-118.49,34.26,29.0,2122.0,434.0,1162.0,408.0,3.5288,5.226528,0.203345,2.814815
75%,-118.01,37.72,37.0,3133.25,645.0,1718.0,602.0,4.7404,6.048902,0.240237,3.281487
max,-114.31,41.95,52.0,39320.0,6445.0,28566.0,6082.0,15.0001,141.909091,1.0,1243.333333


# 2.6. Select and Train a Model

## 2.6.1. Training and Evaluating on the Training Set

In [5]:
from sklearn.linear_model import LinearRegression

# Baseline model: a linear regression fitted on the training predictors
# (`housing`) against the training labels (`housing_labels`).
lin_reg = LinearRegression()
# `fit` is left as the cell's last expression so the fitted estimator is displayed.
lin_reg.fit(housing, housing_labels)

LinearRegression()

In [6]:
from sklearn.metrics import mean_squared_error

# Score the linear model on the very rows it was trained on. This is a
# training error, so on its own it says nothing about generalization.
housing_predictions = lin_reg.predict(housing)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)

# The square root puts the error back in the units of `median_house_value`.
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68218.27774882574

In [7]:
from sklearn.tree import DecisionTreeRegressor

# Second candidate: a decision tree with default hyperparameters.
# `random_state` is fixed so repeated runs build the same tree.
tree_reg = DecisionTreeRegressor(random_state=42)
# Fit on the training predictors and labels; the fitted estimator is displayed.
tree_reg.fit(housing, housing_labels)

DecisionTreeRegressor(random_state=42)

In [8]:
# Training-set RMSE of the decision tree.
# A value of 0.0 means the tree reproduces every training label exactly;
# treat that as a warning sign of overfitting, not as evidence of a perfect
# model. Only an evaluation on unseen data can tell the two apart.
housing_predictions = tree_reg.predict(housing)
tree_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

## 2.7.5. Evaluate Your System on the Test Set

In [9]:
# Apply the same missing-value policy to the test split as to the training
# split: rows with any missing value are dropped.
test_set = test_set.dropna()

# Target first, then the predictors (everything except the target column).
y_test = test_set["median_house_value"].copy()
X_test = test_set.drop(columns="median_house_value")

In [10]:
# This cell repeats the import and the fit from section 2.6.1: the same
# estimator class is fitted on the same `housing` / `housing_labels`.
# NOTE(review): presumably kept so this section can be run on its own once
# the data-preparation cells have executed — confirm before removing.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing, housing_labels)

LinearRegression()

In [11]:
from sklearn.metrics import mean_squared_error

# Test-set RMSE of the linear model.
# NOTE: this rebinds `housing_predictions`, `lin_mse` and `lin_rmse`, so the
# training-set values computed earlier under the same names are replaced.
housing_predictions = lin_reg.predict(X_test)
lin_mse = mean_squared_error(y_true=y_test, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

70451.26518090801

In [12]:
# This cell repeats the import and the fit from section 2.6.1 with the same
# `random_state`, training predictors and training labels.
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing, housing_labels) # fit(X, y): training predictors first, then training labels

DecisionTreeRegressor(random_state=42)

In [13]:
# Test-set RMSE of the decision tree. Compare this with the 0.0 it scored
# on the training data to see how far the training error was from the
# error on unseen rows.
housing_predictions = tree_reg.predict(X_test)
tree_mse = mean_squared_error(y_true=y_test, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

72337.8117519875

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Third candidate: a random forest of 100 trees, seeded for repeatable results.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
# Fit on the training predictors and labels; the fitted estimator is displayed.
forest_reg.fit(housing, housing_labels)

RandomForestRegressor(random_state=42)

In [16]:
# Test-set RMSE of the random forest.
housing_predictions = forest_reg.predict(X_test)
forest_mse = mean_squared_error(y_true=y_test, y_pred=housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse
# The random forest has the lowest test RMSE of the three models evaluated
# in this section (linear regression, decision tree, random forest).
# NOTE(review): the comparison is made on the test set itself, so the winning
# score is likely an optimistic estimate — consider choosing the model with
# cross-validation on the training set and using the test set only once.

52177.862347226444