In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw housing dataset and preview its first five rows.
housing_csv_path = "../data/housing.csv"
data = pd.read_csv(filepath_or_buffer=housing_csv_path)
data.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# Inspect the spread of median income before it is binned into categories.
data.loc[:, "median_income"].describe()

count    20640.000000
mean         3.870671
std          1.899822
min          0.499900
25%          2.563400
50%          3.534800
75%          4.743250
max         15.000100
Name: median_income, dtype: float64

In [4]:
# Bucket median income into five ordinal categories (1 = lowest, 5 = highest) that the
# train/test split is later stratified on.
# `np` is already imported in the notebook's first cell, so it is not re-imported here.
bins = [0.0, 1.5, 3.0, 4.5, 6.0, np.inf]
income_labels = [1, 2, 3, 4, 5]
data["income_cat"] = pd.cut(data["median_income"], bins=bins, labels=income_labels)

# pd.cut yields NaN for values that fall outside the right-closed bins (for example an
# income of exactly 0.0, or a missing income). Fail here with a clear message rather
# than later in the stratified split.
assert data["income_cat"].notna().all(), "median_income has values outside the income bins"

In [5]:
# Spot-check the binning on ten random rows; the fixed seed keeps the displayed rows
# identical across Restart & Run All.
data[["median_income", "income_cat"]].sample(10, random_state=12312)

Unnamed: 0,median_income,income_cat
13574,3.125,3
10314,6.057,5
11215,4.8981,4
7692,6.7501,5
16786,5.5845,4
9774,2.3,2
19032,5.9441,4
10866,5.4836,4
891,6.2179,5
10135,4.9766,4


## Train/Test Split and Pipeline Creation

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn import set_config

# Render estimators and pipelines as diagrams in notebook output.
set_config(display="diagram")

In [7]:
# Hold out 20% of the rows as a test set. Stratifying on the income category keeps the
# category proportions approximately equal in both splits.
split_options = {
    "test_size": 0.2,
    "random_state": 12312,
    "stratify": data["income_cat"],
}
strat_train_set, strat_test_set = train_test_split(data, **split_options)

In [8]:
# Separate features (X) from the target (y) for both splits. `income_cat` is removed
# together with the target so it is not fed to the models as a feature.
target_column = 'median_house_value'
non_feature_columns = [target_column, 'income_cat']

housing = strat_train_set.drop(columns=non_feature_columns)    # training features (X)
housing_labels = strat_train_set[target_column]                # training target (y)

test_set = strat_test_set.drop(columns=non_feature_columns)    # test features (X)
test_labels = strat_test_set[target_column]                    # test target (y)

In [9]:
# Divide the training-set columns into numerical and categorical attributes.

num_attributes = housing.select_dtypes(include=[np.number]).columns.tolist()

# Treat every non-numeric column as categorical instead of matching only `object` dtype.
# A column that ends up in neither list is not passed to any transformer of the
# ColumnTransformer built below, whose default `remainder='drop'` discards it silently.
cat_attributes = housing.select_dtypes(exclude=[np.number]).columns.tolist()

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numerical branch: fill missing values with the column mean, then standardise.
num_steps = [
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

num_pipeline

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as an all-zero
# row. With the default ('error') transform raises instead, so a category missing from
# the training split would break prediction on the test set or on new data.
cat_pipeline = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

cat_pipeline

In [12]:
from sklearn.compose import ColumnTransformer

# Route each column group through its own sub-pipeline; the outputs are concatenated.
column_branches = [
    ('num', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes),
]
preprocessing_pipeline = ColumnTransformer(transformers=column_branches)

preprocessing_pipeline

## Model Creation
- Linear Regression
- Decision Tree
- Random Forest

In [13]:
# Linear Regression

from sklearn.base import clone
from sklearn.linear_model import LinearRegression

# Give this pipeline its own unfitted copy of the preprocessing step. Pipeline keeps the
# object it is handed, so passing `preprocessing_pipeline` itself would make every model
# pipeline share, and refit, a single transformer instance.
lin_reg_pipeline = Pipeline([
    ('preprocessing', clone(preprocessing_pipeline)),
    ('lin_reg', LinearRegression())
])

lin_reg_pipeline

In [14]:
# Decision Tree

from sklearn.base import clone
from sklearn.tree import DecisionTreeRegressor

# clone() gives this pipeline its own unfitted copy of the preprocessing step, so fitting
# it does not touch the transformer instance held by the other model pipelines.
# random_state pins the tree's randomised choices so the reported scores are reproducible;
# the value matches the seed used for the train/test split.
tree_reg_pipeline = Pipeline([
    ('preprocessing', clone(preprocessing_pipeline)),
    ('tree_reg', DecisionTreeRegressor(random_state=12312))
])

tree_reg_pipeline

In [15]:
# Random Forest

from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

# clone() gives this pipeline its own unfitted copy of the preprocessing step, so fitting
# it does not touch the transformer instance held by the other model pipelines.
# random_state pins the forest's bootstrap sampling and split choices so the reported
# scores and the saved model are reproducible; the value matches the split seed.
forest_reg_pipeline = Pipeline([
    ('preprocessing', clone(preprocessing_pipeline)),
    ('forest_reg', RandomForestRegressor(random_state=12312))
])

forest_reg_pipeline

In [16]:
# Fit each pipeline on the training split, in the order linear -> tree -> forest.
# fit() returns the fitted pipeline, which is what the *_model names are bound to.

lin_model, tree_model, random_forest_model = (
    pipeline.fit(housing, housing_labels)
    for pipeline in (lin_reg_pipeline, tree_reg_pipeline, forest_reg_pipeline)
)

In [17]:
# Predict house values for the held-out test features with every fitted model.
lin_predicted_labels, tree_predicted_labels, random_forest_predicted_labels = (
    fitted_model.predict(test_set)
    for fitted_model in (lin_model, tree_model, random_forest_model)
)

In [18]:
from sklearn.metrics import root_mean_squared_error

# Test-set RMSE for each model (same units as the target; lower is better).
# NOTE(review): the test set is used here to compare models; if it is also meant to give
# the final unbiased estimate, consider comparing models with cross-validation instead.
lin_rmse, tree_rmse, random_forest_rmse = (
    root_mean_squared_error(test_labels, predictions)
    for predictions in (
        lin_predicted_labels,
        tree_predicted_labels,
        random_forest_predicted_labels,
    )
)

print(f'lin_rmse: {lin_rmse}')
print(f'tree_rmse: {tree_rmse}')
print(f'random_forest_rmse: {random_forest_rmse}')

lin_rmse: 68267.52671689572
tree_rmse: 66600.37657564289
random_forest_rmse: 47787.8472150164


In [19]:
from sklearn.metrics import r2_score

# Test-set coefficient of determination for each model (higher is better).
lin_r2_score, tree_r2_score, random_forest_r2_score = (
    r2_score(test_labels, predictions)
    for predictions in (
        lin_predicted_labels,
        tree_predicted_labels,
        random_forest_predicted_labels,
    )
)

print(f'lin_r2_score: {lin_r2_score}')
print(f'tree_r2_score: {tree_r2_score}')
print(f'random_forest_r2_score: {random_forest_r2_score}')

lin_r2_score: 0.635645408412036
tree_r2_score: 0.6532238037831832
random_forest_r2_score: 0.8214619274327363


# Save the best model

In [20]:
from pathlib import Path

import joblib

# Persist the fitted random-forest pipeline (preprocessing + regressor), which had the
# lowest RMSE and highest R² of the three models evaluated above.
model_path = '../model/random_forest_model.pkl'

# joblib.dump does not create missing directories, so make sure the target folder exists.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(random_forest_model, model_path)

['../model/random_forest_model.pkl']