## No data visualisation in this notebook — just an end-to-end script

In [25]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from scipy import stats

### Reading the Data

In [2]:
# Load the housing dataset that sits next to the notebook (current working directory).
housing_data = pd.read_csv(os.path.join(os.getcwd(), "housing.csv"))

# Bucket median income into five categories so the train/test split can be
# stratified on income instead of being purely random.
housing_data["median_income_cat"] = pd.cut(
    housing_data["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

# A single stratified 80/20 split; the fixed seed keeps it reproducible.
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(
    strat_split.split(housing_data, housing_data["median_income_cat"])
)

# split() yields *positional* row indices, so select with .iloc: .loc would only
# be correct while the frame still carries its default 0..n-1 index.
# The helper income category is only needed for stratification, so it is dropped
# here; drop() returns new frames, which avoids mutating a row selection in place.
train_data_set = housing_data.iloc[train_idx].drop(columns="median_income_cat")
test_data_set = housing_data.iloc[test_idx].drop(columns="median_income_cat")

In [12]:
# Separate the predictors from the label the models will learn to predict.
target_column = "median_house_value"
X_train = train_data_set.drop(columns=[target_column])
y_train = train_data_set[target_column].copy()


In [16]:
# Positional column indices read by CombineAttributesAdder.transform.
# NOTE(review): these assume the column order of the numeric array that reaches
# the transformer (rooms at 3, bedrooms at 4, population at 5, households at 6)
# -- confirm against the dataset's column order.
rooms_idx = 3
bedrooms_idx = 4
population_idx = 5
household_idx = 6


class CombineAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features, computed from existing columns, to a 2-D array.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a bedrooms-per-room column is appended in addition to the
        rooms-per-household and population-per-household columns.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is indexed as ``X[:, i]``, so it must support 2-D positional
        indexing (e.g. a NumPy array).
        """
        households = X[:, household_idx]
        rooms = X[:, rooms_idx]

        # Column order of the additions: rooms/household, population/household,
        # then (optionally) bedrooms/room.
        extra_columns = [rooms / households, X[:, population_idx] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_idx] / rooms)

        return np.c_[(X, *extra_columns)]


In [17]:
# "ocean_proximity" is routed to the one-hot encoder; every other column of
# X_train goes through the numerical pipeline.
# (The spelling of `catergorical_attributes` is kept because the name is part of
# the notebook's namespace.)
catergorical_attributes = ["ocean_proximity"]
numerical_attributes = X_train.columns.drop(catergorical_attributes).tolist()

# Numerical branch: fill missing values with the column median, append the
# ratio features, then apply StandardScaler.
numerical_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("attributes_adder", CombineAttributesAdder()),
    ("std_scaler", StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Full preprocessing: apply each branch to its own columns and concatenate
# the results.
full_pipeline = ColumnTransformer(
    transformers=[
        ("numerical", numerical_pipeline, numerical_attributes),
        ("categorical", OneHotEncoder(), catergorical_attributes),
    ]
)



In [18]:
# Fit the preprocessing pipeline on the training features and keep the
# transformed training matrix that all models below are trained on.
X_train_prepared = full_pipeline.fit_transform(X_train)

### Linear Regression

In [19]:
# Baseline model: linear regression fitted on the preprocessed training data.
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, y_train)

LinearRegression()

In [22]:
# Sanity check: push the first five training rows through the already-fitted
# pipeline and compare the model's predictions with the true labels.
some_data = X_train.head(5)
some_labels = y_train.head(5)
some_data_prepared = full_pipeline.transform(some_data)
some_predictions = lin_reg.predict(some_data_prepared)
print("Predictions:", some_predictions)
print("Labels:", list(some_labels))

Predictions: [ 85657.90192014 305492.60737488 152056.46122456 186095.70946094
 244550.67966089]
Labels: [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]


In [26]:
# RMSE of the linear model on the training set itself: the predictions are made
# on the same rows the model was fitted on, so this is not a held-out estimate.
housing_predictions = lin_reg.predict(X_train_prepared)
lin_mse = mean_squared_error(y_train, housing_predictions)
lin_rmse = np.sqrt(lin_mse)  # square root puts the error back in the label's units
lin_rmse

68627.87390018745

### Decision Tree Regressor

In [27]:
# Decision tree fitted on the preprocessed training data.
# The estimator is seeded so that Restart & Run All reproduces the same tree
# (and the same scores below); 42 matches the seed used for the train/test split.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train_prepared, y_train)

DecisionTreeRegressor()

In [28]:
# RMSE of the decision tree on the training set itself: the predictions are made
# on the same rows the tree was fitted on, so this is not a held-out estimate.
housing_predictions = tree_reg.predict(X_train_prepared)
tree_mse = mean_squared_error(y_train, housing_predictions)
tree_rmse = np.sqrt(tree_mse)  # square root puts the error back in the label's units
tree_rmse

0.0

This looks like zero error, but it really means that the model has overfit the training set, which is not good for our model. Let's try the cross_val_score function, which splits the training data into ***k*** subsets (folds) and trains and evaluates the model ***k*** times, each time holding out a different fold. Then we can compare the scores.

In [29]:
# 10-fold cross-validation of the decision tree on the training data.
scores = cross_val_score(
    estimator=tree_reg,
    X=X_train_prepared,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer returns negated MSE values, so flip the sign before taking the
# square root to obtain one RMSE per fold.
tree_rmse_scores = np.sqrt(np.negative(scores))


In [30]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)
# Summarise the decision tree's per-fold RMSE values.
display_scores(tree_rmse_scores)

Scores: [73553.24356399 70776.63314137 68620.34438505 70816.00217939
 68968.92164705 76459.10750583 71204.42474235 73563.21406627
 68633.5453335  71825.98907021]
Mean: 71442.14256350111
Standard deviation: 2390.230460089131


    Now we see that the cross-validated error of this model is greater than the training error of Linear Regression. Let's run the same cross-validation for the linear regression model and compare.

In [32]:
# 10-fold cross-validation of the linear model on the training data.
scores = cross_val_score(
    estimator=lin_reg,
    X=X_train_prepared,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer returns negated MSE values, so flip the sign before taking the
# square root to obtain one RMSE per fold.
lin_rmse_scores = np.sqrt(np.negative(scores))
display_scores(lin_rmse_scores)

Scores: [71762.76364394 64114.99166359 67771.17124356 68635.19072082
 66846.14089488 72528.03725385 73997.08050233 68802.33629334
 66443.28836884 70139.79923956]
Mean: 69104.07998247063
Standard deviation: 2880.3282098180657


    Compared to DecisionTreeRegressor, this looks better. But a typical prediction error (RMSE) of about $69,104 is still not a good fit. Let's try a different model.

### Random Forest Regressor

In [33]:
# Random forest fitted on the preprocessed training data.
# The estimator is seeded so that Restart & Run All reproduces the same forest
# (and the same scores below); 42 matches the seed used for the train/test split.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_train_prepared, y_train)

RandomForestRegressor()

In [34]:
# RMSE of the random forest on the training set itself: the predictions are made
# on the same rows the forest was fitted on, so this is not a held-out estimate.
housing_predictions = forest_reg.predict(X_train_prepared)
forest_mse = mean_squared_error(y_train, housing_predictions)
forest_rmse = np.sqrt(forest_mse)  # square root puts the error back in the label's units
forest_rmse

18644.177904618315

In [35]:
# 10-fold cross-validation of the random forest on the training data.
scores = cross_val_score(
    estimator=forest_reg,
    X=X_train_prepared,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer returns negated MSE values, so flip the sign before taking the
# square root to obtain one RMSE per fold.
forest_rmse_scores = np.sqrt(np.negative(scores))
display_scores(forest_rmse_scores)

Scores: [51180.50530925 48712.60499157 46985.71589931 52301.2630304
 47559.7530078  51957.81566893 52726.67468378 49547.1461688
 48447.15547374 53894.68245356]
Mean: 50331.33166871456
Standard deviation: 2265.8872138206484


    This is not the best, but it is much better! You can save this model using the joblib module and reuse it on different data. Now we can check its output on the test set.

# TODO: Read up on Grid Search — this part is not completed yet!