### **SELECT AND TRAIN THE MODEL**

+ You framed the Problem.

+ You Explored the Data.

+ You sampled a training set and test set.

+ You wrote Transformation Pipelines.

+ You Prepared the Data for ML algos.

In [14]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:

## fetch the data from the csv file.
housing = pd.read_csv("housing.csv")

## purely random 80/20 split, kept for comparison with the stratified split below.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

## bucket median_income into 5 income categories; they are the strata
## used by the stratified split.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
                               labels=[1, 2, 3, 4, 5])

## stratified splitting, important: train and test sets keep the same
## income-category proportions as the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

## split() yields POSITIONAL row indices, so rows are selected with .iloc
## (label-based .loc only gives the same rows when the index is 0..n-1).
## n_splits is 1, so the generator produces exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

## remove the helper column again so both sets look like the original data.
## drop() returns new frames here instead of mutating slices of `housing` in place.
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")
    


In [22]:

## separate the target ("median_house_value") from the predictors.
## drop() returns a new frame, so strat_train_set itself is left untouched.
housing_labels = strat_train_set["median_house_value"].copy()
housing_data = strat_train_set.drop(columns="median_house_value")

## a median only exists for numeric columns,
## so the text attribute is left out for now.
housing_numeric_data = housing_data.drop(columns="ocean_proximity")

## create the imputer and let fit() learn one median per numeric column.
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_numeric_data)

## the learned medians are stored in statistics_;
## print them next to the medians pandas computes for comparison.
print("Imputer Statistics :--",imputer.statistics_)
print("Median Values which are Calculated :--",housing_numeric_data.median().values)

## use the fitted imputer to replace missing values with the learned medians.
X = imputer.transform(housing_numeric_data)
X.shape  ## the result is a plain NumPy array (not shown: not the last expression).

## wrap the NumPy array back into a DataFrame, restoring column names and row index.
## name = housing_transformed_simpleimputer
housing_tr_SI = pd.DataFrame(X,
                             index=housing_numeric_data.index,
                             columns=housing_numeric_data.columns)
housing_tr_SI.head(5)

Imputer Statistics :-- [-118.51     34.26     29.     2119.5     433.     1164.      408.
    3.5409]
Median Values which are Calculated :-- [-118.51     34.26     29.     2119.5     433.     1164.      408.
    3.5409]


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214
14650,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347


In [23]:
## ordinal encoding: map each ocean_proximity category to a numeric code.
## the double brackets keep a one-column DataFrame rather than a Series.
housing_data_cat = housing_data[["ocean_proximity"]]

## create the encoder, then learn the categories and encode in a single step.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_data_cat)
## show the first 10 encoded rows and the category list stored by the encoder.
print("Vaues:--",housing_cat_encoded[:10])
print("Given Categories:--",ordinal_encoder.categories_)



Vaues:-- [[0.]
 [0.]
 [4.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]]
Given Categories:-- [array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)]


In [24]:
## one hot encoding: one binary column per ocean_proximity category.

cat_encoder = OneHotEncoder()
housing_data_cat_1hot = cat_encoder.fit_transform(housing_data_cat)
## NOTE: this slice is not the last expression of the cell, so nothing is displayed for it.
housing_data_cat_1hot[:10]
## the encoder output is converted to a dense array only for printing.
print(housing_data_cat_1hot.toarray())

[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]]


In [25]:
## custom transformers

from sklearn.base import BaseEstimator, TransformerMixin

## column positions of the raw attributes inside the feature array
## (total_rooms, total_bedrooms, population, households).
rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6

class CombinedAttributesAdder(BaseEstimator,TransformerMixin):
    """Append ratio features to a 2-D feature array.

    Adds ``rooms_per_household`` and ``population_per_household`` as extra
    columns and, when ``add_bedrooms_per_room`` is true, ``bedrooms_per_room``
    as well. Columns are located with the module-level ``*_ix`` positions.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether the ``bedrooms_per_room`` column is appended.
    """

    def __init__(self,add_bedrooms_per_room=True):  ## no *args or **kargs
        self.add_bedrooms_per_room=add_bedrooms_per_room

    def fit(self,X,y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self,X,y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` may be a NumPy array or anything ``np.asarray`` accepts
        (for example a DataFrame); positional ``X[:, i]`` indexing only works
        on an array, so the input is coerced first.
        """
        X = np.asarray(X)
        households = X[:,households_ix]
        rooms_per_household=X[:,rooms_ix]/households
        population_per_household=X[:,population_ix]/households

        if self.add_bedrooms_per_room:
            bedrooms_per_room=X[:,bedrooms_ix]/X[:,rooms_ix]
            return np.c_[X,rooms_per_household,population_per_household,bedrooms_per_room]
        return np.c_[X,rooms_per_household,population_per_household]
        
## try the transformer on its own, without the bedrooms_per_room column.
## transform() is called directly: fit() of this transformer only returns self.
## .values passes the whole frame (ocean_proximity included) as a NumPy array.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs=attr_adder.transform(housing_data.values)

In [26]:
## numeric preprocessing pipeline, applied in this order:
##   1. fill missing values with the column median
##   2. append the combined ratio attributes
##   3. standardize every column
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

## fit every step on the numeric training columns and transform them in one go.
housing_data_num_transformed = numerical_pipeline.fit_transform(housing_numeric_data)

In [27]:


## column groups: the text column is one-hot encoded,
## every numeric column goes through the numeric pipeline.
category_attribs = ["ocean_proximity"]
numerical_attribs = housing_numeric_data.columns.tolist()

## one transformer that handles both groups and concatenates their outputs.
full_pipeline = ColumnTransformer(transformers=[
    ("numeric_values", numerical_pipeline, numerical_attribs),
    ("categorical_values", OneHotEncoder(), category_attribs),
])

## fit on the training predictors and produce the matrix the models train on.
housing_prepared = full_pipeline.fit_transform(housing_data)

In [17]:
## display the prepared feature matrix produced by full_pipeline.
housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

In [21]:
## from the previous cell,
## we have the data to train the model.
#print(housing_prepared)

**TRAINING AND EVALUATING ON THE TRAINING SET**

Training Linear Regression Model.

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## fit a linear regression on the prepared training data (fit() returns the estimator).
lin_reg = LinearRegression().fit(housing_prepared, housing_labels)

## predict on the very same rows the model was trained on,
## so the value below is a TRAINING error.
housing_predictions = lin_reg.predict(housing_prepared)

## root of the mean squared error, in the same unit as the labels.
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)

In [29]:
## display the linear model's RMSE measured on the training set.
lin_rmse

68628.19819848923

+ Most districts' median house values are between `$120000` and `$265000`.

+ So a typical prediction error (RMSE) of `$68628` is not a good result.

+ This is an Example of Underfitting.

+ Model is Underfitting the Training Data.

+ This can happen because the features do not provide enough information to make good predictions.

+ OR Model is not Powerful enough.

+ To fix Underfitting, the methods are 

        + Select More Powerful Model.
        
        + Feed model with better features.
        
        + Reduce the Constraints on the Model.
        


+ This model is not regularized, so the last option (reducing constraints) can be ruled out.

Training Decision Tree Regressor.

+ This is a Powerful Model, Capable of finding Complex Nonlinear Relationships in the Data.

In [30]:
from sklearn.tree import DecisionTreeRegressor

## fixed seed: tree construction involves randomness, so without it
## the fitted tree (and every score computed from it) changes between runs.
tree_reg = DecisionTreeRegressor(random_state=42)

tree_reg.fit(housing_prepared, housing_labels)

## evaluate on the training rows themselves: this is a TRAINING error only.
housing_predictions = tree_reg.predict(housing_prepared)

tree_mse = mean_squared_error(housing_labels, housing_predictions)

tree_rmse = np.sqrt(tree_mse)

print(tree_rmse)

0.0


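+ Here it is showing no error at all on the training set (RMSE = 0.0).

+ That does not mean the model is perfect: it was evaluated on the same data it was trained on, so this is an example of badly overfitting the data.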

+ You don't want to touch the test set until you are ready to Launch the model.

+ So you need to Use a part of training set for training and part of it for Model Validation.

**CROSS VALIDATION METHOD**

+ Cross Validation is a method to Validate the model by splitting the training set to Different folds.

+ Here it will Randomly split the training set into different folds.

+ then it trains and evaluate the selected ML algorithm for the number of folds. eg 10 here.

+ The result is an Array containing 10 evaluating scores.

In [32]:
def display_scores(scores):
    """Print the per-fold scores followed by their mean and standard deviation.

    `scores` must provide .mean() and .std() (e.g. a NumPy array).
    Nothing is returned; the three lines are written to stdout.
    """
    report = (
        ("Scores :--", scores),
        ("The Mean Value :--", scores.mean()),
        ("The Std Devaition", scores.std()),
    )
    for label, value in report:
        print(label, value)

In [40]:
## cross validation done for decision tree model.
from sklearn.model_selection import cross_val_score

## 10-fold cross validation of the decision tree on the prepared training data.
## the scorer returns the NEGATIVE mean squared error, one value per fold.
tree_scores = cross_val_score(
    tree_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)

## flip the sign back, then take the square root to get one RMSE per fold.
tree_rmse_scores = np.sqrt(np.negative(tree_scores))

display_scores(tree_rmse_scores)

Scores :-- [70460.02017297 67729.55444058 72159.53178909 69051.91457446
 69527.11053348 73530.36354937 71457.72184771 69128.96105289
 76055.38672939 71325.81268787]
The Mean Value :-- 71042.63773777967
The Std Devaition 2324.9578790664114


+ Now the DecisionTreeModel is not looking great.

+ Cross Validation Gives two Metrics.

    + Validation Scores
    + Standard Deviation.
    
    
+ Cross Validation gives you not only an estimate of the performance of your model,

+ but also a measure of how precise this estimate is (its standard deviation).

Decision Tree output = `71042 +- 2324`

In [41]:
## 10-fold cross validation of the linear model on the prepared training data.
## the scorer returns the NEGATIVE mean squared error, one value per fold.
lin_scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)

## flip the sign back, then take the square root to get one RMSE per fold.
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
display_scores(lin_rmse_scores)

Scores :-- [66782.73843989 66960.118071   70347.95244419 74739.57052552
 68031.13388938 71193.84183426 64969.63056405 68281.61137997
 71552.91566558 67665.10082067]
The Mean Value :-- 69052.46136345083
The Std Devaition 2731.6740017983484


Linear Regression Output = `69052 +- 2731`

+ The Decision Tree model is overfitting badly.

+ It performs worse than the Linear Regression model.

Random Forest Regressor Model.

+ Random Forest works by training many Decision Trees on random subsets of features, then averaging out their predictions.

+ Building a model on top of other Models is called Ensemble Learning.

+ We will often get better performance this way.

In [42]:
from sklearn.ensemble import RandomForestRegressor

## fixed seed: the forest is built from random samples of rows and features,
## so without it every run reports different scores.
forest_reg = RandomForestRegressor(random_state=42)

forest_reg.fit(housing_prepared, housing_labels)

## evaluate on the training rows themselves: this is a TRAINING error only.
housing_predictions = forest_reg.predict(housing_prepared)

forest_mse = mean_squared_error(housing_labels, housing_predictions)

forest_rmse = np.sqrt(forest_mse)

forest_rmse

18671.50930417007

In [44]:
## 10-fold cross validation of the random forest on the prepared training data.
## the scorer returns the NEGATIVE mean squared error, one value per fold.
forest_scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)

## flip the sign back, then take the square root to get one RMSE per fold.
forest_rmse_scores = np.sqrt(np.negative(forest_scores))
display_scores(forest_rmse_scores)

Scores :-- [49535.30665498 47427.08974674 50042.62400595 52404.14851748
 49570.51903801 53647.32101315 48573.99115511 47868.05724351
 53010.14366044 50150.82269695]
The Mean Value :-- 50223.00237323133
The Std Devaition 2031.50317120755


Random Forest output :-- `50223 +- 2031`


+ Random Forest looks very Promising.

+ The error (RMSE) on the training set (about `18671`) is still much lower than on the validation folds (about `50223`).

+ i.e., the model is still overfitting the training set.

+ Possible Solutions for Overfitting problems:--

    + Simplify the Model.
    
    + Constrain the Model (Regularize the Model).
    
    + Get more training data.

Trying out more Models like.

+ Support Vector Machine :-- with Different Kernel.

+ Neural Network :-- without much tweaking.

Support Vector Machine Model.

In [46]:
## using the kernel = linear

from sklearn.svm import SVR

## support vector regressor with a linear kernel (fit() returns the estimator).
svm_reg = SVR(kernel="linear").fit(housing_prepared, housing_labels)

## predict on the training rows themselves: this is a TRAINING error only.
housing_predictions = svm_reg.predict(housing_prepared)

## root of the mean squared error, in the same unit as the labels.
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)

print(svm_rmse)

111094.6308539982


In [47]:
## 10-fold cross validation of the linear-kernel SVR on the prepared training data.
## the scorer returns the NEGATIVE mean squared error, one value per fold.
svm_scores = cross_val_score(
    svm_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)

## flip the sign back, then take the square root to get one RMSE per fold.
svm_rmse_scores = np.sqrt(np.negative(svm_scores))
display_scores(svm_rmse_scores)

Scores :-- [105342.09141998 112489.24624123 110092.35042753 113403.22892482
 110638.90119657 115675.8320024  110703.56887243 114476.89008206
 113756.17971227 111520.1120808 ]
The Mean Value :-- 111809.84009600841
The Std Devaition 2762.393664321567


SVM output :-- `111809 +- 2762`

+ The kernel used here is "linear".

In [49]:
## using the kernel = "rbf"

from sklearn.svm import SVR

## support vector regressor with an RBF kernel (fit() returns the estimator).
svm_reg_rbf = SVR(kernel="rbf").fit(housing_prepared, housing_labels)

## predict on the training rows themselves: this is a TRAINING error only.
housing_predictions = svm_reg_rbf.predict(housing_prepared)

## root of the mean squared error, in the same unit as the labels.
svm_mse_rbf = mean_squared_error(housing_labels, housing_predictions)
svm_rmse_rbf = np.sqrt(svm_mse_rbf)

print(svm_rmse_rbf)

118580.68301157995


In [51]:
## 10-fold cross validation of the RBF-kernel SVR on the prepared training data.
## the scorer returns the NEGATIVE mean squared error, one value per fold.
svm_rbf_scores = cross_val_score(
    svm_reg_rbf,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)

## flip the sign back, then take the square root to get one RMSE per fold.
svm_rmse_rbf_scores = np.sqrt(np.negative(svm_rbf_scores))
display_scores(svm_rmse_rbf_scores)

Scores :-- [111389.0681902  119541.25938571 116957.62830414 120447.19932481
 117618.15904234 122309.10351544 117634.40230741 121469.713921
 120343.01369623 118017.12860651]
The Mean Value :-- 118572.66762937943
The Std Devaition 2936.8775867949425


SVM Output :-- `118572 +- 2936`

+ The kernel used here is "rbf".

**Trained 4 Model Types (5 Configurations).**


+ Linear Regression :-- Output = `69052 +- 2731`

+ Decision Tree Regression :-- Output =  `71042 +- 2324`

+ Random Forest Regression :-- Output = `50223 +- 2031`

+ Support Vector Machine :-- Output = `111809 +- 2762` with kernel `linear`

+ Support Vector Machine :-- Output = `118572 +- 2936` with kernel `rbf`

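Here the best model is the Random Forest Regressor.

+ The outputs above are cross-validated RMSE values (prediction errors), so lower is better: Random Forest has the lowest error (`50223 +- 2031`), while both SVM models have the highest errors and are the worst of the five as configured here.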

**SAVE THE MODEL**

In [52]:
import joblib

## write the fitted linear model to disk, then load it back from the same file.
## my_model_loaded is a scratch name: each save cell below reassigns it.
joblib.dump(lin_reg, "linear_Regression.pkl")
my_model_loaded = joblib.load("linear_Regression.pkl")

In [53]:
## write the fitted decision tree to disk, then load it back from the same file.
joblib.dump(tree_reg, "DecisionTree_Regression.pkl")
my_model_loaded = joblib.load("DecisionTree_Regression.pkl")

In [54]:
## write the fitted random forest to disk, then load it back from the same file.
joblib.dump(forest_reg, "RandomForest.pkl")
my_model_loaded = joblib.load("RandomForest.pkl")

In [None]:
## write the fitted linear-kernel SVM to disk, then load it back.
## the load path must be the exact file name that was just written.
joblib.dump(svm_reg, "SVM_linear.pkl")
my_model_loaded = joblib.load("SVM_linear.pkl")