In [None]:
#RMSE and MAE

#RMSE squares the errors, so it is more sensitive to outliers than MAE; prefer MAE when the data contains a large number of outliers



![](images/Screenshot%20from%202022-10-11%2021-15-32.png)

![](images/Screenshot%20from%202022-10-11%2021-16-05.png)

In [None]:
import os, tarfile, urllib
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and its extracted contents;
        it is created when missing.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly here.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

In [None]:
#fetch_housing_data()   #uncomment for the first run; no need to run it again afterwards
#it only downloads and extracts the data

In [None]:
import pandas as pd

def housing_data_loader(housing_path = HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

In [None]:
housing = housing_data_loader()

In [None]:
housing.head(10)

In [None]:
#each row represents one district,
#hence we have a dataset of 20640 districts
print(housing.shape)
housing.info()
#ocean_proximity has dtype object, which means it can hold any type of data,
#but since we loaded a csv file it is definitely text

In [None]:
housing['ocean_proximity'].value_counts()

In [None]:
#looking at other fields
housing.describe()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
housing.hist(bins = 50, figsize=(20,15))

plt.show()

#### Train test split

In [None]:
#we will define a function to do so

import numpy as np

def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a train set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split.
    test_ratio : float
        Fraction of the rows that goes into the test set.

    Returns
    -------
    tuple
        ``(train_set, test_set)`` selected positionally from ``data``.

    Notes
    -----
    The shuffle uses NumPy's global random state, so the split differs
    between runs unless that state is seeded beforehand.
    """
    shuffled_indices = np.random.permutation(len(data))
    # Use the caller's ratio; it was previously ignored in favour of a fixed 0.2.
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
train_set, test_set = split_train_test(housing, 0.2)
len(train_set), len(test_set)

In [None]:
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into ``(train_set, test_set)`` based on a hash of ``id_column``.

    Each row goes to the test set when ``test_set_check`` accepts its
    identifier for the given ``test_ratio``; all other rows go to the train set.
    """
    def _belongs_to_test(row_id):
        return test_set_check(row_id, test_ratio)

    test_mask = data[id_column].apply(_belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
#The housing dataset has no unique identifier column,
#so reset_index() is used to expose the row index as an "index" column

housing_with_id = housing.reset_index()

train_set, test_set = split_train_test_by_id(
    data=housing_with_id, test_ratio=0.2, id_column="index"
)

In [None]:
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
housing_with_id #identifier built from the coordinates as longitude * 1000 + latitude

#### Same using Sklearn

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, random_state=42, test_size=0.2)

In [None]:
#use pd.cut() to bin median_income into five labelled income categories
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [None]:
housing.head()

In [None]:
housing['income_cat'].hist()
#median_income is now binned into income categories (strata) that we can use for stratified sampling

In [None]:
#Now we are ready to do the stratified sampling split
#(the unused `from cgi import test` was removed: the cgi module no longer
#exists in Python 3.13+, which made this cell fail on import)
from sklearn.model_selection import StratifiedShuffleSplit

#n_splits=1 means the loop below runs once and yields a single train/test pair
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [None]:
#Let’s see if this worked as expected. You can start by looking at the income
#category proportions in the test set:

strat_test_set['income_cat'].value_counts()/len(strat_test_set)

In [None]:
train_index

In [None]:
housing.income_cat.value_counts()/len(housing) #strat_test_set income ratio is similar to the overall dataset income ratio

In [None]:
#Now check with random train test split
train, test = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
train.income_cat.value_counts()/len(train)

In [None]:
#Now remove the income_cat attr so the stratified sets are back to the original columns
#drop() returns new frames and errors="ignore" keeps this cell safe to re-run
#(the previous in-place drop raised KeyError on a second run)

strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

#### Discover and Visualize the Data to Gain Insights

In [None]:
#Visualizing geo data by plotting
import seaborn as sns

In [None]:
sns.scatterplot(data=housing, x="longitude", y='latitude', hue='income_cat', alpha=0.1)


In [None]:
plt.scatter(x=housing.longitude, y=housing.latitude, alpha=0.1)

In [None]:
#marker size follows the district population, marker colour follows the median house value
housing.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=0.3,
    s=housing['population'] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.cm.jet,
    colorbar=True,
)
plt.legend()

In [None]:
housing.columns

In [None]:
z = housing['median_house_value']

In [None]:
import plotly.express as px
fig = px.scatter(housing,y='latitude', x='longitude',width=800, height=800,opacity=0.25, color='income_cat',size_max=0.01)
fig.show()

#### Looking for Correlations

In [None]:
#correlate numeric columns only: housing still contains the text column
#ocean_proximity and the categorical income_cat, which make DataFrame.corr()
#raise on recent pandas versions instead of being skipped silently
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
#another way to check correlation
#using scatter_matrix

from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
plt.show()

In [None]:
#clearly the most promising attribute to predict the median house value is the median income


In [None]:
housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=0.2)

#### Experimenting with Attribute Combinations

In [None]:
#A few less obvious straight lines are visible in the scatter plot above
#You might want to remove those districts so that the model doesn't learn from that data


In [None]:
#Creating new attributes as per the needs

housing['rooms_per_household'] = housing["total_rooms"]/housing["households"]

housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]

housing["population_per_household"] = housing["population"]/housing["households"]

In [None]:
#Now let's check the correlation again with the newly created columns
#(numeric columns only: the text and categorical columns cannot be correlated
#and make DataFrame.corr() raise on recent pandas versions)

corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix['median_house_value'].sort_values(ascending=True)

#### Prepare the data for ML Algorithms

In [None]:
strat_test_set.info()

#### Separating the predictors and the labels

In [None]:
#drop() returns a new DataFrame, so strat_train_set itself is left unchanged

housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop(columns="median_house_value")

#### Data Cleaning

In [None]:
#Three options for missing value

#1 Get rid of the corresponding districts

#2 Get rid of the whole attribute

#3 Set the values to some value (zero, the mean, the median, etc)

##### `housing.drop("total_bedrooms", axis=1)` option 2
##### `housing.dropna(subset=["total_bedrooms"])` option 1
##### `median = housing["total_bedrooms"].median()` option 3
##### `housing["total_bedrooms"].fillna(median, inplace=True)` option 3 (continued)

#### Using Sklearn.impute

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

In [None]:
#the median cannot be computed for a text attribute,
#hence we remove the ocean proximity column first

housing_num = housing.drop('ocean_proximity', axis=1)

imputer.fit(housing_num)  #fit the imputer on the numeric data

In [None]:
print(imputer.statistics_)   #all computed medians


In [None]:
housing_num.median().values  #same

#### Transforming the data using the Imputer

In [None]:
X = imputer.transform(housing_num)
X   #creates a numpy array

In [59]:
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [62]:
housing_tr.info()  #no null values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 12655 to 19773
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
dtypes: float64(8)
memory usage: 1.1 MB


#### Encoding categorical values to numerical values using encoder

In [86]:
#getting the text attribute of the housing dataset

housing_cat = housing[['ocean_proximity']]
housing_cat


#To categorize we will use sklearn.preprocessing

from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
housing_cat_encoded = encoder.fit_transform(housing_cat)

In [91]:
encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [95]:
#the ordinal encoder represents the categories above as consecutive numbers
#ML algorithms may then assume that two nearby numbers mean two similar categories, which is not the case for ocean_proximity

#so we will perform one hot encoding to create one dummy variable for each of the categories

from sklearn.preprocessing import OneHotEncoder

hot_encoder = OneHotEncoder()
housing_cat_1hot = hot_encoder.fit_transform(housing_cat)
housing_cat_1hot   #stored as a sparse matrix to save memory, i.e. so that the zeroes don't take any space

#to convert it to a dense array we can do

housing_cat_1hot.toarray()

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

#### Custom Transformers