# <font color=dimgray>Chapter 2 ~ Housing Prices</font>
_This notebook contains some sample code from Chapter 2._

# <font color=dimgray>Part 1 ~ Explore Data</font>

## <font color=blue>Import needed Modules</font>

In [None]:
##!pip install scikit-learn-intelex  #install patches to speed up processing (only need to run this ONCE on your computer or Colab session)
#from sklearnex import patch_sklearn #import the patches
#patch_sklearn()

In [None]:
#general libraries needed
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#special graphing modules used
from pandas.plotting import scatter_matrix

#scikit learn imports
import sklearn.linear_model
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

#not necessary but helps to visualize pipelines and models
from sklearn import set_config
set_config(display='diagram')

## <font color=blue>Function Definitions</font>

In [None]:
#function to verify the existence of a file in the current working directory and download it if not
import os,urllib, urllib.request, sys, tarfile
def downloadDataResource(file,sourcePath,compressed=None):
    """Make sure a data file exists in the current working directory, downloading it if needed.

    Parameters
    ----------
    file : str
        Name of the data file expected in the current working directory.
    sourcePath : str
        Base URL that the file (or archive) name is appended to directly, so it
        must already end with a '/'.
    compressed : str, optional
        Name of a tar archive at sourcePath that contains file. When given, the
        archive is downloaded instead of file and extracted into the current
        working directory.

    Returns
    -------
    None
        Progress and problems are reported with print(). Download and extraction
        failures are reported rather than raised, so a notebook run can continue
        to the next cell and fail there with a clear "file missing" error.
    """
    if os.path.isfile(file):
        print("Data resource", file, "already downloaded.")
        return

    #the thing we actually fetch is the archive when one is given, otherwise the file itself
    target = compressed if compressed else file

    try:
        urllib.request.urlretrieve(sourcePath+target, target)
    except Exception as err:
        #best-effort by design, but unlike a bare except this does not trap
        #KeyboardInterrupt/SystemExit, and the underlying reason is shown
        print("ERROR: File", target, "not found. Data source missing.")
        print("Reason:", err)
        return
    print("Downloaded", target)

    if not compressed:
        return

    try:
        #the context manager closes the archive even if extraction fails part-way
        with tarfile.open(compressed) as archive:
            if hasattr(tarfile, "data_filter"):
                #the archive comes from the network: refuse absolute paths, links
                #that escape the target directory, device files, etc.
                archive.extractall(filter="data")
            else:
                #older Python without extraction filters
                archive.extractall()
    except Exception as err:
        print("ERROR: File", compressed, "could not be extracted:", err)
        return
    print("File uncompressed.")

    if not os.path.isfile(file):
        print("WARNING: Archive", compressed, "did not contain", file)

## <font color=blue>Source Data</font>

In [None]:
path = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/'
compressedfile = "housing.tgz"
filename = 'housing.csv'

#download data files if not currently downloaded into the current working directory
downloadDataResource(filename, path, compressedfile)

#create the dataframe
housing = pd.read_csv(filename)

##  <font color=blue>Analyze the Dataset</font>

In [None]:
#each row of the data represents a district
housing.head()

In [None]:
#review attribute data types
#Note that all attributes are numeric except for the last one that is text/string
housing.info()

In [None]:
#perform statistical analysis on numeric attributes
housing.describe()

### <font color=blue>Look for Correlations in the Data</font>

In [None]:
housing.corr( numeric_only = True ) #addition of numeric_only helps to remove warning

### Explore what is Correlated with Predicted Value (Median House Value)

In [None]:
#the full correlation matrix above is hard to scan, so focus on the value we want to predict:
#rank every numeric attribute by its correlation with median_house_value (strongest positive first)
#note that corr() measures linear relationships; values near +1 or -1 indicate a strong linear relationship
corr_matrix = housing.corr( numeric_only = True )
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
#there is a linear relationship as home values increase as incomes increase (no surprise)

#another great way to look for more than just linear relationships is to create a scatter matrix plot
scatter_matrix(housing[['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']], figsize=(12,8))
plt.show()

In [None]:
#this of course continues to show the linear relationship between median income and median house value
#explore this relationship more specifically ...
housing.plot(kind="scatter", x="median_income", y="median_house_value",alpha=.1)
plt.show()

#note the price cap at 500,000

<font color=red>Because Median Income is so highly correlated with our predicted variable, we will want to stratify the train/test split on it to ensure well-balanced training and test data sets. See below!</font>

### <font color=blue>Explore Data Distribution</font>

In [None]:
#graph using matplotlib.pyplot all numeric attributes of housing
#a histogram shows data distribution 
housing.hist(bins=50, figsize=(20,15))
plt.show()

### Evaluate attribute <i>Median Income</i> and consider ways to normalize distribution

In [None]:
housing["median_income"].hist()
plt.show()

In [None]:
#create a new attribute that bins (pandas qcut function) Median Income into 5 different bins with equal distribution
housing["income_cat"] = pd.qcut(housing["median_income"], q=5, labels=[1, 2, 3, 4, 5])

#see that this column has a equal distribution of income data
housing["income_cat"].hist()
plt.show()

In [None]:
#re-create income_cat (replacing the equal-sized bins from the previous cell), this time cutting
#(pandas qcut function) Median Income into 5 bins at explicitly chosen quantiles
housing["income_cat"] = pd.qcut(housing["median_income"], q=[0, .15, .3, .5, .8,  1], labels=[1,2,3,4,5])
#NOTE that these quantile boundaries are deliberately uneven (to help demonstrate a point)
#evenly spaced boundaries such as quartiles would be [0,.25,.5,.75,1] - that gives 4 bins, so it would need 4 labels

#see that this column has a distribution based on the defined quantiles
housing["income_cat"].hist()
plt.show()

## Prepare Data

### Bin Median Income

In [None]:
#re-create income_cat once more (overwriting the qcut versions from the exploration cells above)
#pandas cut uses explicit bin edges on Median Income instead of quantiles; with the default right-closed
#intervals the 5 bins are (0, 1.5], (1.5, 3], (3, 4.5], (4.5, 6] and everything above 6
#this is the version of income_cat that the stratified split later in the notebook relies on
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

#see how the districts are distributed across the five income categories
housing["income_cat"].hist()
plt.show()

### <font color=blue>Analyze the Data some more</font>

In [None]:
#plot the training data set using the pandas plot wrapper
#Note: by making alpha less than 1, the graph is more transparent
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=.1)
plt.show()

In [None]:
#bubble map of the districts: position is longitude/latitude
#dot size (s) comes from population (scaled down by 100), dot color (c) from median house value
#"jet" is a predefined matplotlib color mapping; sharex=False turns off sharing of the x axis
housing.plot.scatter(
    x="longitude",
    y="latitude",
    s=housing["population"]/100,
    c="median_house_value",
    cmap="jet",
    colorbar=True,
    alpha=0.4,
    label="population",
    figsize=(10,7),
    sharex=False,
)
plt.show()

#### Population is Highly Correlated

In [None]:
#population drives the dot sizes in the graph above; check how it relates to the other attributes
#rank every numeric attribute by its correlation with population (strongest positive first)
housing.corr(numeric_only = True)["population"].sort_values(ascending=False)

### <font color=blue>Bring Context to Data</font>

In [None]:
#district totals are hard to compare, so express them per household:
#average number of rooms and average number of people in each household
housing["rooms_per_household"] = housing["total_rooms"].div(housing["households"])
housing["population_per_household"] = housing["population"].div(housing["households"])

#share of all rooms in a district that are bedrooms
housing["bedrooms_per_room"] = housing["total_bedrooms"].div(housing["total_rooms"])

## <font color=blue>Cleanse the Data</font>

### Explore Missing Values


In [None]:
#find attributes with missing values
housing.info()

In [None]:
#for the training set, only total_bedrooms is missing values
housing[ housing['total_bedrooms'].isna() ].head()

<font color=Red>NOTE that one option here is to drop all records/rows that have NaN values using <font color=black>housing.dropna(inplace=True)</font>. In doing so, however, there is data loss that may (or may not) lead to bias.</font>

In [None]:
#to preserve rows with NaN, we can use SimpleImputer to fill in missing values with the median value of each attribute
imputer = SimpleImputer(strategy="median")

In [None]:
#imputers only work on numeric attributes so we must drop ocean proximity column
#note that we are creating a new dataframe here called housing_num
housing_num = housing.drop(columns=['ocean_proximity'])

#now fit the imputer to the dataset
imputer.fit(housing_num)

#see the results
imputer.statistics_

In [None]:
#apply imputation to housing_num
#by default transform returns a plain NumPy array, so the column names and row index are not carried over
X=imputer.transform(housing_num)

#put this transformation back into a dataframe, restoring the original column names and row index
housing_tr=pd.DataFrame(X, columns = housing_num.columns, index = housing_num.index)

In [None]:
#the new training data set has no missing values
housing_tr.info()

### Convert Categorical Information into Numeric

In [None]:
#analyze the categorical attribute ocean_proximity and count how many rows (i.e. districts) belong to each category
housing["ocean_proximity"].value_counts()

In [None]:
#look at the top 10 rows in the dataframe 
housing["ocean_proximity"].head(10)

#### Create categorical information using Ordinal Encoder

In [None]:
#create an ordinal encoder object, then fit it to the ocean_proximity attribute and transform that attribute
#if you do not define categories, the ordinality will be defined for you
#in this case, we define the order ourselves: ISLAND is 0, NEAR OCEAN 1, NEAR BAY 2, <1H OCEAN 3, INLAND 4
proximity_ord = ['ISLAND', 'NEAR OCEAN', 'NEAR BAY', '<1H OCEAN', 'INLAND']
ordinal_encoder = OrdinalEncoder(categories=[proximity_ord])
housing_cat_encoded = ordinal_encoder.fit_transform(housing[['ocean_proximity']])
#Note: use of [[ ]] selects a one-column dataframe (2D) from the single attribute ocean_proximity
#a single [ ] would give a 1D series, which the encoder does not accept

#the result is a 2D array holding the numeric code assigned to each row's category
housing_cat_encoded[:10]

In [None]:
#here is the list of categories
ordinal_encoder.categories_

In [None]:
# store the encoded values as a new column in the dataframe
# the encoder output is a 2D array with a single column, so pick that column explicitly
housing['ocean_prox_ordinal'] = housing_cat_encoded[:, 0]
housing.head(10)

#### Create categorical information using One Hot Encoder

In [None]:
#create a one-hot encoder object and then fit & transform the data from the ocean proximity attribute
#the result has one 0/1 column per category and is stored as a sparse matrix (converted to a dataframe below)
cat_encoder=OneHotEncoder()
housing_cat_1hot= cat_encoder.fit_transform(housing[['ocean_proximity']])

In [None]:
#here is the list of categories that were created
cat_encoder.categories_[0]

In [None]:
#here is what the result looks like in a dataframe
#categories_ is a list holding one array per encoded attribute, so take element [0] to get flat column names
#(passing the whole list makes pandas treat it as the levels of a MultiIndex)
#reuse housing's index so each encoded row stays aligned with the row it came from
categorical_df = pd.DataFrame.sparse.from_spmatrix(housing_cat_1hot,
                                                   index=housing.index,
                                                   columns=cat_encoder.categories_[0])
categorical_df.head(10)

## <font color=blue>Create a Training and Test Set </font>

### Traditional Train / Test Split Method

In [None]:
#while there are several ways to generate a test vs. training data set, 
#the easiest method is to use the Scikit-learn function train_test_split

#train_test_split if seeded with the same random state, will generate the same 2 sets everytime
#this is beneficial when trying to compare models

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
#train_test_split returns 2 different dataframes

In [None]:
#report how many rows ended up in the full data set, the training set and the test set
print(f"Data set size: {len(housing):,}\nTraining set size: {len(train_set):,} \nTest set size (20%): {len(test_set):,}")

In [None]:
#look to see if the test and training sets are representative of the original dataframe with regards to the income distribution
#each column holds the share of rows that fall into every income category for one of the three sets
pd.DataFrame({
    name: subset["income_cat"].value_counts() / len(subset)
    for name, subset in [("Overall", housing), ("Training", train_set), ("Test", test_set)]
})

### Create a Stratified Sampling based on <i>Median Income</i>

In [None]:
#by adding the stratify feature to the train_test_split, can ensure a more representative collection of data
strat_train_set, strat_test_set = train_test_split(housing, test_size=0.2, random_state=42, stratify=housing.income_cat)

In [None]:
#report how many rows ended up in the full data set, the stratified training set and the stratified test set
print(f"Data set size: {len(housing):,}\nTraining set size: {len(strat_train_set):,} \nTest set size (20%): {len(strat_test_set):,}")

In [None]:
#see that the stratified test set is MORE representative of the original dataframe than before
#with regards to the income distribution
#each column holds the share of rows that fall into every income category for one of the three sets
pd.DataFrame({
    name: subset["income_cat"].value_counts() / len(subset)
    for name, subset in [("Overall", housing), ("Training", strat_train_set), ("Test", strat_test_set)]
})

# Now that we have explored the data and learned ways to transform it, move on to Part 2 to see how a model is built.