In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Load the housing dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="housing.csv")

In [4]:
# Preview the raw frame; the rendered table is truncated to the first and last rows.
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [5]:
# Show each column's dtype and non-null count; a column whose non-null count is
# lower than the number of entries contains missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [6]:
# Discard every row that has at least one missing (NaN) value and rebind
# `data` to the cleaned frame.
data = data.dropna()

In [7]:
# Re-inspect after dropping rows: every column should now report the same non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


In [9]:
# Separate the predictors from the target so that a model can be fitted on one
# subset of the rows and evaluated on a different, held-out subset.

from sklearn.model_selection import train_test_split  # helper that splits data into training and testing subsets

x = data.drop(columns=["median_house_value"])  # features: every column except the target
y = data["median_house_value"]  # target: the value the model should predict

In [18]:
# Reserve 20% of the rows for evaluation. A fixed random_state makes the split,
# and every figure and number derived from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [19]:
# The test set is held back until a model exists; it is only used to evaluate that model.

# Next, join the training features (x_train) with the training target (y_train).

In [20]:
# Recombine the training features with their target column (aligned on the row
# index) so both can be explored together.
train_data = x_train.join(y_train, how="left")

In [32]:
%pylab
train_data.hist(figsize(15,8)) # get histogram of distribution of individual features

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)

In [34]:
# Visualise the correlation matrix of the numeric features as a heatmap.
# `figsize` is passed as a keyword; the previous `figsize(15, 8)` call returned
# None and only worked by mutating matplotlib's global defaults.
plt.figure(figsize=(15, 8))
# Restrict to numeric/boolean columns: the text column ocean_proximity cannot be
# correlated and makes DataFrame.corr() raise on pandas >= 2.0.
sns.heatmap(
    train_data.select_dtypes(include=["number", "bool"]).corr(),
    annot=True,
    cmap="crest",
)

# In this heatmap, median_income correlates clearly with median_house_value.

<AxesSubplot:>

In [None]:
# Data preprocessing:
# The count-like features are right-skewed (most of the mass sits on the left with
# a long tail to the right), so take their logarithm to get a more symmetric
# distribution.
#
# Each column is transformed from ITS OWN values. Previously total_bedrooms,
# population and households were all overwritten with a log of total_rooms.
#
# NOTE: this cell is not idempotent; re-running it applies the logarithm again.
for column in ("total_rooms", "total_bedrooms", "population", "households"):
    train_data[column] = np.log(train_data[column] + 1)  # +1 so a value of 0 does not produce log(0)




In [35]:
# Re-plot the feature histograms to inspect the distributions after the log transform.
# `figsize` must be given as a keyword; `figsize(15, 8)` is a function call whose
# None result was being passed as the `column` argument.
train_data.hist(figsize=(15, 8))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)

In [36]:
# ocean_proximity is a categorical text feature that should help predict house prices.
# Before it can be used it has to be one-hot encoded (one 0/1 column per category),
# so first look at which categories exist and how many rows fall into each.

train_data["ocean_proximity"].value_counts()

<1H OCEAN     7172
INLAND        5242
NEAR OCEAN    2108
NEAR BAY      1819
ISLAND           5
Name: ocean_proximity, dtype: int64

In [43]:
# One-hot encode ocean_proximity: every category (<1H OCEAN, INLAND, ISLAND,
# NEAR BAY, NEAR OCEAN) becomes its own column that holds 1 when the row belongs
# to that category and 0 otherwise; the original text column is then dropped.
#
# The result is assigned back to train_data. Previously it was only displayed, so
# the encoded columns never reached the cells that follow.
# dtype=int keeps the indicator columns as 0/1 integers (newer pandas versions
# default to booleans).

if "ocean_proximity" in train_data.columns:  # guard so that re-running the cell is harmless
    train_data = train_data.join(
        pd.get_dummies(train_data["ocean_proximity"], dtype=int)
    ).drop(columns=["ocean_proximity"])

train_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
18382,-121.83,37.23,7.0,5389.0,903.0,2232.0,825.0,6.6659,<1H OCEAN,500001.0,1,0,0,0,0
5261,-118.53,34.09,37.0,5477.0,833.0,1925.0,757.0,8.1888,<1H OCEAN,500001.0,1,0,0,0,0
17327,-120.08,34.62,11.0,3478.0,588.0,1693.0,582.0,4.6554,NEAR OCEAN,272300.0,0,0,0,0,1
12191,-117.38,33.67,9.0,13288.0,2728.0,7235.0,2350.0,3.3750,<1H OCEAN,131800.0,1,0,0,0,0
15085,-116.92,32.82,34.0,1765.0,284.0,772.0,282.0,5.0118,<1H OCEAN,165300.0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6649,-118.13,34.16,36.0,2162.0,658.0,1337.0,590.0,2.2095,<1H OCEAN,176700.0,1,0,0,0,0
13522,-117.41,34.23,17.0,889.0,131.0,439.0,141.0,6.1426,INLAND,155000.0,0,1,0,0,0
14284,-117.13,32.72,9.0,2436.0,720.0,1780.0,653.0,1.8299,NEAR OCEAN,137500.0,0,0,0,0,1
13900,-116.51,34.45,21.0,8502.0,2634.0,2330.0,991.0,1.3811,INLAND,51300.0,0,1,0,0,0


In [44]:
# With the one-hot ocean_proximity columns in train_data, the heatmap shows a
# negative correlation between INLAND and median_house_value
# (i.e. houses located inland tend to be cheaper).

# `figsize` is passed as a keyword; `figsize(15, 8)` returned None and only worked
# by mutating matplotlib's global defaults.
plt.figure(figsize=(15, 8))
# Only numeric/boolean columns can be correlated; any remaining text column would
# make DataFrame.corr() raise on pandas >= 2.0.
sns.heatmap(
    train_data.select_dtypes(include=["number", "bool"]).corr(),
    annot=True,
    cmap="crest",
)


<AxesSubplot:>

In [45]:
# Plot every training row at its coordinates, coloured by median_house_value.
# `figsize` is passed as a keyword; `figsize(15, 8)` returned None and only worked
# by mutating matplotlib's global defaults.
# NOTE(review): latitude is on the x axis and longitude on the y axis, so the
# picture is transposed relative to the usual map orientation - confirm this is intended.
plt.figure(figsize=(15, 8))
sns.scatterplot(x='latitude', y='longitude', data=train_data, hue="median_house_value", palette="coolwarm")

<AxesSubplot:xlabel='latitude', ylabel='longitude'>

In [None]:
# Feature Engineering

