In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
data = pd.read_csv("housing.csv")

In [4]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [5]:
data.info() # to check for non-null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [6]:
data.dropna(inplace=True) # takes data, drop not a number values and saves result in data obj

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


In [86]:
# split data into training data and x and y data
# to train model on one set of data then train it on another

from sklearn.model_selection import train_test_split # imports train_test_split func to split datasets into training and testing subsets for ML tasks

x = data.drop(['median_house_value'], axis=1) # drop a column, axis specifies whether operation shud be done on rows (axis = 0) or columns (axis=1)
y = data['median_house_value']

In [87]:
x_train, x_test, y_train, y_test = train_test_split (x, y, test_size= 0.2) # 20% of data reserved for evaluating

In [88]:
# test set is something you look at when you u have ur model and wanna evaluate on test data

# now we wanna join x training data and y training data

In [89]:
train_data = x_train.join(y_train)

In [90]:
%pylab
train_data.hist(figsize(15,8)) # get histogram of distribution of individual features

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)

In [91]:
# use heatmap to visualize a correlation matrix
plt.figure(figsize(15,8))
sns.heatmap(train_data.corr(), annot=True, cmap="crest")

# ^ in this heat map u can see that the median income correlates nicely w/ median house value 

<AxesSubplot:>

In [92]:
# Data preprocessing:
# We're going to take the logarithm of features that are left-leaning (technical term is right skewed)
# and see what the distribution looks like by using this:

train_data['total_rooms'] = np.log(train_data['total_rooms']+1) # add 1 to prevent 0 values
train_data['total_bedrooms'] = np.log(train_data['total_rooms']+1)  
train_data['population'] = np.log(train_data['total_rooms']+1)  
train_data['households'] = np.log(train_data['total_rooms']+1)  

# * using logarithm can help us improve performance of certain ML algorithms
# esp those that assume linear relationships between features and the target variable


In [93]:
train_data.hist(figsize(15,8))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)

In [94]:
# We want to use ocean proximity to guess ocean prices
# So we're turning the categorical features to ordinally encode them (turn them into binary) 

train_data.ocean_proximity.value_counts()

<1H OCEAN     7213
INLAND        5223
NEAR OCEAN    2076
NEAR BAY      1829
ISLAND           5
Name: ocean_proximity, dtype: int64

In [95]:
# we're creating a new feature for each of the ocean proximity values and turn them into 1 or 0 binary values
# if it's less than 1 hour = 1 
# if it's more than 1 hour = 0

train_data.join(pd.get_dummies(train_data.ocean_proximity)).drop(['ocean_proximity'],axis=1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
12171,-117.15,33.70,2.0,8.749257,2.277191,2.277191,2.277191,3.1319,111500.0,0,1,0,0,0
2190,-120.06,36.72,32.0,6.889591,2.065544,2.065544,2.065544,1.8000,60400.0,0,1,0,0,0
17437,-120.45,34.64,17.0,7.112327,2.093385,2.093385,2.093385,3.2167,112500.0,0,0,0,0,1
9002,-118.34,33.99,48.0,7.067320,2.087821,2.087821,2.087821,3.8250,183000.0,1,0,0,0,0
20183,-119.16,34.28,30.0,6.025866,1.949598,1.949598,1.949598,4.0000,219200.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929,-122.03,37.55,26.0,8.035279,2.201137,2.201137,2.201137,4.9118,217300.0,0,0,0,1,0
5952,-117.85,34.12,30.0,8.382061,2.238799,2.238799,2.238799,3.0448,192100.0,0,1,0,0,0
6038,-117.73,34.07,33.0,7.561122,2.147231,2.147231,2.147231,2.3406,122600.0,0,1,0,0,0
16147,-122.47,37.78,52.0,8.013674,2.198743,2.198743,2.198743,4.0208,414600.0,0,0,0,1,0


In [96]:
# here you can see a negative correlation between median house value will have negative correlation with inland
# aka: (if you're inland you pay less for your house)

plt.figure(figsize(15,8))
sns.heatmap(train_data.corr(), annot=True, cmap="crest")


<AxesSubplot:>

In [97]:
# visualize coordinations
plt.figure(figsize(15,8))
sns.scatterplot(x='latitude', y='longitude', data=train_data, hue="median_house_value", palette="coolwarm")

<AxesSubplot:xlabel='latitude', ylabel='longitude'>

In [98]:
# Feature Engineering

# how many bedrooms per room 
train_data['bedroom_ratio'] = train_data['total_bedrooms']/train_data['total_rooms']

# how many rooms per household
train_data['household_rooms'] = train_data['total_rooms']/train_data['households']

In [99]:
plt.figure(figsize(15,8))
sns.heatmap(train_data.corr(), annot=True, cmap="crest")
# ^ shows that bedroom ratio has a negative correlation with median house value per block


<AxesSubplot:>

In [101]:
# Train a linear regression
# A linear regression is a method to model the relationship between a dependent variable and one or more
# independent variables

from sklearn.linear_model import LinearRegression

# split into x and y and add new features to evaluate the model (i.e. bedroom ratio, household rooms)

x_train = train_data.drop(['median_house_value'], axis=1)
y_train = train_data['median_house_value']

reg = LinearRegression()
reg.fit(x_train, y_train) # we didnt scale the data tho here


ValueError: could not convert string to float: 'INLAND'

In [None]:
LinearRegression()

In [None]:
# This one we're just evaluating without hyper param tuning

test_data = x_test.join(y_test)

test_data['total_rooms'] = np.log(test_data['total_rooms']+1) # add 1 to prevent 0 values
test_data['total_bedrooms'] = np.log(test_data['total_rooms']+1)  
test_data['population'] = np.log(test_data['total_rooms']+1)  
test_data['households'] = np.log(test_data['total_rooms']+1)  

test_data.join(pd.get_dummies(train_data.ocean_proximity)).drop(['ocean_proximity'],axis=1)

test_data['bedroom_ratio'] = train_data['total_bedrooms']/train_data['total_rooms']
test_data['household_rooms'] = train_data['total_rooms']/train_data['households']



In [103]:
test_data 

NameError: name 'test_data' is not defined