In [1]:
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
# Read the housing CSV (path is relative to the working directory) into a dataframe,
# then preview five randomly chosen rows.
housing_data = pd.read_csv(filepath_or_buffer='housing/housing.csv')

housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
12623,-121.53,38.48,5.0,27870.0,5027.0,11935.0,4855.0,4.8811,212200.0,INLAND
9804,-121.92,36.55,44.0,3494.0,635.0,693.0,415.0,3.6,452800.0,NEAR OCEAN
11900,-117.37,33.94,14.0,9286.0,1269.0,3565.0,1238.0,6.6635,219600.0,INLAND
3742,-118.39,34.19,36.0,904.0,191.0,627.0,191.0,2.4167,192900.0,<1H OCEAN
9114,-118.22,34.63,4.0,14348.0,2145.0,5839.0,1806.0,5.3799,222400.0,INLAND


In [3]:
# Discard every row that has a missing value in any column.
housing_data = housing_data.dropna(axis=0, how='any')

In [4]:
# (rows, columns) of the dataframe after rows with missing values were dropped.
housing_data.shape

(20433, 10)

In [5]:
# Per-column count of non-null entries among the rows whose median_house_value is exactly 500001.
# NOTE(review): 500001 is presumably the dataset's top-coded ceiling value -- confirm against the data source.
housing_data[housing_data['median_house_value'].eq(500001)].count()

longitude             958
latitude              958
housing_median_age    958
total_rooms           958
total_bedrooms        958
population            958
households            958
median_income         958
median_house_value    958
ocean_proximity       958
dtype: int64

In [6]:
# Remove the rows whose median_house_value is exactly 500001, identified by their index labels.
housing_data = housing_data.drop(
    index=housing_data[housing_data['median_house_value'].eq(500001)].index
)

In [7]:
# (rows, columns) after the rows with median_house_value == 500001 were removed.
housing_data.shape

(19475, 10)

In [8]:
# List the distinct category labels found in the ocean_proximity column.
pd.unique(housing_data['ocean_proximity'])

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [9]:
# One-hot encode ocean_proximity: the column is replaced by one indicator column per category,
# each named 'ocean_proximity_<category>'.
housing_data = pd.get_dummies(data=housing_data, columns=['ocean_proximity'])

In [10]:
# Preview five random rows to check the new indicator columns.
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
11475,-118.02,33.71,23.0,5554.0,995.0,2408.0,936.0,5.3886,331900.0,1,0,0,0,0
2446,-119.61,36.56,34.0,1911.0,497.0,1886.0,481.0,1.625,53000.0,0,1,0,0,0
15641,-122.42,37.8,52.0,3321.0,1115.0,1576.0,1034.0,2.0987,458300.0,0,0,0,1,0
13253,-117.68,34.11,16.0,3190.0,471.0,1414.0,464.0,5.5292,208600.0,0,1,0,0,0
9832,-121.93,36.63,33.0,1740.0,342.0,638.0,329.0,3.1912,319800.0,0,0,0,0,1


In [11]:
# (rows, columns) after one-hot encoding ocean_proximity.
housing_data.shape

(19475, 14)

In [13]:
# Each row carries the median house value of one neighborhood. This dataset is typically used for
# regression, but it can also serve for classification: the median of those values becomes the
# threshold that splits the rows into two classes.

median = housing_data.loc[:, 'median_house_value'].median()

median

173800.0

In [14]:
# Binary label: True where the row's median_house_value is strictly greater than the overall median.
housing_data['above_median'] = housing_data['median_house_value'].gt(median)

In [15]:
# Preview five random rows to check the new above_median column.
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
20384,-118.94,34.18,24.0,3689.0,585.0,1898.0,581.0,5.9224,239400.0,1,0,0,0,0,True
9745,-121.7,36.67,37.0,641.0,129.0,458.0,142.0,3.3456,252600.0,1,0,0,0,0,True
4587,-118.27,34.05,37.0,350.0,245.0,1122.0,248.0,2.7634,137500.0,1,0,0,0,0,False
19667,-120.82,37.5,21.0,2974.0,495.0,1313.0,461.0,4.4886,135400.0,0,1,0,0,0,False
17613,-121.93,37.29,36.0,2241.0,437.0,989.0,442.0,3.9625,288200.0,1,0,0,0,0,True


In [16]:
# Set up the features and the labels for the classification model.
# Y: the boolean 'above_median' column created in the previous step.
# X: every remaining column except the raw target 'median_house_value' and the
#    'above_median' label derived from it (keeping either would leak the answer).
Y = housing_data['above_median']
X = housing_data.drop(columns=['median_house_value', 'above_median'])

In [17]:
# This is a binary classification problem.
# Show the names of the feature columns that will be fed to the model.
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# random_state pins the shuffle, so the split -- and every score computed from it -- is identical
# on each Restart & Run All. Without it the rows land in different sets on every run and the
# reported accuracies cannot be reproduced.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [19]:
# (rows, columns) of the training features and of the held-out test features.
x_train.shape, x_test.shape

((15580, 13), (3895, 13))

In [21]:
from sklearn.linear_model import LogisticRegression

# Build the LogisticRegression estimator, then train it: fit() starts the training process
# and hands back the fitted estimator.
#
# solver='liblinear' names the algorithm used under the hood to solve the logistic regression
# problem; liblinear is a good choice for small datasets and binary classification.

logistic_model = LogisticRegression(solver='liblinear')
logistic_model = logistic_model.fit(x_train, y_train)

In [22]:
# Score of the fitted model on the same rows it was trained on.
print('Training_score : ', logistic_model.score(X=x_train, y=y_train))

Training_score :  0.820795892169448


In [23]:
# For classification problems the default score is accuracy: the share of predictions
# that the model got right.

# Next, evaluate on the test data: predict a label for every held-out row.

y_pred = logistic_model.predict(X=x_test)

In [24]:
# Before judging the predictions, line them up next to the true labels in one dataframe.
# The frame is built from y_test, so its rows keep y_test's index.

df_pred_actual = y_test.to_frame(name='actual').assign(predicted=y_pred)[['predicted', 'actual']]

df_pred_actual.head(n=10)

Unnamed: 0,predicted,actual
19244,False,True
17830,True,True
18759,False,False
1981,True,True
12992,False,False
13314,False,False
8028,True,True
5157,False,False
10967,True,True
4086,True,True


In [25]:
from sklearn.metrics import accuracy_score

# Accuracy of the test-set predictions measured against the true test labels.
print('Testing_score : ', accuracy_score(y_true=y_test, y_pred=y_pred))

Testing_score :  0.8156611039794609


In [None]:
# The accuracy score is pretty high for this model. An accuracy of about 0.816 means that the model's
# predictions are correct about 81.6% of the time.