In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [6]:
!wget 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'

zsh:1: command not found: wget


In [5]:
df = pd.read_csv('housing.csv')

In [13]:
label = 'median_house_value'
features = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity'
]

### Data preparation

In [8]:
df[features].dtypes

latitude              float64
longitude             float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
dtype: object

In [9]:
df.median_house_value.head()

0    452600.0
1    358500.0
2    352100.0
3    341300.0
4    342200.0
Name: median_house_value, dtype: float64

In [10]:
df[features] = df[features].fillna(0)

In [11]:
df[features].isnull().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
dtype: int64

In [12]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [14]:
df_full = df[features + [label]]
df_full.columns

Index(['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'median_house_value'],
      dtype='object')

In [15]:
df_full['rooms_per_household'] = df_full['total_rooms'] / df_full['households']
df_full['bedrooms_per_room'] = df_full['total_bedrooms'] / df_full['total_rooms']
df_full['population_per_household'] = df_full['population'] / df_full['households']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_full['rooms_per_household'] = df_full['total_rooms'] / df_full['households']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_full['bedrooms_per_room'] = df_full['total_bedrooms'] / df_full['total_rooms']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_full['population_per_household'] = df_fu

In [16]:
df_full.dtypes

latitude                    float64
longitude                   float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
ocean_proximity              object
median_house_value          float64
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
dtype: object

#### Q1. 
* What is the most frequent observation (mode) for the column ocean_proximity?

In [22]:
max_val = df_full.ocean_proximity.value_counts().max()
df_full.ocean_proximity.value_counts().where(lambda x : x == max_val).dropna()

<1H OCEAN    9136.0
Name: ocean_proximity, dtype: float64

#### Question 2
* Create the correlation matrix for the numerical features of your train dataset.
* In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
* What are the two features that have the biggest correlation in this dataset?

In [23]:
df_full.dtypes

latitude                    float64
longitude                   float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
ocean_proximity              object
median_house_value          float64
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
dtype: object

In [26]:
categorical = list(df_full.dtypes[df_full.dtypes == 'object'].index)
categorical

['ocean_proximity']

In [28]:
numerical = [c for c in df_full.columns if c not in categorical and c != label]
numerical

['latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [29]:
import itertools

comb_features = list(itertools.combinations(numerical, 2))
comb_features

[('latitude', 'longitude'),
 ('latitude', 'housing_median_age'),
 ('latitude', 'total_rooms'),
 ('latitude', 'total_bedrooms'),
 ('latitude', 'population'),
 ('latitude', 'households'),
 ('latitude', 'median_income'),
 ('latitude', 'rooms_per_household'),
 ('latitude', 'bedrooms_per_room'),
 ('latitude', 'population_per_household'),
 ('longitude', 'housing_median_age'),
 ('longitude', 'total_rooms'),
 ('longitude', 'total_bedrooms'),
 ('longitude', 'population'),
 ('longitude', 'households'),
 ('longitude', 'median_income'),
 ('longitude', 'rooms_per_household'),
 ('longitude', 'bedrooms_per_room'),
 ('longitude', 'population_per_household'),
 ('housing_median_age', 'total_rooms'),
 ('housing_median_age', 'total_bedrooms'),
 ('housing_median_age', 'population'),
 ('housing_median_age', 'households'),
 ('housing_median_age', 'median_income'),
 ('housing_median_age', 'rooms_per_household'),
 ('housing_median_age', 'bedrooms_per_room'),
 ('housing_median_age', 'population_per_household'),

In [31]:
corr_matrix = df_full[numerical].corr()
corr_matrix

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.924664,0.011173,-0.0361,-0.065318,-0.108785,-0.071035,-0.079809,0.106389,-0.104112,0.002366
longitude,-0.924664,1.0,-0.108197,0.044568,0.068082,0.099773,0.05531,-0.015176,-0.02754,0.084836,0.002476
housing_median_age,0.011173,-0.108197,1.0,-0.361262,-0.317063,-0.296244,-0.302916,-0.119034,-0.153277,0.125396,0.013191
total_rooms,-0.0361,0.044568,-0.361262,1.0,0.920196,0.857126,0.918484,0.19805,0.133798,-0.174583,-0.024581
total_bedrooms,-0.065318,0.068082,-0.317063,0.920196,1.0,0.866266,0.966507,-0.007295,0.002717,0.122205,-0.028019
population,-0.108785,0.099773,-0.296244,0.857126,0.866266,1.0,0.907222,0.004834,-0.072213,0.031397,0.069863
households,-0.071035,0.05531,-0.302916,0.918484,0.966507,0.907222,1.0,0.013033,-0.080598,0.059818,-0.027309
median_income,-0.079809,-0.015176,-0.119034,0.19805,-0.007295,0.004834,0.013033,1.0,0.326895,-0.573836,0.018766
rooms_per_household,0.106389,-0.02754,-0.153277,0.133798,0.002717,-0.072213,-0.080598,0.326895,1.0,-0.387465,-0.004852
bedrooms_per_room,-0.104112,0.084836,0.125396,-0.174583,0.122205,0.031397,0.059818,-0.573836,-0.387465,1.0,0.003047


#### Make `median_house_value` binary

* We need to turn the `median_house_value` variable from numeric into binary.
* Let's create a variable `above_average` which is 1 if the `median_house_value` is above its mean value and 0 otherwise.

In [42]:
mean_house_val = df_full.median_house_value.mean()
df_full['above_average'] = (df_full['median_house_value'] > mean_house_val).astype(int)
label = 'above_average'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_full['above_average'] = (df_full['median_house_value'] > mean_house_val).astype(int)


In [43]:
df_full.above_average.head()

0    1
1    1
2    1
3    1
4    1
Name: above_average, dtype: int64

### Split the data
* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
* Make sure that the target value (median_house_value) is not in your dataframe.

In [39]:
from sklearn.model_selection import train_test_split

In [52]:
df_full[numerical + categorical + [label]].columns

Index(['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'rooms_per_household', 'bedrooms_per_room', 'population_per_household',
       'ocean_proximity', 'above_average'],
      dtype='object')

In [53]:
df_full_train, df_test = train_test_split(df_full[numerical + categorical + [label]], test_size=0.2, random_state=42)

len(df_full_train), len(df_test)

(16512, 4128)

In [54]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [55]:
df_train = df_train.reset_index(drop= True)
df_test = df_test.reset_index(drop= True)
df_val = df_val.reset_index(drop= True)

In [56]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [57]:
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [58]:
df_train.dtypes

latitude                    float64
longitude                   float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
ocean_proximity              object
dtype: object

In [60]:
df_full_train = df_full_train.reset_index(drop=True)
df_full_train.isnull().sum()

latitude                    0
longitude                   0
housing_median_age          0
total_rooms                 0
total_bedrooms              0
population                  0
households                  0
median_income               0
rooms_per_household         0
bedrooms_per_room           0
population_per_household    0
ocean_proximity             0
above_average               0
dtype: int64

In [61]:
from sklearn.metrics import mutual_info_score

In [64]:
round(mutual_info_score(df_full_train.above_average, df_full_train.ocean_proximity),2)

0.1

#### Question 4
* Now let's train a logistic regression
* Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
* Fit the model on the training dataset.
    * To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    * model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [65]:
from sklearn.feature_extraction import DictVectorizer 
from sklearn.linear_model import LogisticRegression

In [67]:
train_dict = df_train.to_dict(orient='records')

dv =  DictVectorizer(sparse=False)

X_train = dv.fit_transform(train_dict)

In [68]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

model.fit(X_train, y_train)

In [69]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [75]:
# soft prediction
y_val_pred = model.predict_proba(X_val)[:,1]
val_above_avg = (y_val_pred >= 0.5)

val_above_avg

array([False, False,  True, ...,  True,  True, False])

In [79]:
## accuracy on validation dataset
org_accuracy = round(( y_val == val_above_avg).mean(), 2)
org_accuracy 


0.84

#### Question 5
* Let's find the least useful feature using the feature elimination technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [78]:
new_features = categorical + numerical
new_features

['ocean_proximity',
 'latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [86]:
feature_elimination_list = []
for fet in new_features:
    print('Excluding the feature: ', fet)

    features_exc = [f for f in features if f != fet]
    df_train_exc = df_train[features_exc]
    df_val_exc = df_val[features_exc]

    train_exc_dict = df_train_exc.to_dict(orient='records')
    val_exc_dict = df_val_exc.to_dict(orient='records')

    dv = DictVectorizer(sparse=False)

    X_train_exc = dv.fit_transform(train_exc_dict)
    X_val_exc = dv.transform(val_exc_dict)

    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_exc, y_train)

    y_val_pred_exc = model.predict_proba(X_val_exc)[:,1]
    val_above_avg_exc = (y_val_pred_exc >= 0.5)

    accuracy_exc = round(( y_val == val_above_avg_exc).mean(), 2)

    feature_elimination = {}

    feature_elimination['feature'] = fet
    feature_elimination['accuracy'] = accuracy_exc
    feature_elimination['diff'] = round(abs(org_accuracy - accuracy_exc), 3)

    feature_elimination_list.append(feature_elimination)
    

    

Excluding the feature:  ocean_proximity
Excluding the feature:  latitude
Excluding the feature:  longitude
Excluding the feature:  housing_median_age
Excluding the feature:  total_rooms
Excluding the feature:  total_bedrooms
Excluding the feature:  population
Excluding the feature:  households
Excluding the feature:  median_income
Excluding the feature:  rooms_per_household
Excluding the feature:  bedrooms_per_room
Excluding the feature:  population_per_household


In [87]:
feature_elimination_list


[{'feature': 'ocean_proximity', 'accuracy': 0.82, 'diff': 0.02},
 {'feature': 'latitude', 'accuracy': 0.83, 'diff': 0.01},
 {'feature': 'longitude', 'accuracy': 0.83, 'diff': 0.01},
 {'feature': 'housing_median_age', 'accuracy': 0.83, 'diff': 0.01},
 {'feature': 'total_rooms', 'accuracy': 0.84, 'diff': 0.0},
 {'feature': 'total_bedrooms', 'accuracy': 0.84, 'diff': 0.0},
 {'feature': 'population', 'accuracy': 0.82, 'diff': 0.02},
 {'feature': 'households', 'accuracy': 0.83, 'diff': 0.01},
 {'feature': 'median_income', 'accuracy': 0.79, 'diff': 0.05},
 {'feature': 'rooms_per_household', 'accuracy': 0.84, 'diff': 0.0},
 {'feature': 'bedrooms_per_room', 'accuracy': 0.84, 'diff': 0.0},
 {'feature': 'population_per_household', 'accuracy': 0.84, 'diff': 0.0}]