In [421]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, mean_squared_error

In [422]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [423]:

# Read the housing CSV from the working directory and preview the first
# five records transposed (one column per record).
df = pd.read_csv('housing.csv')
df.head().transpose()

Unnamed: 0,0,1,2,3,4
longitude,-122.23,-122.22,-122.24,-122.25,-122.25
latitude,37.88,37.86,37.85,37.85,37.85
housing_median_age,41.0,21.0,52.0,52.0,52.0
total_rooms,880.0,7099.0,1467.0,1274.0,1627.0
total_bedrooms,129.0,1106.0,190.0,235.0,280.0
population,322.0,2401.0,496.0,558.0,565.0
households,126.0,1138.0,177.0,219.0,259.0
median_income,8.3252,8.3014,7.2574,5.6431,3.8462
median_house_value,452600.0,358500.0,352100.0,341300.0,342200.0
ocean_proximity,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY


Data preparation

In [424]:
# Replace every missing value in the frame with 0, then show the per-column
# null counts to confirm none are left.
df = df.fillna(0)
df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [425]:
# Add three ratio features built from the raw count columns.
df = df.assign(
    # total_rooms divided by households
    rooms_per_household=df['total_rooms'] / df['households'],
    # total_bedrooms divided by total_rooms
    bedrooms_per_room=df['total_bedrooms'] / df['total_rooms'],
    # population divided by households
    population_per_household=df['population'] / df['households'],
)

df.head().T

Unnamed: 0,0,1,2,3,4
longitude,-122.23,-122.22,-122.24,-122.25,-122.25
latitude,37.88,37.86,37.85,37.85,37.85
housing_median_age,41.0,21.0,52.0,52.0,52.0
total_rooms,880.0,7099.0,1467.0,1274.0,1627.0
total_bedrooms,129.0,1106.0,190.0,235.0,280.0
population,322.0,2401.0,496.0,558.0,565.0
households,126.0,1138.0,177.0,219.0,259.0
median_income,8.3252,8.3014,7.2574,5.6431,3.8462
median_house_value,452600.0,358500.0,352100.0,341300.0,342200.0
ocean_proximity,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY


In [426]:
# Question 1: which value of `ocean_proximity` occurs most often (the mode)?

df['ocean_proximity'].mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

Split the data

In [427]:
# Hold out 20% of the rows as the test set, then take 20% of the remaining
# rows as validation (i.e. 16% of all rows); both splits use the same seed.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.2, random_state=42)
(df_train.shape, df_val.shape)


((13209, 13), (3303, 13))

In [428]:

# Give each split a fresh 0..n-1 index (the shuffled original index is dropped).
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Pull the target out of each frame: `pop` returns the column and removes it,
# so the frames are left holding features only.
y_train = df_train.pop('median_house_value').values
y_val = df_val.pop('median_house_value').values
y_test = df_test.pop('median_house_value').values

Question 2 
- Create the correlation matrix for the numerical features of your train dataset.
- In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?


In [429]:
# Column dtypes of the training frame, to tell the numeric columns apart from
# the single object (string) column.
df_train.dtypes

longitude                   float64
latitude                    float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
ocean_proximity              object
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
dtype: object

In [430]:
# Names of all numeric columns in the training frame, in column order.
numerical = list(df_train.select_dtypes(include='number').columns)

In [431]:
# Pairwise Pearson correlation between the numeric training features.
df_train.loc[:, numerical].corr(method='pearson')


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
longitude,1.0,-0.924648,-0.099794,0.039035,0.064607,0.0937,0.050609,-0.013163,-0.035467,0.099603,0.0123
latitude,-0.924648,1.0,0.003247,-0.028851,-0.060535,-0.102423,-0.064677,-0.080827,0.120881,-0.121119,-0.003891
housing_median_age,-0.099794,0.003247,1.0,-0.362182,-0.322832,-0.290909,-0.304716,-0.125801,-0.180858,0.133877,0.012429
total_rooms,0.039035,-0.028851,-0.362182,1.0,0.931785,0.853763,0.920937,0.201013,0.161927,-0.194255,-0.029712
total_bedrooms,0.064607,-0.060535,-0.322832,0.931785,1.0,0.87781,0.979132,-0.007004,0.007617,0.077486,-0.034761
population,0.0937,-0.102423,-0.290909,0.853763,0.87781,1.0,0.907327,0.002083,-0.077202,0.030611,0.064448
households,0.050609,-0.064677,-0.304716,0.920937,0.979132,0.907327,1.0,0.014912,-0.086746,0.057382,-0.032996
median_income,-0.013163,-0.080827,-0.125801,0.201013,-0.007004,0.002083,0.014912,1.0,0.382606,-0.616635,-0.000848
rooms_per_household,-0.035467,0.120881,-0.180858,0.161927,0.007617,-0.077202,-0.086746,0.382606,1.0,-0.488754,0.00187
bedrooms_per_room,0.099603,-0.121119,0.133877,-0.194255,0.077486,0.030611,0.057382,-0.616635,-0.488754,1.0,-0.002556


In [432]:
# Rank feature pairs by correlation, most negative first.
# Only the numeric columns are used: `ocean_proximity` holds strings, and
# calling `.corr()` on the whole frame fails on pandas versions where
# `numeric_only` defaults to False.
corr_matrix = df_train[numerical].corr()

# The matrix is symmetric, so keep just the entries strictly above the
# diagonal: every pair then appears exactly once and the trivial
# self-correlations (1.0) are left out.
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr_matrix.where(above_diagonal).unstack().dropna().sort_values()


longitude                 latitude                   -0.924648
median_income             bedrooms_per_room          -0.616635
rooms_per_household       bedrooms_per_room          -0.488754
total_rooms               housing_median_age         -0.362182
housing_median_age        total_bedrooms             -0.322832
households                housing_median_age         -0.304716
population                housing_median_age         -0.290909
total_rooms               bedrooms_per_room          -0.194255
housing_median_age        rooms_per_household        -0.180858
                          median_income              -0.125801
latitude                  bedrooms_per_room          -0.121119
                          population                 -0.102423
longitude                 housing_median_age         -0.099794
households                rooms_per_household        -0.086746
median_income             latitude                   -0.080827
population                rooms_per_household        -0

In [433]:
# Binary targets: 1 when the house value is above the mean of its own split.
# NOTE(review): each split is thresholded at its own mean, so the cut-off
# differs slightly between train, validation and test — confirm this is intended.
train_mean, val_mean, test_mean = y_train.mean(), y_val.mean(), y_test.mean()

y_train_binary = (y_train > train_mean).astype(int)
y_val_binary = (y_val > val_mean).astype(int)
y_test_binary = (y_test > test_mean).astype(int)

In [434]:
# Mean of the training target, i.e. the threshold behind `y_train_binary`.
np.mean(y_train)

206691.72639866758

In [435]:
# Side-by-side view of the raw target and its binary label for the last rows.
pd.DataFrame({'value': y_train, 'bin': y_train_binary}).tail()

Unnamed: 0,value,bin
13204,154200.0,0
13205,146400.0,0
13206,215300.0,1
13207,139000.0,0
13208,181300.0,0


In [436]:
# Store the binary labels computed above as an `above_average` column on each
# split (same values as thresholding each target at its own mean again).
df_train['above_average'] = y_train_binary
df_val['above_average'] = y_val_binary
df_test['above_average'] = y_test_binary

In [437]:
# Re-check the dtypes now that the integer `above_average` column has been added.
df_train.dtypes

longitude                   float64
latitude                    float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
ocean_proximity              object
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
above_average                 int64
dtype: object

In [438]:
# Number of distinct categories in `ocean_proximity` within the training split.
df_train.ocean_proximity.nunique()


5

In [439]:
# Both inputs to the mutual-information score must have the same length.
(df_train['above_average'].shape, df_train['ocean_proximity'].shape)

((13209,), (13209,))

In [440]:
# Mutual information between the categorical feature and the binary target,
# shown rounded to two decimals.
mi_score = mutual_info_score(df_train['ocean_proximity'], df_train['above_average'])
round(mi_score, 2)

0.1

Question 4
- Now let's train a `logistic regression`
- Remember that we have one categorical variable `ocean_proximity` in the data. Include it using one-hot encoding.
- Fit the model on the `training dataset`.
    To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
- model = `LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)`
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [441]:
# One dict per row, holding the categorical column plus every numeric feature;
# this is the input format DictVectorizer expects.
q4_columns = ['ocean_proximity'] + numerical
train_dict = df_train[q4_columns].to_dict(orient='records')
val_dict = df_val[q4_columns].to_dict(orient='records')

In [442]:
# Peek at the first training record to confirm the dict layout that is fed
# to DictVectorizer.
train_dict[0]

{'ocean_proximity': 'INLAND',
 'longitude': -120.97,
 'latitude': 38.0,
 'housing_median_age': 27.0,
 'total_rooms': 1683.0,
 'total_bedrooms': 288.0,
 'population': 873.0,
 'households': 258.0,
 'median_income': 4.7069,
 'rooms_per_household': 6.523255813953488,
 'bedrooms_per_room': 0.1711229946524064,
 'population_per_household': 3.383720930232558}

In [443]:

# Learn the feature layout (including the one-hot encoding of
# `ocean_proximity`) from the training records only, then apply it to both splits.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

(X_train.shape, X_val.shape)

((13209, 16), (3303, 16))

In [444]:
# Baseline classifier with the parameters fixed by the assignment so that the
# result is reproducible across scikit-learn versions.
model = LogisticRegression(
    solver="liblinear",
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train_binary)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [445]:
# Predicted class labels for the validation rows; scored in the next cell.
_y_val_pred = model.predict(X_val)

In [446]:
# Validation accuracy of the full-feature model; kept unrounded as the
# reference value for the feature-elimination comparison.
# Arguments follow scikit-learn's (y_true, y_pred) order; accuracy is
# symmetric, so the value is the same either way.
original_accuracy = accuracy_score(y_val_binary, _y_val_pred)
round(original_accuracy, 2)

0.83

Question 5
- Let's find the least useful feature using the `feature elimination technique`.

In [447]:
# Question 5 — feature elimination.
# For every feature, refit the Question 4 model on all the *other* features
# and report how far the validation accuracy moves from `original_accuracy`.
print('Feature importance')
print('-' * 30)

# `above_average` is the target stored next to the features; it must neither
# be used as an input nor be treated as a candidate for elimination.
candidate_features = [col for col in df_train.columns if col != 'above_average']

for feature in candidate_features:
    kept_features = [col for col in candidate_features if col != feature]

    # Fit a dedicated vectorizer per subset so the design matrix genuinely
    # lacks the eliminated feature, instead of relying on the Question 4
    # vectorizer to zero-fill it. Local names keep the notebook-level `dv`,
    # `numerical`, `X_train`, `X_val` and `model` untouched for later cells.
    subset_dv = DictVectorizer(sparse=False)
    X_train_subset = subset_dv.fit_transform(df_train[kept_features].to_dict(orient='records'))
    X_val_subset = subset_dv.transform(df_val[kept_features].to_dict(orient='records'))

    # Same hyper-parameters as the baseline model
    subset_model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    subset_model.fit(X_train_subset, y_train_binary)

    # Accuracy on the validation split without this feature
    subset_accuracy = accuracy_score(y_val_binary, subset_model.predict(X_val_subset))

    print (f'{feature:20s} | Accuracy: {subset_accuracy} | Diff: {np.abs(subset_accuracy - original_accuracy)}')





Feature importance
------------------------------
longitude            | Accuracy: 0.8219800181653043 | Diff: 0.006963366636391077
latitude             | Accuracy: 0.8265213442325159 | Diff: 0.0024220405691794955
housing_median_age   | Accuracy: 0.8240993036633364 | Diff: 0.004844081138358991
total_rooms          | Accuracy: 0.8289433848016954 | Diff: 0.0
total_bedrooms       | Accuracy: 0.8298516500151377 | Diff: 0.0009082652134423386
population           | Accuracy: 0.820466242809567 | Diff: 0.008477141992128345
households           | Accuracy: 0.825915834090221 | Diff: 0.003027550711474425
median_income        | Accuracy: 0.7798970632758099 | Diff: 0.049046321525885506
ocean_proximity      | Accuracy: 0.8159249167423555 | Diff: 0.013018468059339927
rooms_per_household  | Accuracy: 0.8292461398728429 | Diff: 0.0003027550711475202
bedrooms_per_room    | Accuracy: 0.8280351195882532 | Diff: 0.0009082652134422275
population_per_household | Accuracy: 0.8289433848016954 | Diff: 0.0
above_

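Question 6
- Fit a `Ridge` regression on the log-transformed `median_house_value` and compare the validation RMSE across several `alpha` values.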

In [452]:

# Question 6 — Ridge regression on log1p(median_house_value).

# Same split parameters as the classification split, applied to the frame
# that still contains the target column.
q6_train, q6_val = train_test_split(df_full_train, test_size=.2, random_state=42)

# Reassign rather than mutate in place: the results are fresh frames, so
# removing the target below does not write into a slice of `df_full_train`.
q6_train = q6_train.reset_index(drop=True)
q6_val = q6_val.reset_index(drop=True)

# Take the target out of the frames and log-transform it.
# Note: this rebinds the notebook-level `y_train` / `y_val` to log values.
y_train = np.log1p(q6_train.pop('median_house_value').values)
y_val = np.log1p(q6_val.pop('median_house_value').values)

# Encode: every remaining column is a feature, so no separate column list is
# needed. The Question 4 vectorizer is reused to keep the same matrix layout.
train_dict = q6_train.to_dict(orient='records')
val_dict = q6_val.to_dict(orient='records')
X_train = dv.transform(train_dict)
X_val = dv.transform(val_dict)

# Train and evaluate one model per regularisation strength.
# NOTE(review): the features are not scaled; the "sag" solver may stop before
# converging on such data, which would explain near-identical scores across
# alphas — confirm before drawing conclusions.
for alpha in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=alpha, solver="sag", random_state=42)
    print(model)
    model.fit(X_train, y_train)

    # Predict on the validation split
    _y_val_pred = model.predict(X_val)

    # Root mean squared error on the log scale, arguments in (y_true, y_pred) order
    RMSE = np.sqrt(mean_squared_error(y_val, _y_val_pred))
    print (f'Alpha {alpha:<5} | RMSE: {RMSE}')


Ridge(alpha=0, random_state=42, solver='sag')
Alpha 0     | RMSE: 0.5234968906587238
Ridge(alpha=0.01, random_state=42, solver='sag')
Alpha 0.01  | RMSE: 0.5234968906810293
Ridge(alpha=0.1, random_state=42, solver='sag')
Alpha 0.1   | RMSE: 0.5234968908817702
Ridge(alpha=1, random_state=42, solver='sag')
Alpha 1     | RMSE: 0.5234968928724483
Ridge(alpha=10, random_state=42, solver='sag')
Alpha 10    | RMSE: 0.5234969128126647


In [451]:
# Show the validation target; once the Question 6 cell has run, `y_val` holds
# log1p(median_house_value) rather than the raw prices.
y_val

array([11.47937902, 11.23190118, 12.97363364, ..., 12.63003561,
       12.12541016, 12.25200635])