In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('housing.csv')
df.head()

df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

# Data preparation
* Select only the features from above and fill in the missing values with 0.
* Create a new column `rooms_per_household` by dividing the column `total_rooms` by the column `households` from dataframe. 
* Create a new column `bedrooms_per_room` by dividing the column `total_bedrooms` by the column `total_rooms` from dataframe. 
* Create a new column `population_per_household` by dividing the column `population` by the column `households` from dataframe. 

In [2]:
df.total_bedrooms = df.total_bedrooms.fillna(0)
df['rooms_per_household'] = df.total_rooms / df.households
df['bedrooms_per_room'] = df.total_bedrooms / df.total_rooms
df['population_per_household'] = df.population / df.households
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


# Question 1

What is the most frequent observation (mode) for the column `ocean_proximity`?

Options:
* `NEAR BAY`
* `<1H OCEAN`  <--
* `INLAND`
* `NEAR OCEAN`

In [5]:
df.ocean_proximity.mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

# Split the data

* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to 42.
* Make sure that the target value (`median_house_value`) is not in your dataframe.

In [58]:
from sklearn.model_selection import train_test_split

def get_split_data(data, seed, y_var_name):
    df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=seed)
    df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=seed)

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    y_train = df_train[y_var_name].values
    y_val = df_val[y_var_name].values
    y_test = df_test[y_var_name].values

    del df_train[y_var_name]
    del df_val[y_var_name]
    del df_test[y_var_name]
    
    return df_train, df_val, df_test, y_train, y_val, y_test

# Question 2

* Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your train dataset.
    - In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
* What are the two features that have the biggest correlation in this dataset?

Options:
* `total_bedrooms` and `households`  <--
* `total_bedrooms` and `total_rooms`
* `population` and `households`
* `population_per_household` and `total_rooms

In [25]:
Numerical = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'rooms_per_household',
       'bedrooms_per_room', 'population_per_household']
c = df[Numerical].corr().abs()
s = c.unstack()
so = s.sort_values(kind="quicksort").drop_duplicates()
print(so)


latitude                  population_per_household    0.002366
population_per_household  longitude                   0.002476
total_bedrooms            rooms_per_household         0.002717
bedrooms_per_room         population_per_household    0.003047
population                median_income               0.004834
                                                        ...   
households                total_rooms                 0.918484
total_rooms               total_bedrooms              0.920196
longitude                 latitude                    0.924664
total_bedrooms            households                  0.966507
longitude                 longitude                   1.000000
Length: 67, dtype: float64


# Make `median_house_value` binary

* We need to turn the `median_house_value` variable from numeric into binary.
* Let's create a variable `above_average` which is `1` if the `median_house_value` is above its mean value and `0` otherwise.

# Question 3

* Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
* What is the value of mutual information?
* Round it to 2 decimal digits using `round(score, 2)`

Options:
- 0.26
- 0
- 0.10  <--
- 0.16


In [74]:

data_class = df.copy()
median_house_value_mean = df.median_house_value.mean()
data_class['above_average'] = (df.median_house_value>=median_house_value_mean).astype(int)
del data_class["median_house_value"]

from sklearn.model_selection import train_test_split
df_train_full, df_test = train_test_split(data_class, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


from sklearn.metrics import mutual_info_score
score = mutual_info_score(df_train.ocean_proximity, df_train.above_average)
round(score, 2)





0.1

# Question 4

* Now let's train a logistic regression
* Remember that we have one categorical variable `ocean_proximity` in the data. Include it using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

Options:
- 0.60
- 0.72
- 0.84  <--
- 0.95

In [75]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

df_train, df_val, df_test, y_train, y_val, y_test =  get_split_data(data_class, 42, "above_average")

# DataTrain one-hot encoding
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

# Model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# DataVal one-hot encoding
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Prediction
y_pred = model.predict(X_val)

# Accuracy
accuracy = np.round(accuracy_score(y_val, y_pred),2)
print(accuracy)

0.84


# Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 
* Which of following feature has the smallest difference? 
   * `total_rooms`  <--
   * `total_bedrooms` 
   * `population`
   * `households`

> **note**: the difference doesn't have to be positive



In [79]:
features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'rooms_per_household', 'bedrooms_per_room',
       'population_per_household']

In [83]:
reg = []

for f in features:
    subset = features.copy()
    subset.remove(f)
    
    train_dict = df_train[subset].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    X_train = dv.transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)

    score = accuracy_score(y_val, y_pred)
    
    reg.append([f, accuracy - score, score])

sorted(reg, key=lambda x:x[1])
    

[['total_rooms', 0.003517441860465098, 0.8364825581395349],
 ['rooms_per_household', 0.003517441860465098, 0.8364825581395349],
 ['bedrooms_per_room', 0.003759689922480547, 0.8362403100775194],
 ['population_per_household', 0.0042441860465115555, 0.8357558139534884],
 ['total_bedrooms', 0.004728682170542564, 0.8352713178294574],
 ['latitude', 0.005939922480620141, 0.8340600775193798],
 ['households', 0.006666666666666599, 0.8333333333333334],
 ['longitude', 0.006908914728682158, 0.8330910852713178],
 ['housing_median_age', 0.009089147286821642, 0.8309108527131783],
 ['population', 0.013691860465116279, 0.8263081395348837],
 ['ocean_proximity', 0.01974806201550383, 0.8202519379844961],
 ['median_income', 0.053662790697674434, 0.7863372093023255]]

# Question 6

* For this question, we'll see how to use a linear regression model from Scikit-Learn
* We'll need to use the original column `'median_house_value'`. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model (`model = Ridge(alpha=a, solver="sag", random_state=42)`) on the training data.
* This model has a parameter `alpha`. Let's try the following values: `[0, 0.01, 0.1, 1, 10]`
* Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

If there are multiple options, select the smallest `alpha`.

Options:
- 0  <--
- 0.01
- 0.1
- 1
- 10  

In [86]:
log_df = df.copy()
log_df['median_house_value']=np.log1p(log_df['median_house_value'])
log_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,13.022766,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,12.789687,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,12.771673,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,12.74052,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,12.743154,NEAR BAY,6.281853,0.172096,2.181467


In [88]:
df_train, df_val, df_test, y_train, y_val, y_test =  get_split_data(log_df, 42, "median_house_value")
df_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-119.67,34.43,39.0,1467.0,381.0,1404.0,374.0,2.3681,<1H OCEAN,3.92246,0.259714,3.754011
1,-118.32,33.74,24.0,6097.0,794.0,2248.0,806.0,10.1357,NEAR OCEAN,7.564516,0.130228,2.789082
2,-121.62,39.13,41.0,1317.0,309.0,856.0,337.0,1.6719,INLAND,3.908012,0.234624,2.540059
3,-118.63,34.24,9.0,4759.0,924.0,1884.0,915.0,4.8333,<1H OCEAN,5.201093,0.194158,2.059016
4,-122.3,37.52,38.0,2769.0,387.0,994.0,395.0,5.5902,NEAR OCEAN,7.010127,0.139762,2.516456


In [90]:
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
dv.fit(train_dict)
X_train = dv.transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [98]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    
    print(a, round(score, 9))

0 0.524063571
0.01 0.524063571
0.1 0.524063571
1 0.524063573
10 0.524063589
