In [218]:
import pandas as pd
import numpy as np

## Data Preparation

In [219]:
data = pd.read_csv('housing.csv')

In [220]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [221]:
columns = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'ocean_proximity'
]

In [222]:
df = data[columns]
df

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [223]:
df.isna().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [224]:
df.fillna(0, inplace = True)

In [225]:
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms'] 
df['population_per_household'] = df['population'] / df['households']

In [14]:
df.

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.802260
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,5.045455,0.224625,2.560606
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,6.114035,0.215208,3.122807
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,5.205543,0.215173,2.325635
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,5.329513,0.219892,2.123209


## Question 1

What is the most frequent observation (mode) for the column ocean_proximity?

Options:

- NEAR BAY
- <1H OCEAN
- INLAND
- NEAR OCEAN

In [27]:
df.groupby("ocean_proximity")["ocean_proximity"].count().sort_values(ascending=False)

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

Answer: <1H OCEAN

### Split the Data

In [171]:
from sklearn.model_selection import train_test_split

In [172]:
df_fulltrain, df_test = train_test_split(df, test_size = 0.2, random_state = 42)

In [173]:
len(df_fulltrain), len(df_test)

(16512, 4128)

In [174]:
df_train, df_val = train_test_split(df_fulltrain, test_size = 0.25, random_state = 42)

In [175]:
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [176]:
df_fulltrain = df_fulltrain.reset_index(drop = True)
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [177]:
y_fulltrain = df_fulltrain['median_house_value']
y_train = df_train['median_house_value']
y_val = df_val['median_house_value']
y_test = df_test['median_house_value']

In [178]:
del df_fulltrain['median_house_value']
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

## Question 2

Create the correlation matrix for the numerical features of your train dataset.
In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
What are the two features that have the biggest correlation in this dataset?

Options:

- total_bedrooms and households
- total_bedrooms and total_rooms
- population and households
- population_per_household and total_rooms

In [179]:
df_train.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,34.43,-119.67,39.0,1467.0,381.0,1404.0,374.0,2.3681,<1H OCEAN,3.92246,0.259714,3.754011
1,33.74,-118.32,24.0,6097.0,794.0,2248.0,806.0,10.1357,NEAR OCEAN,7.564516,0.130228,2.789082
2,39.13,-121.62,41.0,1317.0,309.0,856.0,337.0,1.6719,INLAND,3.908012,0.234624,2.540059
3,34.24,-118.63,9.0,4759.0,924.0,1884.0,915.0,4.8333,<1H OCEAN,5.201093,0.194158,2.059016
4,37.52,-122.3,38.0,2769.0,387.0,994.0,395.0,5.5902,NEAR OCEAN,7.010127,0.139762,2.516456


In [180]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12384 entries, 0 to 12383
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   latitude                  12384 non-null  float64
 1   longitude                 12384 non-null  float64
 2   housing_median_age        12384 non-null  float64
 3   total_rooms               12384 non-null  float64
 4   total_bedrooms            12384 non-null  float64
 5   population                12384 non-null  float64
 6   households                12384 non-null  float64
 7   median_income             12384 non-null  float64
 8   ocean_proximity           12384 non-null  object 
 9   rooms_per_household       12384 non-null  float64
 10  bedrooms_per_room         12384 non-null  float64
 11  population_per_household  12384 non-null  float64
dtypes: float64(11), object(1)
memory usage: 1.1+ MB


In [181]:
df.columns.values

array(['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'rooms_per_household',
       'bedrooms_per_room', 'population_per_household'], dtype=object)

In [182]:
numerical = [
    'latitude', 'longitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
    'rooms_per_household','bedrooms_per_room', 'population_per_household'
]

categorical = [
    'ocean_proximity'
]

In [183]:
corr_train = df_train.corr()

In [184]:
corr_train.replace(1,'-')

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
latitude,-,-0.925005,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
longitude,-0.925005,-,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
housing_median_age,0.002477,-0.099812,-,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,-0.025914,0.036449,-0.363522,-,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,-0.05973,0.06384,-0.324156,0.931546,-,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,-0.100272,0.09167,-0.292476,0.853219,0.87734,-,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,-0.063529,0.049762,-0.306119,0.921441,0.979399,0.906841,-,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.076805,-0.016426,-0.119591,0.198951,-0.009833,-0.000849,0.011925,-,0.394154,-0.616617,-0.000454
rooms_per_household,0.119118,-0.034814,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,-,-0.500589,0.001801
bedrooms_per_room,-0.124507,0.10232,0.129456,-0.194185,0.078094,0.031592,0.058004,-0.616617,-0.500589,-,-0.002851


Answer: total_bedrooms and households

### Make median_house_value binary

In [185]:
y_train.mean()

206807.7419250646

In [186]:
y_train

0        241400.0
1        500001.0
2         64100.0
3        277200.0
4        417000.0
           ...   
12379    154200.0
12380    146400.0
12381    215300.0
12382    139000.0
12383    181300.0
Name: median_house_value, Length: 12384, dtype: float64

In [187]:
y_train > y_train.mean()

0         True
1         True
2        False
3         True
4         True
         ...  
12379    False
12380    False
12381     True
12382    False
12383    False
Name: median_house_value, Length: 12384, dtype: bool

In [188]:
above_average = y_train > y_train.mean()

In [189]:
above_average = above_average.astype(int)

In [190]:
above_average

0        1
1        1
2        0
3        1
4        1
        ..
12379    0
12380    0
12381    1
12382    0
12383    0
Name: median_house_value, Length: 12384, dtype: int64

## Question 3
Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
What is the value of mutual information?
Round it to 2 decimal digits using round(score, 2)

Options:
- 0.26
- 0
- 0.10
- 0.16

In [191]:
from sklearn.metrics import mutual_info_score

In [192]:
mutual_info_score(above_average, df_train[categorical].squeeze())

0.10138385763624205

Answer: 0.10

## Question 4
Now let's train a logistic regression
Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
Fit the model on the training dataset.
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

Options:
- 0.60
- 0.72
- 0.84
- 0.95

In [252]:
df

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,1,NEAR BAY,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,1,NEAR BAY,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,1,NEAR BAY,8.288136,0.129516,2.802260
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,1,NEAR BAY,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,1,NEAR BAY,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,0,INLAND,5.045455,0.224625,2.560606
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,0,INLAND,6.114035,0.215208,3.122807
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,0,INLAND,5.205543,0.215173,2.325635
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,0,INLAND,5.329513,0.219892,2.123209


In [227]:
from sklearn.model_selection import train_test_split

In [228]:
df_fulltrain, df_test = train_test_split(df, test_size = 0.2, random_state = 42)
df_train, df_val = train_test_split(df_fulltrain, test_size = 0.25, random_state = 42)

df_fulltrain = df_fulltrain.reset_index(drop = True)
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [229]:
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [230]:
y_fulltrain = df_fulltrain['median_house_value']
y_train = df_train['median_house_value']
y_val = df_val['median_house_value']
y_test = df_test['median_house_value']

del df_fulltrain['median_house_value']
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [231]:
from sklearn.feature_extraction import DictVectorizer

In [261]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [262]:
from sklearn.linear_model import LogisticRegression

In [263]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [264]:
model.fit(X_train, y_train)

In [265]:
model.intercept_[0]

-0.083829188440655

In [266]:
model.coef_[0].round(3)

array([ 0.181,  0.004,  0.036,  0.116,  0.087,  1.212,  0.473, -1.715,
        0.019,  0.292,  0.847, -0.002,  0.01 , -0.014,  0.002, -0.   ])

In [267]:
y_pred = model.predict_proba(X_val)[:, 1]

In [268]:
y_pred

array([0.07908992, 0.1749244 , 0.95259667, ..., 0.96131742, 0.85202637,
       0.47117608])

In [269]:
y_hardpred = (y_pred >= 0.5)

In [270]:
orig_score = (y_val == y_hardpred).mean()

In [271]:
orig_score

0.8362403100775194

Answer: 0.84

## Question 5
Let's find the least useful feature using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?
- total_rooms
- total_bedrooms
- population
- households

note: the difference doesn't have to be positive

In [253]:
features = numerical + categorical

In [254]:
features

['latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household',
 'ocean_proximity']

In [259]:
score = []
for i in features:
    dv = DictVectorizer(sparse=False)

    train_dict = df_train.drop(i, axis=1).to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val.drop(i, axis=1).to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    
    y_hardpred = (y_pred >= 0.5)
    
    current_score = (y_val == y_hardpred).mean()
    
    score.append(current_score)

In [260]:
score

[0.8338178294573644,
 0.8311531007751938,
 0.8313953488372093,
 0.8369670542635659,
 0.8357558139534884,
 0.8263081395348837,
 0.8330910852713178,
 0.7863372093023255,
 0.8355135658914729,
 0.8347868217054264,
 0.8367248062015504,
 0.8195251937984496]

In [272]:
diff = score - orig_score

In [273]:
diff

array([-0.00242248, -0.00508721, -0.00484496,  0.00072674, -0.0004845 ,
       -0.00993217, -0.00314922, -0.0499031 , -0.00072674, -0.00145349,
        0.0004845 , -0.01671512])

Answer: total_bedrooms

## Question 6

For this question, we'll see how to use a linear regression model from Scikit-Learn
We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
If there are multiple options, select the smallest alpha.

Options:
- 0
- 0.01
- 0.1
- 1
- 10

In [320]:
new_df = data[columns]

In [329]:
new_df = new_df.fillna(0)

In [330]:
new_df['rooms_per_household'] = new_df['total_rooms'] / new_df['households']
new_df['bedrooms_per_room'] = new_df['total_bedrooms'] / new_df['total_rooms'] 
new_df['population_per_household'] = new_df['population'] / new_df['households']

In [331]:
new_df

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,13.022766,NEAR BAY,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,12.789687,NEAR BAY,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,12.771673,NEAR BAY,8.288136,0.129516,2.802260
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,12.740520,NEAR BAY,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,12.743154,NEAR BAY,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,11.265758,INLAND,5.045455,0.224625,2.560606
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,11.252872,INLAND,6.114035,0.215208,3.122807
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,11.432810,INLAND,5.205543,0.215173,2.325635
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,11.346883,INLAND,5.329513,0.219892,2.123209


In [332]:
new_df['median_house_value'] = np.log1p(new_df['median_house_value'])

In [333]:
df_fulltrain, df_test = train_test_split(new_df, test_size = 0.2, random_state = 42)
df_train, df_val = train_test_split(df_fulltrain, test_size = 0.25, random_state = 42)

df_fulltrain = df_fulltrain.reset_index(drop = True)
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [334]:
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [335]:
y_fulltrain = df_fulltrain['median_house_value']
y_train = df_train['median_house_value']
y_val = df_val['median_house_value']
y_test = df_test['median_house_value']

del df_fulltrain['median_house_value']
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [336]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [337]:
alpha_values = [0, 0.01, 0.1, 1, 10]

rmse = []
for a in alpha_values:
    ## One-hot encoding
    dv = DictVectorizer(sparse=False)

    train_dict = df_train[categorical + numerical].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[categorical + numerical].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    ## Model Training
    model = Ridge(alpha=a, solver="sag", random_state=42)
    
    model.fit(X_train, y_train)
    
    pred = model.predict(X_val)
    
    rmse_score = np.sqrt(mean_squared_error(y_test,pred))
    
    rmse.append(rmse_score)           

In [338]:
rmse

[0.04661912797387294,
 0.04661912797297334,
 0.046619127964429206,
 0.046619127878536774,
 0.04661912702096303]

Answer: 0