In [1]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import mutual_info_score, accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge

dv = DictVectorizer(sparse=False)
seed = 42

In [3]:
df = pd.read_csv("housing.csv")
df.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY


In [4]:
df = df.fillna(0)

In [5]:
df["rooms_per_household"] = df["total_rooms"]/df["households"]
df["bedrooms_per_room"] = df["total_bedrooms"]/df["total_rooms"]
df["population_per_household"] = df["population"]/df["households"]

In [6]:
df.head().T

Unnamed: 0,0,1,2,3,4
longitude,-122.23,-122.22,-122.24,-122.25,-122.25
latitude,37.88,37.86,37.85,37.85,37.85
housing_median_age,41.0,21.0,52.0,52.0,52.0
total_rooms,880.0,7099.0,1467.0,1274.0,1627.0
total_bedrooms,129.0,1106.0,190.0,235.0,280.0
population,322.0,2401.0,496.0,558.0,565.0
households,126.0,1138.0,177.0,219.0,259.0
median_income,8.3252,8.3014,7.2574,5.6431,3.8462
median_house_value,452600.0,358500.0,352100.0,341300.0,342200.0
ocean_proximity,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY


In [7]:
y = df.pop("median_house_value")

#### Q1. What is the most frequent observation (mode) for the column ocean_proximity?

In [8]:
df["ocean_proximity"].mode()[0]

'<1H OCEAN'

#### <1H OCEAN

In [9]:
df_train, df_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=seed)
df_train, df_valid, y_train, y_valid = train_test_split(df_train, y_train, test_size=0.25, random_state=seed)

df_train.reset_index(drop=True, inplace=True)
df_valid.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

#### Q2.

    Create the correlation matrix for the numerical features of your train dataset.
        In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
    What are the two features that have the biggest correlation in this dataset?


In [10]:
df_train.corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
longitude,1.0,-0.925005,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
latitude,-0.925005,1.0,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
housing_median_age,-0.099812,0.002477,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,0.036449,-0.025914,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,0.06384,-0.05973,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,0.09167,-0.100272,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,0.049762,-0.063529,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.016426,-0.076805,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0,0.394154,-0.616617,-0.000454
rooms_per_household,-0.034814,0.119118,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,1.0,-0.500589,0.001801
bedrooms_per_room,0.10232,-0.124507,0.129456,-0.194185,0.078094,0.031592,0.058004,-0.616617,-0.500589,1.0,-0.002851


In [11]:
l = (df_train.dtypes != object).sum()
df_train.corr().abs().unstack().sort_values(ascending=False)[l:l+1]

households  total_bedrooms    0.979399
dtype: float64

#### total_bedrooms *and* households

#### Q3.

    Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.


In [12]:
above_average = (y_train >= y_train.mean()).astype(int)
round(mutual_info_score(above_average, df_train["ocean_proximity"]), 2)

0.1

#### 0.10

#### Q4.

    Now let's train a logistic regression
    Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
    Fit the model on the training dataset.
    Calculate the accuracy on the validation dataset and round it to 2 decimal digits.


In [13]:
train_dict = df_train.to_dict(orient="records")
X_train = dv.fit_transform(train_dict)

val_dict = df_valid.to_dict(orient="records")
X_valid = dv.fit_transform(val_dict)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=seed)

model.fit(X_train, above_average)

In [14]:
y_valid = (y_valid >= y_train.mean()).astype(int)
y_pred_valid = model.predict(X_valid)

acc = accuracy_score(y_valid, y_pred_valid)
# acc = (y_pred_valid == y_valid).mean()

print(round(acc, 2))

0.84


#### 0.84

#### Q5.

    Let's find the least useful feature using the feature elimination technique.
    Train a model with all these features (using the same parameters as in Q4).
    Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
    For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
    Which of following feature has the smallest difference?

In [15]:
cols = []
diffs = []

print("Feature Eliminated\tDifference from original accuracy")

for col in df_train.columns:    
    # print(col)
    
    df_train_tmp = df_train.drop(col, axis=1)
    # print(df_train_tmp.head(2))
    train_dict_tmp = df_train_tmp.to_dict(orient="records")
    X_train_tmp = dv.fit_transform(train_dict_tmp)
    
    df_valid_tmp = df_valid.drop(col, axis=1)
    val_dict_tmp = df_valid_tmp.to_dict(orient="records")
    X_valid_tmp = dv.fit_transform(val_dict_tmp)
    
    model_tmp = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=seed)
    model_tmp.fit(X_train_tmp, above_average)
    
    y_pred_valid_tmp = model_tmp.predict(X_valid_tmp)
    acc2 = accuracy_score(y_valid, y_pred_valid_tmp)
    
    diff = round(abs(acc - acc2), 5)
    
    print(f"{col}\t\t{diff}")
    
    cols.append(col)
    diffs.append(diff)

Feature Eliminated	Difference from original accuracy
longitude		0.00703
latitude		0.00291
housing_median_age		0.00557
total_rooms		0.00024
total_bedrooms		0.00048
population		0.01017
households		0.00339
median_income		0.05063
ocean_proximity		0.01599
rooms_per_household		0.00145
bedrooms_per_room		0.00097
population_per_household		0.00073


In [16]:
least_val_feature, least_val = min(zip(cols, diffs), key=lambda x: x[1])

print(least_val)
print(least_val_feature)

0.00024
total_rooms


#### total_rooms

#### Q6.

    For this question, we'll see how to use a linear regression model from Scikit-Learn
    We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
    Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
    This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
    Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

If there are multiple options, select the smallest alpha.

In [17]:
alphas = [0, 0.01, 0.1, 1, 10]
y_train_log = np.log1p(y_train)
y_valid_log = np.log1p(y_valid)

rmses = []
print("Alpha\tRMSE")
for a in alphas:
    model_ridge = Ridge(alpha=a, solver="sag", random_state=seed)
        
    model_ridge.fit(X_train, y_train_log)
    y_pred_ridge = model_ridge.predict(X_valid)
    
    rmse = round(mean_squared_error(y_pred_ridge, y_valid_log, squared=False), 3)
    
    rmses.append(rmse)
    print(a, "\t", rmse)

NameError: name 'np' is not defined

In [None]:
idx_min = [i for i, value in enumerate(rmses) if value==min(rmses)]
min([alphas[i] for i in idx_min])

#### 0