In [1]:
import urllib.request
import pandas as pd
import numpy as np

# 1. Reading data

In [2]:
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'
filename = 'data.csv'
urllib.request.urlretrieve(url, filename)

('data.csv', <http.client.HTTPMessage at 0x26c3c5f82e0>)

In [3]:
df = pd.read_csv('data.csv', header=0, sep=',')

In [4]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# 2. Data preparation

## 2.1. Fill missed values with 0

In [5]:
series_s = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'ocean_proximity'
    ]

In [6]:
df = df[series_s]

In [7]:
df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [8]:
df = df.fillna(0)

In [9]:
df.isnull().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

## 2.2. Create additional columns: rooms_per_household, bedrooms_per_room, population_per_household

In [10]:
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population_per_household'] = df['population'] / df['households']
df[['rooms_per_household', 'bedrooms_per_room', 'population_per_household']].head()

Unnamed: 0,rooms_per_household,bedrooms_per_room,population_per_household
0,6.984127,0.146591,2.555556
1,6.238137,0.155797,2.109842
2,8.288136,0.129516,2.80226
3,5.817352,0.184458,2.547945
4,6.281853,0.172096,2.181467


### Q1. What is the most frequent observation (mode) for the column ocean_proximity?

In [11]:
df.ocean_proximity.mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

# 3. Setting up the validation framework

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [14]:
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [15]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [16]:
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

# 4. Exploratory data analysis

## 4.1. Explore the correlation of numerical feature variables

### Q2. Create the correlation matrix for the numerical features of your train dataset. What are the two features that have the biggest correlation in this dataset?

In [17]:
df_train.dtypes

latitude                    float64
longitude                   float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
ocean_proximity              object
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
dtype: object

In [18]:
numerical = df_train.dtypes.index.tolist()
numerical.remove('ocean_proximity')
print(numerical)

['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'rooms_per_household', 'bedrooms_per_room', 'population_per_household']


In [19]:
df_train_cor = df_train[numerical].corr()
df_train_cor

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.925005,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
longitude,-0.925005,1.0,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
housing_median_age,0.002477,-0.099812,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,-0.025914,0.036449,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,-0.05973,0.06384,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,-0.100272,0.09167,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,-0.063529,0.049762,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.076805,-0.016426,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0,0.394154,-0.616617,-0.000454
rooms_per_household,0.119118,-0.034814,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,1.0,-0.500589,0.001801
bedrooms_per_room,-0.124507,0.10232,0.129456,-0.194185,0.078094,0.031592,0.058004,-0.616617,-0.500589,1.0,-0.002851


In [20]:
sorted_df_train_cor = df_train_cor.unstack().sort_values()
sorted_df_train_cor[:5]

latitude           longitude             -0.925005
longitude          latitude              -0.925005
median_income      bedrooms_per_room     -0.616617
bedrooms_per_room  median_income         -0.616617
                   rooms_per_household   -0.500589
dtype: float64

In [21]:
sorted_df_train_cor[105:115]

total_rooms          households             0.921441
total_bedrooms       total_rooms            0.931546
total_rooms          total_bedrooms         0.931546
households           total_bedrooms         0.979399
total_bedrooms       households             0.979399
latitude             latitude               1.000000
population           population             1.000000
rooms_per_household  rooms_per_household    1.000000
median_income        median_income          1.000000
households           households             1.000000
dtype: float64

## 4.2. Make median_house_value binary 

In [22]:
average_y_train = y_train.mean()
y_train_convert = (y_train > average_y_train).astype(int)
average_y_val = y_val.mean()
y_val_convert = (y_val > average_y_val).astype(int)
average_y_test = y_test.mean()
y_test_convert = (y_test > average_y_test).astype(int)

### Q3. Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only. What is the value of mutual information? Round it to 2 decimal digits 

In [23]:
from sklearn.metrics import mutual_info_score
score = mutual_info_score(df_train['ocean_proximity'],y_train_convert)
score

0.10138385763624205

In [24]:
round(score, 2)

0.1

# 5. Logistic regression

## 5.1. One-hot encoding

In [25]:
from sklearn.feature_extraction import DictVectorizer

In [26]:
dv = DictVectorizer(sparse=False)
train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

In [27]:
# check series names
dv.get_feature_names_out()

array(['bedrooms_per_room', 'households', 'housing_median_age',
       'latitude', 'longitude', 'median_income',
       'ocean_proximity=<1H OCEAN', 'ocean_proximity=INLAND',
       'ocean_proximity=ISLAND', 'ocean_proximity=NEAR BAY',
       'ocean_proximity=NEAR OCEAN', 'population',
       'population_per_household', 'rooms_per_household',
       'total_bedrooms', 'total_rooms'], dtype=object)

## 5.2. Train logistic regression

In [28]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train_convert)

### Q4. Calculate the accuracy on validation dataset

In [29]:
val_dicts = df_val.to_dict(orient='records')
X_val = dv.fit_transform(val_dicts)

In [30]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.07855422, 0.16619336, 0.95533326, ..., 0.96350977, 0.85584745,
       0.45446759])

In [31]:
above_mean_price = (y_pred >= 0.5)
(y_val_convert == above_mean_price).mean()

0.8362403100775194

In [32]:
full_accuracy = round(_, 2)

### Q5. Let's find the least useful feature using the feature elimination technique. Train a model with all these features (using the same parameters as in Q4). Now exclude each feature from this set and train a model without it. Record the accuracy for each model. For each feature, calculate the difference between the original accuracy and the accuracy without the feature. Which of following feature has the smallest difference: total_rooms, total_bedrooms, population, households?

In [33]:
feature_elimination = dict()

In [34]:
# get list of feature names
# if we need all features, then  feature_list = list(df_train.columns)
feature_list = ['total_rooms', 'total_bedrooms', 'population', 'households']
for feature in feature_list:
    df_train_elim = df_train.copy()
    del df_train_elim[feature]
    train_dicts_elim = df_train_elim.to_dict(orient='records')
    X_train_elim = dv.fit_transform(train_dicts_elim)
    model_elim = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model_elim.fit(X_train_elim, y_train_convert)
    df_val_elim = df_val.copy()
    del df_val_elim[feature]
    val_dicts_elim = df_val_elim.to_dict(orient='records')
    X_val_elim = dv.fit_transform(val_dicts_elim)
    y_pred_elim = model_elim.predict_proba(X_val_elim)[:, 1]
    above_mean_price_elim = (y_pred_elim >= 0.5)
    accuracy = (y_val_convert == above_mean_price_elim).mean()
    feature_elimination[feature] = full_accuracy - accuracy

In [35]:
feature_elimination

{'total_rooms': 0.0030329457364340895,
 'total_bedrooms': 0.0006104651162790464,
 'population': 0.012480620155038702,
 'households': 0.00618217054263559}

In [36]:
min(feature_elimination, key=feature_elimination.get)

'total_bedrooms'

# Linear regression with regularization

### Q6. We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column. Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data. This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]. Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

In [37]:
import numpy as np

In [38]:
y_train_lin = np.log1p(y_train)
y_val_lin = np.log1p(y_val)

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

alpha_rmse = dict()
a_list = [0, 0.01, 0.1, 1, 10]
for a in a_list:
    model_lin = Ridge(alpha=a, solver="sag", random_state=42)
    model_lin.fit(X_train, y_train_lin)
    y_pred_lin = model_lin.predict(X_val)
    RMSE = np.sqrt(mean_squared_error(y_pred_lin, y_val_lin))
    alpha_rmse[a] = round(RMSE, 3)

In [39]:
alpha_rmse

{0: 0.524, 0.01: 0.524, 0.1: 0.524, 1: 0.524, 10: 0.524}