In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Record the NumPy and pandas versions this notebook was run with
for lib in (np, pd):
    print(lib.__version__)
import wget

1.23.3
1.4.4


# Load Data

Dataset source (saved locally as `housing.csv`): https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [2]:
# Prefer the local copy of the dataset; if it is not present, read the
# published CSV directly so a fresh checkout can still run top to bottom.
import os

DATA_PATH = 'housing.csv'
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'

df = pd.read_csv(DATA_PATH if os.path.exists(DATA_PATH) else DATA_URL)

df.shape

(20640, 10)

In [3]:
# Restrict the frame to the columns used in this analysis and replace any
# missing values with 0.
selected_columns = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'ocean_proximity',
]
df = df[selected_columns].fillna(0)
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Column dtypes, then per-column missing-value counts, separated by a blank line
print(df.dtypes, df.isna().sum(), sep='\n\n')

latitude              float64
longitude             float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64


In [5]:
# Derived ratio features, written as: new column -> (numerator, denominator)
ratio_features = {
    'rooms_per_household': ('total_rooms', 'households'),
    'bedrooms_per_room': ('total_bedrooms', 'total_rooms'),
    'population_per_household': ('population', 'households'),
}
for new_col, (numerator, denominator) in ratio_features.items():
    df[new_col] = df[numerator] / df[denominator]

In [6]:
# Question 1 - What is the most frequent observation (mode) for the column ocean_proximity?
# value_counts() lists categories from most to least frequent, so the mode is the first row.
proximity_counts = df['ocean_proximity'].value_counts()
proximity_counts

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

# Splitting the data

In [7]:
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the full data) for validation.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Replace the shuffled original row labels with a fresh 0..n-1 index
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Every row must land in exactly one of the three splits
assert len(df_train) + len(df_val) + len(df_test) == len(df)

# Fraction of the full dataset in each split, rounded to one decimal
for label, part in [('train: ', df_train), ('validation: ', df_val), ('test: ', df_test)]:
    print(label, round(part.shape[0] / df.shape[0], 1))

train:  0.6
validation:  0.2
test:  0.2


In [8]:
# List the columns carried by the training split (original columns plus the ratio features)
df_train.columns

Index(['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'rooms_per_household',
       'bedrooms_per_room', 'population_per_household'],
      dtype='object')

In [9]:
# correlation
feats = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity',
    'rooms_per_household',
    'bedrooms_per_room',
    'population_per_household',
]

# Only float64 columns take part in the correlation matrix
feat_dtypes = df_train[feats].dtypes
float_list = feat_dtypes[feat_dtypes == 'float64'].index
cor = df_train[float_list].corr()

# Question 2: Features with biggest correlation.
# One row per ordered pair of distinct features (so every pair appears twice),
# carrying the absolute correlation between them.
pair_rows = [
    (first, second, abs(cor.loc[second, first]))
    for first in cor.columns
    for second in cor.index
    if first != second
]

pd.DataFrame(pair_rows, columns=['col_1', 'col_2', 'cor']).sort_values(by='cor', ascending=False)

Unnamed: 0,col_1,col_2,cor
64,households,total_bedrooms,0.979399
45,total_bedrooms,households,0.979399
43,total_bedrooms,total_rooms,0.931546
33,total_rooms,total_bedrooms,0.931546
0,latitude,longitude,0.925005
...,...,...,...
108,population_per_household,rooms_per_household,0.001801
75,median_income,population,0.000849
56,population,median_income,0.000849
79,median_income,population_per_household,0.000454


# Create binary target

In [10]:
# Median house value over the full dataset, shown for reference only;
# the binary target in the next cell is thresholded on the mean, not the median.
df['median_house_value'].median()

179700.0

In [11]:
# Threshold: mean house value computed over the whole dataset (all splits together)
y_mean = df['median_house_value'].mean()

splits = (df_train, df_val, df_test)

# above_average is 1 where the house value exceeds the threshold, else 0
for part in splits:
    part['above_average'] = part['median_house_value'].gt(y_mean).astype('int64')

# Share of positive rows in train, validation and test, in that order
for part in splits:
    print(part['above_average'].mean())

0.4076227390180879
0.40818798449612403
0.4001937984496124


In [12]:
# Question 3: Value of mutual information.

from sklearn.metrics import mutual_info_score

# Mutual information between the binary target and ocean_proximity on the training split
mi_ocean = mutual_info_score(df_train['above_average'], df_train['ocean_proximity'])
round(mi_ocean, 2)

0.1

# Logistic regression

## one-hot encoding

In [13]:
y_base = 'median_house_value'
y = 'above_average'

# pop() hands back the column and removes it from the frame in one step, so
# afterwards the split frames hold features only.
# NOTE: this cell mutates the split frames; running it a second time raises
# KeyError because the target columns are already gone.
y_base_train = df_train.pop(y_base).values
y_base_val = df_val.pop(y_base).values
y_base_test = df_test.pop(y_base).values

y_train = df_train.pop(y).values
y_val = df_val.pop(y).values
y_test = df_test.pop(y).values

df_train.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,34.43,-119.67,39.0,1467.0,381.0,1404.0,374.0,2.3681,<1H OCEAN,3.92246,0.259714,3.754011
1,33.74,-118.32,24.0,6097.0,794.0,2248.0,806.0,10.1357,NEAR OCEAN,7.564516,0.130228,2.789082
2,39.13,-121.62,41.0,1317.0,309.0,856.0,337.0,1.6719,INLAND,3.908012,0.234624,2.540059
3,34.24,-118.63,9.0,4759.0,924.0,1884.0,915.0,4.8333,<1H OCEAN,5.201093,0.194158,2.059016
4,37.52,-122.3,38.0,2769.0,387.0,994.0,395.0,5.5902,NEAR OCEAN,7.010127,0.139762,2.516456


In [14]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

# Turn each split into a list of per-row {feature: value} dicts
train_dict, val_dict = (
    part[feats].to_dict(orient='records') for part in (df_train, df_val)
)

# The vectorizer is fitted on the training rows only and then applied to validation
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)


## Model training

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline classifier trained on the vectorized training matrix
logreg_params = dict(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model = LogisticRegression(**logreg_params)
model.fit(X_train, y_train)

In [16]:
# Question 4: Accuracy of the model

# Second predict_proba column: probability of the positive class for each validation row
y_pred = model.predict_proba(X_val)[:, 1]

# Hard predictions at the 0.5 cut-off, scored against the validation labels
val_decision = y_pred >= 0.5
base_model_accuracy = accuracy_score(y_val, val_decision)
base_model_accuracy

0.8355135658914729

# Feature elimination

In [17]:
# Full feature list used by the baseline model; the next cell drops one entry at a time
feats

['latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'ocean_proximity',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [18]:
# Question 5: Feature selection - the smallest difference in accuracy
# Leave-one-feature-out: retrain without each feature in turn and compare the
# validation accuracy with base_model_accuracy. A negative accuracy_diff means
# the model got worse without that feature.
# NOTE: the loop rebinds dv, X_train, X_val, model and y_pred; after this cell
# they describe the last reduced feature set, not the baseline model.
ablation_rows = []
for dropped in feats:
    feats_new = [name for name in feats if name != dropped]

    train_dict = df_train[feats_new].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[feats_new].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]

    # Score once and reuse the value for both columns
    accuracy = accuracy_score(y_val, y_pred >= 0.5)
    ablation_rows.append({
        'feat_exclude': dropped,
        'accuracy': accuracy,
        'accuracy_diff': accuracy - base_model_accuracy,
    })

pd.DataFrame(
    ablation_rows, columns=['feat_exclude', 'accuracy', 'accuracy_diff']
).sort_values(by='accuracy_diff')

Unnamed: 0,feat_exclude,accuracy,accuracy_diff
7,median_income,0.786337,-0.049176
8,ocean_proximity,0.82001,-0.015504
5,population,0.826308,-0.009205
2,housing_median_age,0.829942,-0.005572
0,latitude,0.832607,-0.002907
1,longitude,0.832849,-0.002665
6,households,0.833091,-0.002422
9,rooms_per_household,0.835998,0.000484
3,total_rooms,0.83624,0.000727
10,bedrooms_per_room,0.836483,0.000969


# Linear regression model (Ridge)

In [19]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [20]:
# Move both regression targets to log scale.
# NOTE: the raw arrays are overwritten, so running this cell a second time
# would apply the log twice.
y_base_train, y_base_val = (np.log(target) for target in (y_base_train, y_base_val))

In [21]:
# Question 6: Regression with Scikit-Learn. What's the best alpha?

# Single source of truth for the grid, used for both the loop and the result table
alphas = [0, 0.01, 0.1, 1, 10]

# The design matrices do not depend on alpha, so build them once outside the loop
train_dict = df_train[feats].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[feats].to_dict(orient='records')
X_val = dv.transform(val_dict)

# Validation MSE (on the log-scale target) for each regularisation strength
# NOTE(review): the recorded run shows the same MSE to 6 decimals for every
# alpha; worth confirming that "sag" converges on these unscaled features.
L_mse = []
for a in alphas:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_base_train)
    y_pred = model.predict(X_val)
    L_mse.append(mean_squared_error(y_base_val, y_pred))

t = pd.DataFrame({'alpha': alphas, 'mse': L_mse}).sort_values(by='mse')
t

Unnamed: 0,alpha,mse
0,0.0,0.274646
1,0.01,0.274646
2,0.1,0.274646
3,1.0,0.274646
4,10.0,0.274646
