In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# Load the housing dataset; the relative path is resolved against the working directory.
housing_csv = 'housing.csv'
df = pd.read_csv(housing_csv)

In [4]:
# Columns kept for the analysis; the list order is significant because it is
# used verbatim to subset the frame.
col = [
    'latitude', 'longitude',
    'housing_median_age',
    'total_rooms', 'total_bedrooms',
    'population', 'households',
    'median_income', 'median_house_value',
    'ocean_proximity',
]

In [5]:
# Keep only the columns listed in `col`, in that order.
df = df.loc[:, col]

In [7]:
# Replace every missing value with 0 (rebinding the name rather than mutating in place).
df = df.fillna(0)

In [8]:
# Derived ratio features: new column -> (numerator column, denominator column).
# The dict order fixes the order in which the columns are appended to the frame.
ratio_features = {
    'rooms_per_household': ('total_rooms', 'households'),
    'bedrooms_per_room': ('total_bedrooms', 'total_rooms'),
    'population_per_household': ('population', 'households'),
}
for ratio_name, (numerator, denominator) in ratio_features.items():
    df[ratio_name] = df[numerator] / df[denominator]

## Question 1

In [9]:
# Frequency of each ocean_proximity category, most common first.
df.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

## Question 2

In [18]:
# Answer: households & total_bedrooms (0.966507 in the table below).
# Restrict to numeric columns explicitly: `ocean_proximity` holds strings, and
# pandas >= 2.0 raises on non-numeric columns instead of silently dropping them.
df.select_dtypes(include='number').corr()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.924664,0.011173,-0.0361,-0.065318,-0.108785,-0.071035,-0.079809,-0.14416,0.106389,-0.104112,0.002366
longitude,-0.924664,1.0,-0.108197,0.044568,0.068082,0.099773,0.05531,-0.015176,-0.045967,-0.02754,0.084836,0.002476
housing_median_age,0.011173,-0.108197,1.0,-0.361262,-0.317063,-0.296244,-0.302916,-0.119034,0.105623,-0.153277,0.125396,0.013191
total_rooms,-0.0361,0.044568,-0.361262,1.0,0.920196,0.857126,0.918484,0.19805,0.134153,0.133798,-0.174583,-0.024581
total_bedrooms,-0.065318,0.068082,-0.317063,0.920196,1.0,0.866266,0.966507,-0.007295,0.049148,0.002717,0.122205,-0.028019
population,-0.108785,0.099773,-0.296244,0.857126,0.866266,1.0,0.907222,0.004834,-0.02465,-0.072213,0.031397,0.069863
households,-0.071035,0.05531,-0.302916,0.918484,0.966507,0.907222,1.0,0.013033,0.065843,-0.080598,0.059818,-0.027309
median_income,-0.079809,-0.015176,-0.119034,0.19805,-0.007295,0.004834,0.013033,1.0,0.688075,0.326895,-0.573836,0.018766
median_house_value,-0.14416,-0.045967,0.105623,0.134153,0.049148,-0.02465,0.065843,0.688075,1.0,0.151948,-0.238759,-0.023737
rooms_per_household,0.106389,-0.02754,-0.153277,0.133798,0.002717,-0.072213,-0.080598,0.326895,0.151948,1.0,-0.387465,-0.004852


In [20]:
# Binary target: 1 where the house value is strictly above the column mean, else 0.
# The mean is computed once and compared as a whole column; the previous per-row
# lambda re-evaluated the mean for every single row.
mean_house_value = df['median_house_value'].mean()
df['above_average'] = (df['median_house_value'] > mean_house_value).astype('int64')

## Question 3

In [24]:
# First split: hold out 20% of the rows as the test set.
# `median_house_value` is the regression target and is removed from the feature frame.
features = df.drop(columns='median_house_value')
target = df['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Second split: move 25% of the current training rows into a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=42,
)

In [27]:
# Mutual information between the binary target and the ocean_proximity category.
# NOTE(review): this scores the full frame `df`, not the training split — confirm that is intended.
mutual_info_score(df['above_average'], df['ocean_proximity'])

0.10206817406620414

## Question 4

In [37]:
# Dense one-hot encoder for ocean_proximity, fitted on the training split only.
# The `sparse` keyword was renamed to `sparse_output` in newer scikit-learn and
# later removed; try the new spelling first and fall back to the old one so the
# cell runs on either side of the rename.
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
enc.fit(X_train.ocean_proximity.values.reshape(-1, 1))

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [46]:
# Append the one-hot encoding of ocean_proximity to the training features.
# Column names keep the ohe1..oheN scheme but N comes from the encoder's actual
# output width, so a split in which a rare category is absent from the fitted
# data no longer fails on a hard-coded list of five names.
train_ohe = enc.transform(X_train.ocean_proximity.values.reshape(-1, 1))
train_ohe_cols = [f'ohe{i}' for i in range(1, train_ohe.shape[1] + 1)]
X_train = pd.concat(
    [
        # Reset to a 0..n-1 index so rows line up with the freshly built one-hot frame.
        X_train.reset_index(drop=True),
        pd.DataFrame(train_ohe, columns=train_ohe_cols),
    ],
    axis=1,
)

In [53]:
# Append the one-hot encoding of ocean_proximity to the validation features,
# using the already-fitted encoder. Column names keep the ohe1..oheN scheme but
# N comes from the encoder's actual output width instead of a hard-coded list
# of five names.
val_ohe = enc.transform(X_val.ocean_proximity.values.reshape(-1, 1))
val_ohe_cols = [f'ohe{i}' for i in range(1, val_ohe.shape[1] + 1)]
X_val = pd.concat(
    [
        # Reset to a 0..n-1 index so rows line up with the freshly built one-hot frame.
        X_val.reset_index(drop=True),
        pd.DataFrame(val_ohe, columns=val_ohe_cols),
    ],
    axis=1,
)

In [48]:
# The raw category column is no longer needed once its one-hot columns exist.
X_train = X_train.drop(columns='ocean_proximity')

In [56]:
# The raw category column is no longer needed once its one-hot columns exist.
X_val = X_val.drop(columns='ocean_proximity')

In [49]:
# Classifier for the above_average target; seed fixed for reproducibility.
model = LogisticRegression(
    solver="liblinear",
    C=1.0,
    max_iter=1000,
    random_state=42,
)

In [51]:
# Fit on every training column except the label itself.
model.fit(X_train.drop(columns='above_average'), X_train['above_average'])

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [57]:
# Predict the label for the validation rows (label column removed from the inputs).
pred = model.predict(X_val.drop(columns='above_average'))

In [60]:
# Validation accuracy of the all-features classifier.
accuracy_score(X_val['above_average'], pred)

0.8357558139534884

## Question 5

In [62]:
# Feature elimination: refit the classifier with one feature removed at a time
# and report how far validation accuracy moves from the all-features baseline.
features_train = X_train.drop(columns='above_average')
features_val = X_val.drop(columns='above_average')

# The baseline is computed here rather than pasted in as a literal, so it cannot
# go stale when the data, the split or the model settings change.
baseline_model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
baseline_model.fit(features_train, X_train.above_average)
baseline_acc = accuracy_score(X_val.above_average, baseline_model.predict(features_val))

# The loop variable is deliberately not named `col`: that name holds the column
# list used to subset `df`, and overwriting it breaks a re-run of that cell.
for feature in ['total_rooms', 'total_bedrooms', 'population', 'households']:
    print(feature)
    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(features_train.drop(columns=feature), X_train.above_average)
    pred = model.predict(features_val.drop(columns=feature))
    acc = accuracy_score(X_val.above_average, pred)
    # Positive difference: accuracy dropped when the feature was removed.
    print(acc, baseline_acc - acc)

total_rooms
0.8372093023255814 -0.0014534883720930258
total_bedrooms
0.8355135658914729 0.00024224806201555982
population
0.8263081395348837 0.009447674418604723
households
0.8333333333333334 0.002422480620155043


## Question 6

In [68]:
# Log-transform the training target with log(1 + x), vectorised over the whole
# Series instead of calling a Python lambda per row.
# Note: the cell rebinds `y_train`, so running it twice applies the transform twice.
y_train = np.log1p(y_train)

In [74]:
# Log-transform the validation target with log(1 + x), vectorised over the whole
# Series instead of calling a Python lambda per row.
# Note: the cell rebinds `y_val`, so running it twice applies the transform twice.
y_val = np.log1p(y_val)

In [76]:
# Ridge regression: try each regularisation strength and keep the one with the
# lowest validation RMSE. The comparison is strict, so on a tie the alpha tried
# first (the smaller one) is kept.
ridge_train = X_train.drop(['above_average'], axis=1)
ridge_val = X_val.drop(['above_average'], axis=1)

# Best RMSE so far. Starting from +inf guarantees the first alpha is always
# accepted; a finite sentinel would silently leave `a_best` at its initial value
# whenever every RMSE happened to exceed it.
m = float('inf')
a_best = 0
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(ridge_train, y_train)
    pred = model.predict(ridge_val)
    # RMSE as sqrt(MSE): the `squared=False` shortcut is gone from newer
    # scikit-learn releases, while this form works on old and new versions alike.
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    print(a, rmse)
    if rmse < m:
        m = rmse
        a_best = a
print(a_best)

0 0.524063570701514
0.01 0.524063570718629
0.1 0.524063570881207
1 0.5240635725155535
10 0.5240635888333284
0
