In [1]:
import pandas as pd
import numpy as np

In [3]:
house_df = pd.read_csv("housing.csv")
house_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [7]:
useful_cols = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity']

In [8]:
house_df = house_df[useful_cols]
house_df.shape

(20640, 10)

In [14]:
house_df.fillna(0, inplace=True)

In [15]:
house_df.isnull().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [16]:
house_df['rooms_per_household'] = house_df.total_rooms / house_df.households
house_df['bedrooms_per_room'] = house_df.total_bedrooms / house_df.total_rooms
house_df['population_per_household'] = house_df.population / house_df.households

In [17]:
house_df.shape

(20640, 13)

## Question 1

In [18]:
house_df.ocean_proximity.mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

## split the data

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
y = house_df['median_house_value']
X = house_df.drop('median_house_value', axis=1)

In [22]:
X.shape

(20640, 12)

In [25]:
df_full, df_test, y_full, y_test = train_test_split(X,y,test_size=0.2,random_state=42 )
df_train, df_val, y_train, y_val = train_test_split(df_full, y_full, test_size=0.25, random_state=42)

## Question 2

In [30]:
numerical_cols = list(df_train.dtypes[df_train.dtypes != "object"].index)
numerical_cols 

['latitude',
 'longitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [31]:
categorical_cols = [ col for col in df_train.columns if col not in numerical_cols]
categorical_cols

['ocean_proximity']

In [32]:
numerical_features = df_train[numerical_cols]
numerical_features

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
17244,34.43,-119.67,39.0,1467.0,381.0,1404.0,374.0,2.3681,3.922460,0.259714,3.754011
8817,33.74,-118.32,24.0,6097.0,794.0,2248.0,806.0,10.1357,7.564516,0.130228,2.789082
19686,39.13,-121.62,41.0,1317.0,309.0,856.0,337.0,1.6719,3.908012,0.234624,2.540059
3545,34.24,-118.63,9.0,4759.0,924.0,1884.0,915.0,4.8333,5.201093,0.194158,2.059016
17019,37.52,-122.30,38.0,2769.0,387.0,994.0,395.0,5.5902,7.010127,0.139762,2.516456
...,...,...,...,...,...,...,...,...,...,...,...
5606,33.79,-118.29,16.0,1867.0,571.0,951.0,498.0,3.3427,3.748996,0.305838,1.909639
16339,38.04,-121.34,16.0,3295.0,565.0,2279.0,576.0,3.6083,5.720486,0.171472,3.956597
14965,32.74,-116.99,18.0,3341.0,611.0,1952.0,602.0,3.9844,5.549834,0.182879,3.242525
11117,33.84,-117.87,16.0,1545.0,354.0,730.0,350.0,4.5112,4.414286,0.229126,2.085714


In [33]:
matrix = numerical_features.corr()
matrix

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.925005,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
longitude,-0.925005,1.0,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
housing_median_age,0.002477,-0.099812,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,-0.025914,0.036449,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,-0.05973,0.06384,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,-0.100272,0.09167,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,-0.063529,0.049762,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.076805,-0.016426,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0,0.394154,-0.616617,-0.000454
rooms_per_household,0.119118,-0.034814,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,1.0,-0.500589,0.001801
bedrooms_per_room,-0.124507,0.10232,0.129456,-0.194185,0.078094,0.031592,0.058004,-0.616617,-0.500589,1.0,-0.002851


Total_bedrooms and household have the biggest correlation. 

## make median house value binary

In [36]:
y_train.mean()

206807.7419250646

In [35]:
y_train

17244    241400.0
8817     500001.0
19686     64100.0
3545     277200.0
17019    417000.0
           ...   
5606     154200.0
16339    146400.0
14965    215300.0
11117    139000.0
8472     181300.0
Name: median_house_value, Length: 12384, dtype: float64

In [38]:
above_average = (y_train > y_train.mean()).astype(int)
above_average

17244    1
8817     1
19686    0
3545     1
17019    1
        ..
5606     0
16339    0
14965    1
11117    0
8472     0
Name: median_house_value, Length: 12384, dtype: int32

## Question 3

In [39]:
from sklearn.metrics import mutual_info_score

In [41]:
round(mutual_info_score(df_train.ocean_proximity, above_average), 2)

0.1

## Question 4

In [42]:
from sklearn.feature_extraction import DictVectorizer

In [44]:
df_train.shape

(12384, 12)

In [45]:
df_val.shape

(4128, 12)

In [46]:
dv = DictVectorizer(sparse=False)

In [47]:
train_dict = df_train.to_dict(orient="records")
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient="records")
X_val = dv.transform(val_dict)

In [48]:
y_train = (y_train > y_train.mean()).astype(int)
y_val = (y_val > y_val.mean()).astype(int)

In [49]:
from sklearn.linear_model import LogisticRegression

In [50]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [51]:
model.fit(X_train, y_train)

In [52]:
y_pred = model.predict(X_val)

In [53]:
from sklearn.metrics import accuracy_score

In [55]:
original_score = accuracy_score(y_val,y_pred)
round(original_score,2)

0.84

## Question 5

In [56]:
exclude_feats = ["total_rooms","total_bedrooms","population","households"]

In [58]:
results = []

for feature in exclude_feats: 
    dv = DictVectorizer(sparse=False)

    train_frame = df_train.drop(feature, axis=1)
    train_dict = train_frame.to_dict(orient="records")
    X_train = dv.fit_transform(train_dict)

    val_frame = df_val.drop(feature, axis=1)
    val_dict = val_frame.to_dict(orient="records")
    X_val = dv.transform(val_dict)
    
    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_val)
    feat_score = accuracy_score(y_val, y_predict)
    difference = abs(original_score - feat_score)
    results.append(difference)

result = pd.DataFrame(results, columns=['difference'], index=exclude_feats)

In [61]:
result.sort_values(by="difference")

Unnamed: 0,difference
total_bedrooms,0.0
total_rooms,0.00218
households,0.003634
population,0.008963


## Question 6

In [62]:
y = house_df['median_house_value']
X = house_df.drop('median_house_value', axis=1)

In [63]:
y_transformed = np.log1p(y)

In [64]:
df_full, df_test, y_full, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42 )
df_train, df_val, y_train, y_val = train_test_split(df_full, y_full, test_size=0.25, random_state=42)

In [69]:
alpha_values = [0, 0.01, 0.1, 1, 10]

In [70]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
results = []

for value in alpha_values:
    dv = DictVectorizer(sparse=False)

    train_dict = df_train.to_dict(orient="records")
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val.to_dict(orient="records")
    X_val = dv.transform(val_dict)

    model = Ridge(alpha=value, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    y_prediction = model.predict(X_val)
    rmse_val = np.sqrt(mean_squared_error(y_val, y_prediction))
    results.append(round(rmse_val, 3))

result = pd.DataFrame(results, columns=["RMSE"], index=alpha_values)    
    

In [71]:
result

Unnamed: 0,RMSE
0.0,0.524
0.01,0.524
0.1,0.524
1.0,0.524
10.0,0.524
