In [286]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Dataset

In [287]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

--2022-09-26 18:16:28--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv.8’


2022-09-26 18:16:28 (34.7 MB/s) - ‘housing.csv.8’ saved [1423529/1423529]



In [288]:
df = pd.read_csv('housing.csv')
len(df)

20640

In [289]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [290]:
df_ref = df.copy()

# Features

In [291]:
df = df[['latitude',
          'longitude',
          'housing_median_age',
          'total_rooms',
          'total_bedrooms',
          'population',
          'households',
          'median_income',
          'median_house_value',
          'ocean_proximity',]]

In [292]:
df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [293]:
df = df.fillna(0)

In [294]:
df.isnull().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [295]:
df['rooms_per_household'] = df['total_rooms']/df['households']
df['bedrooms_per_room'] = df['total_bedrooms']/df['total_rooms']
df['population_per_household'] = df['population']/df['households']

In [296]:
df.head().T

Unnamed: 0,0,1,2,3,4
latitude,37.88,37.86,37.85,37.85,37.85
longitude,-122.23,-122.22,-122.24,-122.25,-122.25
housing_median_age,41.0,21.0,52.0,52.0,52.0
total_rooms,880.0,7099.0,1467.0,1274.0,1627.0
total_bedrooms,129.0,1106.0,190.0,235.0,280.0
population,322.0,2401.0,496.0,558.0,565.0
households,126.0,1138.0,177.0,219.0,259.0
median_income,8.3252,8.3014,7.2574,5.6431,3.8462
median_house_value,452600.0,358500.0,352100.0,341300.0,342200.0
ocean_proximity,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY,NEAR BAY


# Question 1

Most frequent value for ocean_proximity.

In [297]:
df.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

Choses <1H OCEAN 

# Question 2

Features with biggest correlation.


In [298]:
print("total_bedrooms and households =",df.total_bedrooms.corr(df.households))

total_bedrooms and households = 0.9665072400420388


In [299]:
print("total_bedrooms and total_rooms =",df.total_bedrooms.corr(df.total_rooms))

total_bedrooms and total_rooms = 0.9201961721166267


In [300]:
print("population and households =",df.population.corr(df.households))

population and households = 0.9072222660959613


In [301]:
print("population_per_household and total_rooms =",df.population_per_household.corr(df.total_rooms))

population_per_household and total_rooms = -0.02458065899398802


Choses total_bedrooms and households

# Make median_house_value binary

In [302]:
Avg = round(df.median_house_value.mean(),2)
Avg

206855.82

In [303]:
df['above_average'] = (df['median_house_value'] > Avg).astype(int)

In [304]:
df.above_average.head()

0    1
1    1
2    1
3    1
4    1
Name: above_average, dtype: int64

In [305]:
del df['median_house_value']

# Split the data

In [306]:
df_full_train , df_test = train_test_split(df , test_size = 0.2 , random_state= 42)
df_train , df_val = train_test_split(df_full_train , test_size = 0.25 , random_state= 42)
len(df_full_train), len(df_val), len(df_test)

(16512, 4128, 4128)

In [307]:
df_train.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
17244,34.43,-119.67,39.0,1467.0,381.0,1404.0,374.0,2.3681,<1H OCEAN,3.92246,0.259714,3.754011,1
8817,33.74,-118.32,24.0,6097.0,794.0,2248.0,806.0,10.1357,NEAR OCEAN,7.564516,0.130228,2.789082,1
19686,39.13,-121.62,41.0,1317.0,309.0,856.0,337.0,1.6719,INLAND,3.908012,0.234624,2.540059,0
3545,34.24,-118.63,9.0,4759.0,924.0,1884.0,915.0,4.8333,<1H OCEAN,5.201093,0.194158,2.059016,1
17019,37.52,-122.3,38.0,2769.0,387.0,994.0,395.0,5.5902,NEAR OCEAN,7.010127,0.139762,2.516456,1


In [308]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [309]:
df_train.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,34.43,-119.67,39.0,1467.0,381.0,1404.0,374.0,2.3681,<1H OCEAN,3.92246,0.259714,3.754011,1
1,33.74,-118.32,24.0,6097.0,794.0,2248.0,806.0,10.1357,NEAR OCEAN,7.564516,0.130228,2.789082,1
2,39.13,-121.62,41.0,1317.0,309.0,856.0,337.0,1.6719,INLAND,3.908012,0.234624,2.540059,0
3,34.24,-118.63,9.0,4759.0,924.0,1884.0,915.0,4.8333,<1H OCEAN,5.201093,0.194158,2.059016,1
4,37.52,-122.3,38.0,2769.0,387.0,994.0,395.0,5.5902,NEAR OCEAN,7.010127,0.139762,2.516456,1


In [310]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

# Question 3

In [311]:
round(mutual_info_score(df_train.ocean_proximity,df_train.above_average),2)

0.1

# Question 4

In [312]:
train_dicts = df_train.drop(columns= 'above_average').to_dict(orient = 'records')
dv = DictVectorizer(sparse= False)
X_train = dv.fit_transform(train_dicts)
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
val_dicts = df_val.to_dict(orient = 'records')
X_val = dv.transform(val_dicts)
y_val_pred = model.predict(X_val)
(y_val_pred == y_val).mean().round(2)

0.84

# Question 5

In [313]:
feature = ['total_rooms','total_bedrooms','population','households']

In [314]:
train_dicts = df_train[feature].to_dict(orient = 'records')
dv = DictVectorizer(sparse= False)
X_train = dv.fit_transform(train_dicts)
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
val_dicts = df_val.to_dict(orient = 'records')
X_val = dv.transform(val_dicts)
y_val_pred = model.predict(X_val)
(y_val_pred == y_val).mean().round(2)

0.71

In [315]:
train_dicts = df_train[feature].drop(columns = 'total_rooms').to_dict(orient = 'records')
dv = DictVectorizer(sparse= False)
X_train = dv.fit_transform(train_dicts)
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
val_dicts = df_val.to_dict(orient = 'records')
X_val = dv.transform(val_dicts)
y_val_pred = model.predict(X_val)
print("Acc =",(y_val_pred == y_val).mean().round(2))
print("diff =",round(0.71-(y_val_pred == y_val).mean(),2))

Acc = 0.63
diff = 0.08


In [316]:
train_dicts = df_train[feature].drop(columns = 'total_bedrooms').to_dict(orient = 'records')
dv = DictVectorizer(sparse= False)
X_train = dv.fit_transform(train_dicts)
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
val_dicts = df_val.to_dict(orient = 'records')
X_val = dv.transform(val_dicts)
y_val_pred = model.predict(X_val)
print("Acc =",(y_val_pred == y_val).mean().round(2))
print("diff =",round(0.71-(y_val_pred == y_val).mean(),2))

Acc = 0.66
diff = 0.05


In [317]:
train_dicts = df_train[feature].drop(columns = 'population').to_dict(orient = 'records')
dv = DictVectorizer(sparse= False)
X_train = dv.fit_transform(train_dicts)
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
val_dicts = df_val.to_dict(orient = 'records')
X_val = dv.transform(val_dicts)
y_val_pred = model.predict(X_val)
print("Acc =",(y_val_pred == y_val).mean().round(2))
print("diff =",round(0.71-(y_val_pred == y_val).mean(),2))

Acc = 0.66
diff = 0.05


In [318]:
train_dicts = df_train[feature].drop(columns = 'households').to_dict(orient = 'records')
dv = DictVectorizer(sparse= False)
X_train = dv.fit_transform(train_dicts)
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)
val_dicts = df_val.to_dict(orient = 'records')
X_val = dv.transform(val_dicts)
y_val_pred = model.predict(X_val)
print("Acc =",(y_val_pred == y_val).mean().round(2))
print("diff =",round(0.71-(y_val_pred == y_val).mean(),2))

Acc = 0.67
diff = 0.04


# Question 6

In [319]:
df = df_ref.copy()

In [320]:
df.median_house_value = np.log(df.median_house_value)

In [321]:
df_full_train , df_test = train_test_split(df , test_size = 0.2 , random_state= 42)
df_train , df_val = train_test_split(df_full_train , test_size = 0.25 , random_state= 42)

In [322]:
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [323]:
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

In [324]:
train_dicts = df_train.drop(columns= 'median_house_value').to_dict(orient = 'records')
dv = DictVectorizer(sparse= False)
X_train = dv.fit_transform(train_dicts)
val_dicts = df_val.to_dict(orient = 'records')
X_val = dv.transform(val_dicts)

In [331]:
alpha = [0, 0.01, 0.1, 1, 10]

In [336]:
for a in alpha:
    
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train,y_train)
    
    y_val_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val,y_val_pred))
    print("alpha =",a,"RMSE =", round(rmse,3))

alpha = 0 RMSE = 0.525
alpha = 0.01 RMSE = 0.525
alpha = 0.1 RMSE = 0.525
alpha = 1 RMSE = 0.525
alpha = 10 RMSE = 0.525
