In [85]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

In [86]:
# Columns kept for the analysis, in the order they are selected from the raw frame.
columns = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
    "ocean_proximity",
]

In [87]:
# Remote CSV holding the housing dataset used throughout the notebook.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv"

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Data preparation

In [88]:
# Take an explicit copy of the column subset: the selection is modified in later
# cells, and writing into a bare `df[columns]` selection triggers pandas'
# SettingWithCopyWarning because it is ambiguous whether the raw frame is affected.
df_selected = df[columns].copy()

In [89]:
def fill_with_zero(df, column):
    """Return a copy of ``df`` with missing values in ``column`` replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; the work is done on a copy so the
        caller's frame (which may itself be a selection of another frame) is
        left untouched.
    column : label or list of labels
        Column name, or list of column names, whose missing values are filled.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` where the requested
        column(s) contain 0 in place of missing values.
    """
    filled = df.copy()
    filled[column] = filled[column].fillna(0)
    return filled

In [90]:
# Fill missing values with 0 in the selected columns, then derive the ratio
# features. `assign` adds them to a new frame rather than item-assigning into the
# frame returned by `fill_with_zero`, which avoids chained-assignment warnings and
# keeps `df_selected` free of the derived columns.
df = fill_with_zero(df_selected, columns).assign(
    rooms_per_household=lambda frame: frame["total_rooms"] / frame["households"],
    bedrooms_per_room=lambda frame: frame["total_bedrooms"] / frame["total_rooms"],
    population_per_household=lambda frame: frame["population"] / frame["households"],
)

# Question 1

In [91]:
# Count the rows in each ocean_proximity category (value_counts lists the most frequent first).
df["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

# Question 2


In [92]:
# Correlation matrix over the int/float columns only (non-numeric columns are excluded).
corr_matrix = df.select_dtypes(include=['int','float']).corr()

# Flatten the matrix into one row per (column, column) pair and rank the pairs by
# absolute correlation, strongest first. Self-pairs and both orderings of each
# pair are still present in the result.
corr = (
    corr_matrix
    .abs()
    .unstack()
    .sort_values(kind="quicksort", ascending=False)
    .reset_index()
)

In [93]:
# Hide the self-correlations (a column paired with itself) by comparing the two
# pair labels, instead of slicing past a hard-coded number of leading rows that
# silently breaks when the number of numeric columns changes.
corr[corr["level_0"] != corr["level_1"]]

Unnamed: 0,level_0,level_1,0
12,total_bedrooms,households,0.966507
13,households,total_bedrooms,0.966507
14,longitude,latitude,0.924664
15,latitude,longitude,0.924664
16,total_bedrooms,total_rooms,0.920196
...,...,...,...
139,total_bedrooms,rooms_per_household,0.002717
140,longitude,population_per_household,0.002476
141,population_per_household,longitude,0.002476
142,population_per_household,latitude,0.002366


### Make median_house_value binary


In [94]:
# Binary target: 1 where a row's median_house_value is above the column mean, else 0.
mean_house_value = df["median_house_value"].mean()
is_above_mean = df["median_house_value"] > mean_house_value
df["above_average"] = is_above_mean.astype(int)

In [95]:
# Show the binary target column (1 where median_house_value exceeds its mean, else 0).
df["above_average"]

0        1
1        1
2        1
3        1
4        1
        ..
20635    0
20636    0
20637    0
20638    0
20639    0
Name: above_average, Length: 20640, dtype: int32

### Split the data


In [96]:
# Drop the raw house value now that `above_average` has been derived from it
# (presumably so the target cannot leak into the features; confirm downstream).
df = df.drop("median_house_value", axis="columns")

# Two-stage split with a fixed seed: hold out 40% of the rows first, then halve
# that hold-out into validation and test sets (60% / 20% / 20% overall).
df_train, df_holdout = train_test_split(df, test_size=0.4, random_state=42)
df_valid, df_test = train_test_split(df_holdout, test_size=0.5, random_state=42)

# Question 3