In [220]:
import pandas as pd
import numpy as np

# Columns kept from the raw CSV, in the order used by the working frame.
selected_columns = [
    "latitude", "longitude", "housing_median_age", "total_rooms", "total_bedrooms",
    "population", "households", "median_income", "median_house_value", "ocean_proximity",
]

# Load the data, keep only the selected columns and replace every missing value with 0.
housing = pd.read_csv('housing.csv')[selected_columns].fillna(0)

# Append the three ratio features; each is a plain element-wise division of two columns.
df = housing.assign(
    rooms_per_household=housing["total_rooms"] / housing["households"],
    bedrooms_per_room=housing["total_bedrooms"] / housing["total_rooms"],
    population_per_household=housing["population"] / housing["households"],
)

# Most frequent value(s) of the categorical column.
df["ocean_proximity"].mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

In [221]:
# Pairwise correlations between the numeric columns (the categorical
# 'ocean_proximity' column is excluded).
corr = df.drop(columns=['ocean_proximity']).corr()

# Select only the cells strictly above the diagonal, so every unordered pair of
# columns appears exactly once and self-correlations are never included.
# This is done structurally rather than by de-duplicating on the correlation
# *value* and filtering `!= 1`, which would drop distinct pairs that happen to
# share a value and relies on exact floating-point equality for the diagonal.
upper_rows, upper_cols = np.triu_indices(len(corr), k=1)
corr_mat = pd.Series(
    corr.to_numpy()[upper_rows, upper_cols],
    index=pd.MultiIndex.from_arrays([corr.index[upper_rows], corr.columns[upper_cols]]),
).sort_values()

# Three lowest and three highest correlations among distinct column pairs.
corr_mat.head(3), corr_mat.tail(3)

(latitude           longitude             -0.924664
 median_income      bedrooms_per_room     -0.573836
 bedrooms_per_room  rooms_per_household   -0.387465
 dtype: float64,
 total_rooms     households        0.918484
                 total_bedrooms    0.920196
 total_bedrooms  households        0.966507
 dtype: float64)

In [222]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

# Binary target stored as floats: 1.0 when 'median_house_value' is strictly above
# the median of that column, 0.0 otherwise.
# NOTE(review): the column is named 'above_average' but the cut-off is the median — confirm this is intended.
price_cutoff = df['median_house_value'].median()
df['above_average'] = (df['median_house_value'] > price_cutoff).astype(float)

# Features are every column except the raw price and the target derived from it.
target_columns = ['median_house_value', 'above_average']
X, y = df.drop(columns=target_columns), df['above_average']

# First split keeps 60 % for training; the remaining 40 % is then halved into
# the test and validation parts (in that order).
X_train, X_rem, y_train, y_rem = train_test_split(X, y, random_state=42, test_size=0.4)
X_test, X_val, y_test, y_val = train_test_split(X_rem, y_rem, random_state=42, test_size=0.5)

# Mutual information between the categorical feature and the target, on training rows only.
score = mutual_info_score(X_train.ocean_proximity, y_train)
round(score, 2)


0.13

In [223]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

def _records(frame):
    """Return the rows of ``frame`` as a list of ``{column: value}`` dicts."""
    return frame.to_dict(orient='records')


# Feature frame (price and the target derived from it removed) and label array.
features = df.drop(columns=['median_house_value', 'above_average'])
y = df['above_average'].values

# Split the raw rows *before* encoding so the encoder never sees validation or
# test rows. Sizes and seeds are unchanged: 60 % train, then the remaining 40 %
# halved into test and validation.
df_train, df_rem, y_train, y_rem = train_test_split(features, y, test_size=0.4, random_state=42)
df_test, df_val, y_test, y_val = train_test_split(df_rem, y_rem, test_size=0.5, random_state=42)

# Fit the vectorizer on the training rows only and keep it (`dv`) so the same
# encoding can be applied to any other rows.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(_records(df_train))
X_val = dv.transform(_records(df_val))
X_test = dv.transform(_records(df_test))

# NOTE(review): `dicts` and `X` (all rows, encoded) are not needed below; they are
# kept only in case later cells reference these names — confirm and drop if unused.
dicts = _records(features)
X = dv.transform(dicts)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [225]:
# Predicted probability of the positive class (second column of predict_proba)
# for each validation row.
y_pred = model.predict_proba(X_val)[:, 1]
greater_than = y_pred > 0.5

df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': greater_than.astype(int),
    # np.asarray discards any pandas index, so labels are paired with predictions
    # strictly by position. Assigning a Series here would align on its index
    # labels and leave NaN for rows that do not match; the previous fillna(0)
    # hid exactly that situation, so it has been removed.
    'actual': np.asarray(y_val),
})
df_pred['correct'] = df_pred.prediction == df_pred.actual

# Accuracy on the validation rows at the 0.5 decision threshold.
round(df_pred.correct.mean(), 2)

0.83