In [66]:
import pandas as pd
import numpy as np

# Read the raw file, keep the ten columns used below (in this order) and replace
# every missing value with 0.
df = (
    pd.read_csv('housing.csv')
    .loc[:, [
        "latitude", "longitude", "housing_median_age", "total_rooms",
        "total_bedrooms", "population", "households", "median_income",
        "median_house_value", "ocean_proximity",
    ]]
    .fillna(0)
)
# Ratio features built from the raw counts; appended in the order listed.
df = df.assign(
    rooms_per_household=df["total_rooms"] / df["households"],
    bedrooms_per_room=df["total_bedrooms"] / df["total_rooms"],
    population_per_household=df["population"] / df["households"],
)
# Most frequent value of `ocean_proximity`.
df["ocean_proximity"].mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

In [67]:
# Pairwise correlations between the numeric columns (`ocean_proximity` is excluded).
numeric_corr = df.drop(columns=['ocean_proximity']).corr()
# Keep only the cells strictly below the diagonal, so every unordered pair of columns
# is listed exactly once and self-correlations are excluded by position.
# Deduplicating on the coefficient value and filtering with `!= 1` would also throw
# away distinct pairs that happen to share a value (or are perfectly correlated),
# and depends on exact floating-point equality.
below_diagonal = np.tril(np.ones(numeric_corr.shape, dtype=bool), k=-1)
# `dropna` removes the masked cells (and any undefined coefficients) before ranking.
corr_mat = numeric_corr.where(below_diagonal).unstack().dropna().sort_values()
# Three most negative and three most positive coefficients.
corr_mat.head(3), corr_mat.tail(3)

(latitude           longitude             -0.924664
 median_income      bedrooms_per_room     -0.573836
 bedrooms_per_room  rooms_per_household   -0.387465
 dtype: float64,
 total_rooms     households        0.918484
                 total_bedrooms    0.920196
 total_bedrooms  households        0.966507
 dtype: float64)

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

# Binary label: 1 where the price is strictly above the median of the column, 0 otherwise.
# NOTE(review): the column is named `above_average` but the threshold is the median,
# not the mean — confirm which one is intended.
median_price = df['median_house_value'].median()
df.loc[df['median_house_value'] <= median_price, 'above_average'] = 0
df.loc[df['median_house_value'] > median_price, 'above_average'] = 1

# Features are everything except the price and the label derived from it.
X = df.drop(['median_house_value', 'above_average'], axis=1)
y = df['above_average'].values

# 20% held out for test, then 25% of the remainder for validation (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

# Mutual information between the categorical feature and the label, training rows only.
score = mutual_info_score(X_train['ocean_proximity'], y_train)
round(score, 2)


0.12

In [69]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Baseline classifier: every column except the price and the label derived from it.
y = df['above_average'].values
X = df.drop(['median_house_value', 'above_average'], axis=1)

# One-hot encode the string column and pass numeric columns through, as a dense matrix.
# NOTE(review): the vectorizer is fitted on all rows before the split, so its feature
# vocabulary also reflects the validation and test rows.
dicts = X.to_dict(orient='records')
X = DictVectorizer(sparse=False).fit_transform(dicts)

# 20% held out for test, then 25% of the remainder for validation (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
)
# Last expression of the cell: fits in place and displays the fitted estimator.
model.fit(X_train, y_train)

In [70]:
# Validation accuracy of the baseline model at a 0.5 probability cut-off.
y_pred = model.predict_proba(X_val)[:, 1]
greater_than = y_pred > 0.5
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': greater_than.astype(int),
    'actual': y_val,
}).fillna(0)
df_pred['correct'] = df_pred.prediction == df_pred.actual
# Reference accuracy that the feature-elimination cells subtract from their own score.
orig_acc = df_pred.correct.mean()
round(orig_acc, 2)

0.83

In [71]:
# Feature elimination: refit the classifier without `latitude` and report how the
# validation accuracy moves relative to `orig_acc`.
y = df['above_average'].values
X = df.drop(['median_house_value', 'above_average', 'latitude'], axis=1)
dicts = X.to_dict(orient='records')
X = DictVectorizer(sparse=False).fit_transform(dicts)

# Same seeds and sizes as the baseline split (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Score the validation rows at a 0.5 probability cut-off.
y_pred = model.predict_proba(X_val)[:, 1]
greater_than = y_pred > 0.5
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': greater_than.astype(int),
    'actual': y_val,
}).fillna(0)
df_pred['correct'] = df_pred.prediction == df_pred.actual
# Positive: accuracy went up without the feature; negative: it went down.
df_pred.correct.mean() - orig_acc


0.0004844961240310086

In [72]:
# Feature elimination: refit the classifier without `longitude` and report how the
# validation accuracy moves relative to `orig_acc`.
y = df['above_average'].values
X = df.drop(['median_house_value', 'above_average', 'longitude'], axis=1)
dicts = X.to_dict(orient='records')
X = DictVectorizer(sparse=False).fit_transform(dicts)

# Same seeds and sizes as the baseline split (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Score the validation rows at a 0.5 probability cut-off.
y_pred = model.predict_proba(X_val)[:, 1]
greater_than = y_pred > 0.5
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': greater_than.astype(int),
    'actual': y_val,
}).fillna(0)
df_pred['correct'] = df_pred.prediction == df_pred.actual
# Positive: accuracy went up without the feature; negative: it went down.
df_pred.correct.mean() - orig_acc


0.0

In [73]:
# Feature elimination: refit the classifier without `housing_median_age` and report
# how the validation accuracy moves relative to `orig_acc`.
y = df['above_average'].values
X = df.drop(['median_house_value', 'above_average', 'housing_median_age'], axis=1)
dicts = X.to_dict(orient='records')
X = DictVectorizer(sparse=False).fit_transform(dicts)

# Same seeds and sizes as the baseline split (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Score the validation rows at a 0.5 probability cut-off.
y_pred = model.predict_proba(X_val)[:, 1]
greater_than = y_pred > 0.5
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': greater_than.astype(int),
    'actual': y_val,
}).fillna(0)
df_pred['correct'] = df_pred.prediction == df_pred.actual
# Positive: accuracy went up without the feature; negative: it went down.
df_pred.correct.mean() - orig_acc


-0.002664728682170603

In [74]:
# Feature elimination: refit the classifier without `total_rooms` and keep the change
# in validation accuracy (relative to `orig_acc`) in a variable named after the feature.
y = df['above_average'].values
X = df.drop(['median_house_value', 'above_average', 'total_rooms'], axis=1)
dicts = X.to_dict(orient='records')
X = DictVectorizer(sparse=False).fit_transform(dicts)

# Same seeds and sizes as the baseline split (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Score the validation rows at a 0.5 probability cut-off.
y_pred = model.predict_proba(X_val)[:, 1]
greater_than = y_pred > 0.5
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': greater_than.astype(int),
    'actual': y_val,
}).fillna(0)
df_pred['correct'] = df_pred.prediction == df_pred.actual
# Not displayed here; the value is read back in a later cell.
total_rooms = df_pred.correct.mean() - orig_acc

In [75]:
# Feature elimination: refit the classifier without `total_bedrooms` and keep the change
# in validation accuracy (relative to `orig_acc`) in a variable named after the feature.
y = df['above_average'].values
X = df.drop(['median_house_value', 'above_average', 'total_bedrooms'], axis=1)
dicts = X.to_dict(orient='records')
X = DictVectorizer(sparse=False).fit_transform(dicts)

# Same seeds and sizes as the baseline split (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Score the validation rows at a 0.5 probability cut-off.
y_pred = model.predict_proba(X_val)[:, 1]
greater_than = y_pred > 0.5
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': greater_than.astype(int),
    'actual': y_val,
}).fillna(0)
df_pred['correct'] = df_pred.prediction == df_pred.actual
# Not displayed here; the value is read back in a later cell.
total_bedrooms = df_pred.correct.mean() - orig_acc


In [76]:
# Feature elimination: refit the classifier without `population` and keep the change
# in validation accuracy (relative to `orig_acc`) in a variable named after the feature.
y = df['above_average'].values
X = df.drop(['median_house_value', 'above_average', 'population'], axis=1)
dicts = X.to_dict(orient='records')
X = DictVectorizer(sparse=False).fit_transform(dicts)

# Same seeds and sizes as the baseline split (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Score the validation rows at a 0.5 probability cut-off.
y_pred = model.predict_proba(X_val)[:, 1]
greater_than = y_pred > 0.5
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': greater_than.astype(int),
    'actual': y_val,
}).fillna(0)
df_pred['correct'] = df_pred.prediction == df_pred.actual
# Not displayed here; the value is read back in a later cell.
population = df_pred.correct.mean() - orig_acc


In [77]:
# Feature elimination: refit the classifier without `households` and keep the change
# in validation accuracy (relative to `orig_acc`) in a variable named after the feature.
y = df['above_average'].values
X = df.drop(['median_house_value', 'above_average', 'households'], axis=1)
dicts = X.to_dict(orient='records')
X = DictVectorizer(sparse=False).fit_transform(dicts)

# Same seeds and sizes as the baseline split (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Score the validation rows at a 0.5 probability cut-off.
y_pred = model.predict_proba(X_val)[:, 1]
greater_than = y_pred > 0.5
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': greater_than.astype(int),
    'actual': y_val,
}).fillna(0)
df_pred['correct'] = df_pred.prediction == df_pred.actual
# Not displayed here; the value is read back in a later cell.
households = df_pred.correct.mean() - orig_acc


In [78]:
# Feature elimination: refit the classifier without `median_income` and report how the
# validation accuracy moves relative to `orig_acc`.
y = df['above_average'].values
X = df.drop(['median_house_value', 'above_average', 'median_income'], axis=1)
dicts = X.to_dict(orient='records')
X = DictVectorizer(sparse=False).fit_transform(dicts)

# Same seeds and sizes as the baseline split (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Score the validation rows at a 0.5 probability cut-off.
y_pred = model.predict_proba(X_val)[:, 1]
greater_than = y_pred > 0.5
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': greater_than.astype(int),
    'actual': y_val,
}).fillna(0)
df_pred['correct'] = df_pred.prediction == df_pred.actual
# Positive: accuracy went up without the feature; negative: it went down.
df_pred.correct.mean() - orig_acc


-0.05813953488372092

In [79]:
# Baseline re-check, not a feature elimination. `median_house_value` is the quantity
# the label is derived from and is excluded from the features in every cell, so
# listing it a second time in the drop list removed nothing extra. The duplicate
# label is dropped here; the cell refits the baseline feature set, and the difference
# to `orig_acc` is expected to be 0.0 when the run is reproducible.
y = df['above_average'].values
X = df.drop(columns=['median_house_value', 'above_average'])
dicts = X.to_dict(orient='records')
X = DictVectorizer(sparse=False).fit_transform(dicts)

# Same seeds and sizes as the baseline split (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Score the validation rows at a 0.5 probability cut-off.
y_pred = model.predict_proba(X_val)[:, 1]
greater_than = y_pred > 0.5
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': greater_than.astype(int),
    'actual': y_val,
}).fillna(0)
df_pred['correct'] = df_pred.prediction == df_pred.actual
# Any non-zero value here means the baseline itself is not reproducible.
df_pred.correct.mean() - orig_acc


0.0

In [80]:
# Feature elimination: refit the classifier without `ocean_proximity` and report how
# the validation accuracy moves relative to `orig_acc`.
y = df['above_average'].values
X = df.drop(['median_house_value', 'above_average', 'ocean_proximity'], axis=1)
dicts = X.to_dict(orient='records')
X = DictVectorizer(sparse=False).fit_transform(dicts)

# Same seeds and sizes as the baseline split (60/20/20 overall).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Score the validation rows at a 0.5 probability cut-off.
y_pred = model.predict_proba(X_val)[:, 1]
greater_than = y_pred > 0.5
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': greater_than.astype(int),
    'actual': y_val,
}).fillna(0)
df_pred['correct'] = df_pred.prediction == df_pred.actual
# Positive: accuracy went up without the feature; negative: it went down.
df_pred.correct.mean() - orig_acc


-0.02422480620155043

In [81]:
# Accuracy changes (relative to `orig_acc`) saved by the four cells that did not
# display their result, shown together as one tuple.
(total_rooms, total_bedrooms, population, households)

(0.0043604651162790775,
 0.0009689922480620172,
 -0.005329457364341095,
 -0.0031492248062016115)

In [82]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Regression set-up: predict log1p(price) from every column except the price itself
# and the binary label derived from it.
y = np.log1p(df['median_house_value'].values)
X = df.drop(['median_house_value', 'above_average'], axis=1)
dicts = X.to_dict(orient='records')
X = DictVectorizer(sparse=False).fit_transform(dicts)

# 20% held out for test, then 25% of the remainder for validation (60/20/20 overall).
# The Ridge cells that follow reuse X_train / X_val / y_train / y_val from here.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

# NOTE(review): the features are not scaled before using the "sag" solver, and every
# alpha tried in this notebook prints the same rounded RMSE — check whether the solver
# actually converges here.
model = Ridge(alpha=0, solver="sag", random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_val)
# Validation RMSE on the log1p scale, 3 decimals.
round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)

0.524

In [83]:
# Same train/validation matrices as the alpha=0 run; only the penalty changes.
model = Ridge(alpha=0.01, solver="sag", random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_val)
# Validation RMSE on the log1p scale, 3 decimals.
round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)

0.524

In [84]:
# Same train/validation matrices as the alpha=0 run; only the penalty changes.
model = Ridge(alpha=0.1, solver="sag", random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_val)
# Validation RMSE on the log1p scale, 3 decimals.
round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)

0.524

In [85]:
# Same train/validation matrices as the alpha=0 run; only the penalty changes.
model = Ridge(alpha=1, solver="sag", random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_val)
# Validation RMSE on the log1p scale, 3 decimals.
round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)

0.524

In [86]:
# Same train/validation matrices as the alpha=0 run; only the penalty changes.
model = Ridge(alpha=10, solver="sag", random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_val)
# Validation RMSE on the log1p scale, 3 decimals.
round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)

0.524