In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer


In [2]:
# Read the data
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv')


In [3]:
# Data preparation
df = df[['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity']]
df = df.fillna(0)   
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population_per_household'] = df['population'] / df['households']


In [4]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
string_columns = list(df.dtypes[df.dtypes == 'object'].index)
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

df.isnull().sum()

df.fillna(0, inplace=True)


In [5]:
# Split the data
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

len(df_train), len(df_val), len(df_test)
len(df), (len(df_train) + len(df_val) + len(df_test))

y_train = df_train['median_house_value'].values
y_val = df_val['median_house_value'].values
y_test = df_test['median_house_value'].values

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']


In [6]:
# Question 1
df_train['ocean_proximity'].value_counts().index[0]


'<1h_ocean'

In [7]:
## Make median_house_value binary
df_train_full['above_average'] = (df_train_full['median_house_value'] > df_train_full['median_house_value'].mean()).astype(int)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)


In [8]:
# Question 2
corr_matrix = df_train.corr()
corr_matrix['median_house_value'].sort_values(ascending=False)


median_house_value          1.000000
above_average               0.804277
median_income               0.689202
rooms_per_household         0.179062
total_rooms                 0.128442
housing_median_age          0.106235
households                  0.056161
total_bedrooms              0.041154
population_per_household   -0.030561
population                 -0.033653
longitude                  -0.044328
latitude                   -0.144455
bedrooms_per_room          -0.256201
Name: median_house_value, dtype: float64

In [9]:
# Question 3
mi = mutual_info_score(df_train['above_average'], df_train['ocean_proximity'])
round(mi, 2)


0.57

In [10]:
# Question 4
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
accuracy = (y_pred == y_val).mean()
round(accuracy, 2)


0.06

In [None]:
features = ['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'rooms_per_household', 'bedrooms_per_room', 'population_per_household', 'ocean_proximity']
scores = []
for feature in features:
    train_dict = df_train.drop(feature, axis=1).to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val.drop(feature, axis=1).to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    accuracy = (y_pred == y_val).mean()
    scores.append(accuracy)

scores = np.array(scores)
scores

scores - accuracy
