In [None]:
import pandas as pd

def load_data(csv_path="../datasets/housing/housing.csv"):
    """Read the housing dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the CSV file. The default is the relative path the
        notebook has always used, so ``load_data()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(csv_path)
# Load the housing CSV once; the following cells all work from this frame.
housingData = load_data()
# Preview the first rows as the cell output.
housingData.head()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
housingData.info()

In [None]:
# Count how many rows fall into each ocean_proximity value.
housingData["ocean_proximity"].value_counts()

In [None]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Draw a 50-bin histogram for each numeric column on a 20x15 figure.
housingData.hist(bins = 50, figsize=(20,15))
plt.show()

In [None]:
import numpy as np
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; selected positionally with ``iloc``.
    test_ratio : float
        Fraction of rows (between 0 and 1) to put in the test set. The test
        set size is ``int(len(data) * test_ratio)``, i.e. rounded down.
    random_state : int, optional
        Seed for a private random generator, making the split reproducible.
        When omitted, NumPy's global random state is used exactly as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within ``[0, 1]``.
    """
    # A ratio outside [0, 1] would silently produce a nonsensical slice.
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1, got %r" % (test_ratio,))

    if random_state is None:
        shuffled_indices = np.random.permutation(len(data))
    else:
        # Private generator: reproducible without touching the global seed.
        shuffled_indices = np.random.RandomState(random_state).permutation(len(data))

    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# Hold out 20% of the rows as a test set. No seed is passed or set in the
# visible cells, so this random split changes from run to run.
train_set, test_set = split_train_test(housingData, 0.2)

In [None]:
# Build an income category to stratify on: median_income scaled by 1.5 and
# rounded up, with every category of 5 or more merged into 5.0.
housingData["income_cat"] = np.ceil(housingData["median_income"]  / 1.5)
# Assign the capped values back explicitly: an in-place `where` on a column
# selection is chained assignment and may not update the frame (pandas
# Copy-on-Write).
housingData["income_cat"] = housingData["income_cat"].where(housingData["income_cat"] < 5, 5.0)

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(housingData, housingData["income_cat"]):
    # split() yields positional indices, so select with iloc rather than loc.
    # drop() returns new frames, so the helper column is removed from the
    # train/test sets without mutating slices of housingData and without
    # binding a loop variable named `set` over the builtin.
    strat_train_set = housingData.iloc[train_index].drop(["income_cat"], axis = 1)
    strat_test_set = housingData.iloc[test_index].drop(["income_cat"], axis = 1)

In [None]:
# Explore a copy so the stratified training set itself stays untouched.
housing = strat_train_set.copy()
# Geographic scatter plot; the low alpha lets overlapping points build up.
housing.plot.scatter(x="longitude", y="latitude", alpha=0.1)

In [None]:
# Same map, with marker size driven by population and colour by
# median_house_value (jet colormap, colorbar shown).
housing.plot.scatter(
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()

In [None]:
# Compute the pairwise correlation of the numeric columns.
# ocean_proximity holds text; recent pandas versions raise on non-numeric
# columns in corr() instead of skipping them, so select numeric columns first.

corr_matrix = housing.select_dtypes(include="number").corr()
# Rank every attribute by its correlation with the target column.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Derived ratio features. rooms_per_household must divide by the number of
# households (it previously divided by population, yielding rooms per person).
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

In [None]:
# Compute the median of every numeric attribute and use it to fill in the
# missing values.
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# SimpleImputer is its replacement. The old import is kept as a fallback for
# installations that predate SimpleImputer, and the name `Imputer` stays bound
# either way.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")
# The median is only defined for numbers, so leave the text column out.
housing_num = housing.drop("ocean_proximity", axis = 1)
# After fitting, the learned medians are available as imputer.statistics_.
imputer.fit(housing_num)

X = imputer.transform(housing_num)

# transform() returns a plain array; restore the column names and the original
# row index so housing_tr lines up row-for-row with housing.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
# Turn the text attribute ocean_proximity into integer labels.

from sklearn.preprocessing import LabelEncoder

housing_cat = housing["ocean_proximity"]

encoder = LabelEncoder()
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
# Show the class labels the encoder learned.
encoder.classes_

In [None]:
# One-hot encode ocean_proximity: turn the integer labels into one 0/1
# attribute per category (OneHotEncoder).

from sklearn.preprocessing import OneHotEncoder

# NOTE: `encoder` is rebound here and again below; only the last binding
# (the LabelBinarizer) survives this cell.
encoder = OneHotEncoder()
# reshape(-1,1) presents the label array as a single-column 2-D array.
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot

# Alternatively, do both of the previous steps (text -> integer labels ->
# 0/1 attributes) in one go, starting from the text column.
from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()
encoder.fit_transform(housing_cat)