In [59]:
from sklearn.metrics.pairwise import rbf_kernel

from preprocessing import numeric_pipeline, cat_pipe
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

housing_data = pd.read_csv(Path('datasets/housing/housing.csv'))

In [60]:
housing_data.dropna(inplace=True)

In [61]:
housing_data['income_cat'] = pd.cut(housing_data['median_income'],
                                    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                                    labels=[1,2,3,4,5]
                                    )

test, train = train_test_split(housing_data, test_size=0.2,
                               stratify=housing_data['income_cat'],
                               random_state=42)

In [62]:
learning_house_data = train.drop('median_house_value', axis=1)
labels = train['median_house_value'].copy()

In [63]:
from sklearn.preprocessing import OrdinalEncoder
house_categorys = housing_data[['ocean_proximity']]

ordinal_encoder = OrdinalEncoder()
encoded_categorys = ordinal_encoder.fit_transform(house_categorys)
encoded_categorys

array([[3.],
       [3.],
       [3.],
       ...,
       [1.],
       [1.],
       [1.]])

In [64]:
df_test_encoded = pd.DataFrame({
    'ocean_proximity': [
        '<1H OCEAN',
        'ISLAND',
        'NEAR OCEAN',
        'NEAR BAY',
        'INLAND'
    ]
})


In [65]:
df_out = pd.DataFrame(ordinal_encoder.transform(df_test_encoded))
df_out

Unnamed: 0,0
0,0.0
1,2.0
2,4.0
3,3.0
4,1.0


In [66]:
index = df_test_encoded.index
columns = df_test_encoded.columns

In [67]:
from sklearn.preprocessing import MinMaxScaler
housing_num = housing_data.select_dtypes(include=[np.number])
min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
housing_num_min_max = min_max_scaler.fit_transform(housing_num)

In [68]:
housing_num

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


In [69]:
housing_num_min_max

array([[-0.57768924,  0.13496281,  0.56862745, ..., -0.95888834,
         0.07933684,  0.80453276],
       [-0.57569721,  0.13071201, -0.21568627, ..., -0.62604835,
         0.07605412,  0.41649313],
       [-0.57968127,  0.12858661,  1.        , ..., -0.94211478,
        -0.06794389,  0.39010148],
       ...,
       [-0.37649402,  0.46439957, -0.37254902, ..., -0.85791811,
        -0.83447125, -0.6812343 ],
       [-0.39641434,  0.46439957, -0.33333333, ..., -0.88554514,
        -0.8114095 , -0.71257438],
       [-0.38047809,  0.45164718, -0.41176471, ..., -0.82601546,
        -0.73949325, -0.69319302]])

In [70]:
log_transform = FunctionTransformer(np.log, inverse_func=np.exp)
log_population = log_transform.transform(train[['population']])

In [71]:
from sklearn.metrics.pairwise import rbf_kernel
rbf_transformer = FunctionTransformer(rbf_kernel, kw_args=dict(Y=[[35.]], gamma=0.1))

age_simil_35 = rbf_transformer.transform(train[['housing_median_age']])
sf_coords = 37.7749, -122.41
sf_transform = FunctionTransformer(rbf_kernel, kw_args=dict(Y=[sf_coords], gamma=0.1))
sf_simil = sf_transform.transform(train[['latitude', 'longitude']])

In [72]:
from cluster import *
cluster_similarity = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
similarity = cluster_similarity.fit_transform(train[['latitude', 'longitude']])



In [73]:
similarity

array([[8.00052491e-01, 2.32799591e-15, 1.52751953e-12, ...,
        4.67424245e-18, 1.05314267e-02, 4.10323840e-20],
       [1.11779639e-11, 9.07179526e-01, 4.06126602e-01, ...,
        5.92297996e-02, 9.07160198e-06, 2.22226304e-01],
       [5.70298327e-02, 2.64182980e-07, 3.04780647e-06, ...,
        1.59984438e-10, 8.25427175e-01, 8.30121806e-11],
       ...,
       [5.65881215e-13, 9.95477694e-01, 3.24085148e-01, ...,
        8.73111452e-02, 1.15027250e-06, 4.13002272e-01],
       [9.85995978e-01, 1.27826060e-13, 4.87020518e-11, ...,
        2.83945808e-16, 4.25985942e-02, 4.06166361e-18],
       [3.55278297e-15, 8.13026654e-01, 1.64580457e-01, ...,
        1.19678087e-01, 3.05539359e-08, 7.85659600e-01]])