# Let's Build a Model!

# Custom Transformer

We want to create two new features for our data set: 

- Within 10 miles of Los Angeles (`1=yes`, `0=no`)
- Within 10 miles of San Francisco (`1=yes`, `0=no`)

In [7]:
from math import radians
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import haversine_distances
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Import data into DataFrame
data = fetch_california_housing()
X = pd.DataFrame(data['data'], columns=data['feature_names'])
y = data['target']

# Custom transformer for 'Latitude' and 'Longitude' cols
class NearCity(BaseEstimator, TransformerMixin):
    def __init__(self, distance=10):
        self.la = (34.05, -118.24)
        self.sf = (37.77, -122.41)
        self.distance = distance #See http://tinyurl.com/y8m263h3
        
    def calc_dist(self, coords_1, coords_2):
        coords_1 = [radians(c) for c in coords_1]
        coords_2 = [radians(c) for c in coords_2]
        result = haversine_distances([coords_1, coords_2])[0,-1]
        return result * 6_371
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        dist_to_sf = np.apply_along_axis(self.calc_dist, 1, X, coords_2=self.sf)
        dist_to_sf = (dist_to_sf < self.distance).astype(int)
        
        dist_to_la = np.apply_along_axis(self.calc_dist, 1, X, coords_2=self.la)
        dist_to_la = (dist_to_la < self.distance).astype(int)
        
        X_trans = np.column_stack((X, dist_to_sf, dist_to_la))
        return X_trans

In [15]:
other_cols = X.columns[:-2].tolist()

In [20]:
ct = ColumnTransformer(remainder='drop',
                       transformers = [
                        ('near_city', NearCity(), ['Latitude', 'Longitude'])#,
                        ('scaler', StandardScaler(), other_cols)]
                       )

In [21]:
X_trans = ct.fit_transform(X)

In [22]:
print(X.shape)
print(X_trans.shape)

(20640, 8)
(20640, 10)


In [27]:
pd.DataFrame(X_trans).describe().loc[['mean', 'std']]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
mean,35.631861,-119.569704,0.028488,0.049661,6.6097e-17,5.508083e-18,6.6097e-17,-1.060306e-16,-1.101617e-17,3.442552e-18
std,2.135952,2.003532,0.166367,0.217249,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024
