In [1]:
import pandas as pd
from pathlib import Path
import tarfile
import urllib.request
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import cartopy.crs as ccrs
import cartopy.feature  as cfeature
import geopandas as gpd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV,\
    RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,\
    MinMaxScaler,StandardScaler, FunctionTransformer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.compose import TransformedTargetRegressor,make_column_selector,make_column_transformer,ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline, make_pipeline
from scipy.stats import binom
from sklearn import set_config
from sklearn.metrics import r2_score,root_mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint, expon, loguniform
from scipy import stats
import joblib
from sklearn.svm import SVR
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import MetaEstimatorMixin, clone
from sklearn.utils.estimator_checks import check_estimator

In [2]:
def load_housing_data():
    """Return the housing dataset as a DataFrame, fetching/unpacking it if needed.

    The CSV is read from ``datasets/housing/housing.csv``. When that file is
    missing, ``datasets/housing.tgz`` is downloaded (only if it is not already
    on disk) and unpacked into ``datasets/``. Checking for the CSV rather than
    the tarball means a leftover archive without an extracted CSV is recovered
    instead of failing in ``read_csv``.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.
    """
    datasets_dir = Path('datasets')
    tarball_path = datasets_dir / 'housing.tgz'
    csv_path = datasets_dir / 'housing' / 'housing.csv'

    if not csv_path.is_file():
        if not tarball_path.is_file():
            datasets_dir.mkdir(parents=True, exist_ok=True)
            url = 'https://github.com/ageron/data/raw/main/housing.tgz'
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # The 'data' filter rejects unsafe archive members and avoids the
            # DeprecationWarning raised by newer Pythons when no filter is
            # given; interpreters without extraction filters keep the old call.
            if hasattr(tarfile, 'data_filter'):
                housing_tarball.extractall(path=datasets_dir, filter='data')
            else:
                housing_tarball.extractall(path=datasets_dir)
    return pd.read_csv(csv_path)

# Load the raw dataset (downloaded on first use) and bind it to `housing`.
housing  = load_housing_data()

  housing_tarball.extractall(path='datasets')             # extract the data into the checked folder


In [3]:
# Purely random 80/20 split.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Income brackets used only as stratification labels. They are kept in a
# standalone Series so `housing` is never mutated and no helper column has to
# be dropped from the resulting splits afterwards.
income_cats = pd.cut(
    housing['median_income'],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)

# Stratified 80/20 split: each income bracket keeps its share in both parts.
htrain, htest = train_test_split(
    housing, test_size=.2, stratify=income_cats, random_state=42
)

In [4]:
# Predictors: every column of the training split except the target.
housing = htrain.drop(columns='median_house_value')
# Target: taken as a copy of the training split's target column.
housing_label = htrain.loc[:, 'median_house_value'].copy()

In [5]:
# Custom Transformers
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Encode samples by their RBF similarity to K-Means cluster centres.

    ``fit`` locates the cluster centres with ``KMeans``; ``transform`` returns
    ``rbf_kernel(X, centres, gamma)``, i.e. one column per cluster centre.

    Args:
        n_clusters: number of clusters passed to ``KMeans`` (and number of
            feature names reported by ``get_feature_names_out``).
        gamma: ``gamma`` argument forwarded to ``rbf_kernel``.
        random_state: ``random_state`` forwarded to ``KMeans``.
    """
    def __init__(self, n_clusters = 10, gamma = .1, random_state= None) -> None:
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y = None, sample_weight = None):
        """Fit K-Means on ``X`` (optionally weighted) and return ``self``.

        ``y`` is accepted for pipeline compatibility and is not used.
        """
        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF similarity of every row of ``X`` to each fitted centre."""
        # Fail with scikit-learn's NotFittedError rather than a bare
        # AttributeError on `kmeans_` when called before `fit`.
        check_is_fitted(self, 'kmeans_')
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names = None):
        """Return one output name per cluster; ``names`` is ignored."""
        return [f'Cluster {i} similarity' for i in range(self.n_clusters)]
        
# Using the custom transformer: cluster on location, weighting each sample
# by its label value.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
similarity = cluster_simil.fit_transform(
    housing[['latitude', 'longitude']],
    sample_weight=housing_label,
)

# First three rows, rounded to two decimals for display.
similarity[:3].round(2)

array([[0.  , 0.98, 0.  , 0.  , 0.  , 0.  , 0.13, 0.55, 0.  , 0.56],
       [0.64, 0.  , 0.11, 0.04, 0.  , 0.  , 0.  , 0.  , 0.99, 0.  ],
       [0.  , 0.65, 0.  , 0.  , 0.01, 0.  , 0.49, 0.59, 0.  , 0.28]])

In [6]:
# pipeline building

def num_pipeline():
    """Build a new pipeline: median imputation followed by standard scaling."""
    imputer = SimpleImputer(strategy='median')
    scaler = StandardScaler()
    return make_pipeline(imputer, scaler)

def cat_pipeline():
    """Build a new pipeline: most-frequent imputation followed by one-hot encoding.

    The encoder is created with ``handle_unknown='ignore'``.
    """
    imputer = SimpleImputer(strategy='most_frequent')
    encoder = OneHotEncoder(handle_unknown='ignore')
    return make_pipeline(imputer, encoder)

def column_ratio(X):
    """Divide column 0 of ``X`` by column 1, keeping a 2-D single-column result."""
    # Indexing with a list ([0], [1]) keeps the column axis.
    numerator, denominator = X[:, [0]], X[:, [1]]
    return numerator / denominator

def ratio_name(function_tranformer, feature_names_in):
    """Return the output feature names for the ratio step.

    Both arguments are ignored; the result is always the single name 'ratio'.
    """
    feature_names_out = ['ratio']
    return feature_names_out

def ratio_pipeline():
    """Build a new pipeline: median imputation, column-0/column-1 ratio, scaling."""
    steps = (
        SimpleImputer(strategy='median'),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*steps)
    
def log_pipeline():
    """Build a new pipeline: median imputation, natural log, standard scaling.

    The log step declares ``np.exp`` as its inverse and keeps feature names
    one-to-one.
    """
    log_transform = FunctionTransformer(
        func=np.log,
        inverse_func=np.exp,
        feature_names_out='one-to-one',
    )
    return make_pipeline(
        SimpleImputer(strategy='median'),
        log_transform,
        StandardScaler(),
    )
    
def cluster_simil_pipeline():
    """Return a new ``ClusterSimilarity`` with 10 clusters, gamma 1 and seed 42."""
    return ClusterSimilarity(n_clusters=10, gamma=1, random_state=42)
    
# Full preprocessing: ratio features, log-scaled counts, geographic cluster
# similarities and one-hot categoricals. Columns not named below go through
# the default numeric pipeline via `remainder`.
preprocessing = ColumnTransformer(
    transformers=[
        ('bedroom', ratio_pipeline(), ['total_bedrooms', 'total_rooms']),
        ('rooms_per_house', ratio_pipeline(), ['total_rooms', 'households']),
        ('people_per_house', ratio_pipeline(), ['population', 'households']),
        (
            'log',
            log_pipeline(),
            ['total_bedrooms', 'total_rooms', 'population', 'households', 'median_income'],
        ),
        ('geo', cluster_simil_pipeline(), ['latitude', 'longitude']),
        ('cat', cat_pipeline(), make_column_selector(dtype_include=object)),
    ],
    remainder=num_pipeline(),
)

# Fit on the training predictors and show the shape of the result.
housing_prepared = preprocessing.fit_transform(housing)
housing_prepared.shape

(16512, 24)

In [16]:
# 4 custom transformer

class FeatureFromRegressor(MetaEstimatorMixin, BaseEstimator, TransformerMixin):
    """Transformer whose output features are the predictions of a wrapped regressor.

    Args:
        estimator: the regressor to wrap. It is cloned inside ``fit``, so the
            object passed in is never fitted or modified.

    Attributes set by ``fit``:
        estimator_: the fitted clone of ``estimator``.
        n_features_in_: copied from the fitted clone.
        feature_names_in_: copied from the fitted clone, only when the clone
            recorded input feature names.
    """
    def __init__(self, estimator) -> None:
        self.estimator = estimator

    def fit(self, X, y=None):
        """Fit a clone of the wrapped estimator on ``(X, y)`` and return ``self``."""
        estimator_ = clone(self.estimator)
        estimator_.fit(X, y)
        self.estimator_ = estimator_
        self.n_features_in_ = self.estimator_.n_features_in_
        # Read the names from the *fitted clone*: the unfitted original never
        # has them, and the attribute carries a trailing underscore.
        if hasattr(self.estimator_, 'feature_names_in_'):
            self.feature_names_in_ = self.estimator_.feature_names_in_
        elif hasattr(self, 'feature_names_in_'):
            # Refit on data without column names: drop names from a previous fit.
            del self.feature_names_in_
        return self

    def transform(self, X):
        """Return the fitted estimator's predictions for ``X`` as a 2-D array."""
        check_is_fitted(self)
        predictions = self.estimator_.predict(X)
        if predictions.ndim == 1:
            # A 1-D prediction vector becomes a single feature column.
            predictions = predictions.reshape(-1, 1)
        return predictions

    def get_feature_names_out(self, names=None):
        """Return one name per output: ``<estimator class, lower-cased>_prediction_<i>``.

        ``names`` is ignored. The number of outputs is the fitted estimator's
        ``n_outputs_`` attribute when present, otherwise 1.
        """
        check_is_fitted(self)
        n_outputs = getattr(self.estimator_, 'n_outputs_', 1)
        estimator_class_name = self.estimator_.__class__.__name__
        estimator_short_name = estimator_class_name.lower().replace('_', '')
        return [f'{estimator_short_name}_prediction_{i}' for i in range(n_outputs)]
    

    

In [17]:
# Run scikit-learn's `check_estimator` on the custom transformer wrapped
# around a default KNeighborsRegressor.
check_estimator(FeatureFromRegressor(KNeighborsRegressor()))

In [20]:
# Distance-weighted 3-nearest-neighbours regressor on location only; its
# predictions become a single new feature column.
knn_reg = KNeighborsRegressor(n_neighbors=3, weights='distance')
knn_transformer = FeatureFromRegressor(estimator=knn_reg)
geo_features = housing.loc[:, ['latitude', 'longitude']]
knn_transformer.fit_transform(geo_features, housing_label)

array([[486100.66666667],
       [435250.        ],
       [105100.        ],
       ...,
       [148800.        ],
       [500001.        ],
       [234333.33333333]])

In [21]:
# Names of the feature columns produced by the fitted transformer.
knn_transformer.get_feature_names_out()

['kneighborsregressor_prediction_0']

In [28]:
# Copy every step of `preprocessing` as an unfitted clone. String
# placeholders such as 'drop' / 'passthrough' cannot be cloned and are kept.
transformers = [
    (name, transformer if isinstance(transformer, str) else clone(transformer), columns)
    for name, transformer, columns in preprocessing.transformers
]

# Swap the geographic step for the KNN-prediction feature.
geo_index = [name for name, _, _ in transformers].index('geo')
transformers[geo_index] = ('geo', knn_transformer, ['latitude', 'longitude'])

# Carry the original `remainder` over as well. Without it the new transformer
# falls back to the default remainder and silently drops every column that
# `preprocessing` handled through its remainder pipeline.
remainder = preprocessing.remainder
if not isinstance(remainder, str):
    remainder = clone(remainder)

new_geo_preprocessing = ColumnTransformer(transformers, remainder=remainder)