In [1]:
import sys

# Make modules that live two directories above this notebook importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again.
LOCAL_MODULES_DIR = "../../"  # access to local modules
if LOCAL_MODULES_DIR not in sys.path:
    sys.path.append(LOCAL_MODULES_DIR)

In [2]:
import random

import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.base import clone
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.svm import SVR

In [3]:
# helper functions

from MLDemo import DataPulling
from MLDemo.DataVisualization import pipelines
from MLDemo.DataVisualization.cluster_similarity import ClusterSimilarity
from MLDemo.CustomRegressor.knn_transformer import KnnTransformer

In [4]:
# --- Column names used with the housing DataFrame ---------------------------
LONGITUDE: str = "longitude"
LATITUDE: str = "latitude"
HOUSING_MEDIAN_AGE: str = "housing_median_age"
TOTAL_ROOMS: str = "total_rooms"
TOTAL_BEDROOMS: str = "total_bedrooms"
POPULATION: str = "population"
HOUSEHOLDS: str = "households"
MEDIAN_INCOME: str = "median_income"
OCEAN_PROXIMITY: str = "ocean_proximity"
MEDIAN_HOUSE_VALUE: str = "median_house_value"  # regression target

# --- Names for derived columns / transformer steps --------------------------
# NOTE: the label is "housing_cat" although the column holds binned median income.
INCOME_CAT: str = "housing_cat"
BEDROOMS_RATIO: str = "bedrooms_ratio"
ROOMS_PER_HOUSE: str = "rooms_per_house"
PEOPLE_PER_HOUSE: str = "people_per_house"

# --- Split / reproducibility settings ---------------------------------------
RANDOM_STATE: int = 42
TEST_SIZE: float = 0.3
N_SPLITS: int = 10
# Bin edges used to bucket median income for the stratified split.
BINS: list[float] = [0.0, 1.5, 3.0, 4.5, 6.0, np.inf]

In [5]:
# Open the housing archive through the project's DataPulling helper.
# Per the annotation, the result is a list of (name, DataFrame) pairs.
data_fields: list[tuple[str, pd.DataFrame]] = DataPulling.open_tgz(DataPulling.HOUSING)

# The housing table is the DataFrame of the first entry in the archive.
first_entry = data_fields[0]
housing: pd.DataFrame = first_entry[1]

In [6]:
# Split into training and test sets.
# The split is stratified on binned median income, so both sets keep the same
# share of each income bucket as the full dataset.

# Helper column holding the income bucket (labels 1 .. len(BINS) - 1).
housing[INCOME_CAT] = pd.cut(housing[MEDIAN_INCOME], bins=BINS, labels=range(1, len(BINS)))
train_set, test_set = train_test_split(
    housing,
    test_size=TEST_SIZE,
    stratify=housing[INCOME_CAT],
    random_state=RANDOM_STATE,
)

# Pull the regression target out before it is removed from the feature frames.
train_label = train_set[MEDIAN_HOUSE_VALUE].copy()
test_label = test_set[MEDIAN_HOUSE_VALUE].copy()

# Rebind to new frames instead of drop(..., inplace=True): in-place edits on
# frames derived from `housing` can raise SettingWithCopyWarning on some pandas
# versions. The helper bucket column and the target are removed in one call.
train_set = train_set.drop(columns=[INCOME_CAT, MEDIAN_HOUSE_VALUE])
test_set = test_set.drop(columns=[INCOME_CAT, MEDIAN_HOUSE_VALUE])

In [7]:
# Preparation pipeline: reusable pieces first, then the column transformer.
cluster_simil: ClusterSimilarity = ClusterSimilarity(
    n_clusters=10,
    gamma=1.,
    random_state=RANDOM_STATE,
)
# Fallback for numeric columns: fill gaps with the median, then standardize.
default_num_pipeline: Pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# (step name, transformer, input columns) triples; the ratio/log/cat pipelines
# come from the project's `pipelines` helper module.
column_steps = [
    (BEDROOMS_RATIO, pipelines.ratio_pipeline(), [TOTAL_BEDROOMS, TOTAL_ROOMS]),
    (ROOMS_PER_HOUSE, pipelines.ratio_pipeline(), [TOTAL_ROOMS, HOUSEHOLDS]),
    (PEOPLE_PER_HOUSE, pipelines.ratio_pipeline(), [POPULATION, HOUSEHOLDS]),
    ("log", pipelines.log_pipeline(), [TOTAL_BEDROOMS, TOTAL_ROOMS, POPULATION, HOUSEHOLDS, MEDIAN_INCOME]),
    ("geo", cluster_simil, [LATITUDE, LONGITUDE]),
    ("cat", pipelines.cat_pipeline(), make_column_selector(dtype_include=object)),
]

# Columns not claimed by any step above go through default_num_pipeline.
preprocessing: ColumnTransformer = ColumnTransformer(
    transformers=column_steps,
    remainder=default_num_pipeline,
)


In [8]:
# Baseline model: preprocessing followed by an SVR with default hyper-parameters.
full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("SVM", SVR())
])

# Variant that prunes features before the SVR: SelectFromModel keeps the features
# whose random-forest importance reaches the 0.005 threshold.
selector_pipeline = Pipeline([
    # Independent copy of the preprocessing step: Pipeline does not clone its
    # steps, so sharing one ColumnTransformer would let fitting (or set_params on)
    # this pipeline silently change full_pipeline as well.
    ("preprocessing", clone(preprocessing)),
    # Seeded so the selected feature set is reproducible between runs.
    ("selector", SelectFromModel(
        estimator=RandomForestRegressor(random_state=RANDOM_STATE),
        threshold=0.005,
    )),
    ("SVM", SVR())
])

In [9]:
# Train the baseline pipeline on the training split.
full_pipeline.fit(X=train_set, y=train_label)

# Train the feature-selecting variant on the same data; kept as the final
# expression so the notebook still renders the fitted pipeline.
selector_pipeline.fit(X=train_set, y=train_label)

In [10]:
# RMSE on the training split. Despite its name, `residuals` holds the model's
# predictions for the training data, not prediction errors.
residuals = full_pipeline.predict(train_set)
train_rmse = root_mean_squared_error(train_label, residuals)
print(f"residual default parameters: {train_rmse}")

# RMSE on the held-out test split.
predictions = full_pipeline.predict(test_set)
test_rmse = root_mean_squared_error(test_label, predictions)
print(f"predicted errors default parameters: {test_rmse}")


residual default parameters: 118220.48874643087
predicted errors default parameters: 118567.5630319671


In [11]:
# Looking for hyper-parameters to tune: list every pipeline parameter whose
# current value is a bool/float/int, plus anything with "kernel" in its name.
for name, value in full_pipeline.get_params().items():
    is_scalar = isinstance(value, (bool, float, int))
    if not (is_scalar or "kernel" in name):
        continue
    print(f"{name}: {value} ({type(value)})")

verbose: False (<class 'bool'>)
preprocessing__force_int_remainder_cols: True (<class 'bool'>)
preprocessing__remainder__verbose: False (<class 'bool'>)
preprocessing__remainder__simpleimputer__add_indicator: False (<class 'bool'>)
preprocessing__remainder__simpleimputer__copy: True (<class 'bool'>)
preprocessing__remainder__simpleimputer__keep_empty_features: False (<class 'bool'>)
preprocessing__remainder__simpleimputer__missing_values: nan (<class 'float'>)
preprocessing__remainder__standardscaler__copy: True (<class 'bool'>)
preprocessing__remainder__standardscaler__with_mean: True (<class 'bool'>)
preprocessing__remainder__standardscaler__with_std: True (<class 'bool'>)
preprocessing__sparse_threshold: 0.3 (<class 'float'>)
preprocessing__verbose: False (<class 'bool'>)
preprocessing__verbose_feature_names_out: True (<class 'bool'>)
preprocessing__bedrooms_ratio__verbose: False (<class 'bool'>)
preprocessing__bedrooms_ratio__simpleimputer__add_indicator: False (<class 'bool'>)
pre

In [12]:
# Candidate values shared by both kernels in the exhaustive grid search.
N_CLUSTER_CANDIDATES = [5, 8, 10, 15, 20, 25, 50]
SVM_C_CANDIDATES = [1.0, 2.0, 4.0, 10.0]

# One sub-grid per kernel; each sub-grid gets its own copy of the candidate
# lists so the two dictionaries stay independent objects.
param_grid = [
    {
        "SVM__kernel": [kernel],
        "preprocessing__geo__n_clusters": list(N_CLUSTER_CANDIDATES),
        "SVM__C": list(SVM_C_CANDIDATES),
    }
    for kernel in ("rbf", "linear")
]

In [13]:
# grid search
#grid_search = GridSearchCV(full_pipeline, param_grid, cv=3, scoring="neg_root_mean_squared_error")
#grid_search.fit(train_set, train_label)

In [14]:
#print(grid_search.best_params_)

In [15]:
#final_model = grid_search.best_estimator_
#final_residuals = final_model.predict(train_set)
#final_predictions = final_model.predict(test_set)
#print(f"residual final: {root_mean_squared_error(train_label, final_residuals)}")
#print(f"predicted errors final: {root_mean_squared_error(test_label, final_predictions)}")

In [16]:
# Search space for the randomized hyper-parameter search.
param_distribution = {
    "SVM__kernel": ["rbf", "linear"],
    # scipy's randint draws from the half-open range [low, high), so high=51 is
    # needed for 50 clusters (the largest value in param_grid) to be reachable.
    "preprocessing__geo__n_clusters": randint(low=5, high=51),
    # uniform(loc, scale) spans [loc, loc + scale], i.e. C in [1.0, 10.0].
    "SVM__C": uniform(loc=1.0, scale=9.0),
}

In [17]:
# rnd_search = RandomizedSearchCV(full_pipeline, param_distribution, n_iter=10, cv=3, scoring="neg_root_mean_squared_error")
# rnd_search.fit(train_set, train_label)

In [18]:
# print(rnd_search.best_params_)

In [19]:
# rnd_model = rnd_search.best_estimator_
# rnd_residuals = rnd_model.predict(train_set)
# rnd_predictions = rnd_model.predict(test_set)
# print(f"Residual rnd: {root_mean_squared_error(train_label, rnd_residuals)}")
# print(f"predicted errors rnd: {root_mean_squared_error(test_label, rnd_predictions)}")

In [20]:
# rnd_search2 = RandomizedSearchCV(selector_pipeline, param_distribution, n_iter=10, cv=3, scoring="neg_root_mean_squared_error")
# rnd_search2.fit(train_set, train_label)

In [21]:
# print(rnd_search2.best_params_)

In [22]:
# rnd_model2 = rnd_search2.best_estimator_
# rnd_residuals2 = rnd_model2.predict(train_set)
# rnd_predictions2 = rnd_model2.predict(test_set)
# print(f"Residual rnd: {root_mean_squared_error(train_label, rnd_residuals2)}")
# print(f"predicted errors rnd: {root_mean_squared_error(test_label, rnd_predictions2)}")

In [23]:
# Same steps as `preprocessing`, plus a "knn" step on the coordinates.
# The `cluster_simil` and `default_num_pipeline` instances are reused here.
knn_column_steps = [
    (BEDROOMS_RATIO, pipelines.ratio_pipeline(), [TOTAL_BEDROOMS, TOTAL_ROOMS]),
    (ROOMS_PER_HOUSE, pipelines.ratio_pipeline(), [TOTAL_ROOMS, HOUSEHOLDS]),
    (PEOPLE_PER_HOUSE, pipelines.ratio_pipeline(), [POPULATION, HOUSEHOLDS]),
    ("log", pipelines.log_pipeline(), [TOTAL_BEDROOMS, TOTAL_ROOMS, POPULATION, HOUSEHOLDS, MEDIAN_INCOME]),
    ("geo", cluster_simil, [LATITUDE, LONGITUDE]),
    ("cat", pipelines.cat_pipeline(), make_column_selector(dtype_include=object)),
    ("knn", pipelines.knn_pipeline(), [LATITUDE, LONGITUDE]),
]

# Columns not claimed by any step above go through default_num_pipeline.
preprocessing_knn: ColumnTransformer = ColumnTransformer(
    transformers=knn_column_steps,
    remainder=default_num_pipeline,
)