In [None]:
n_neighbours
weights
p - distance function

In [73]:
import pandas as pd
import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, plot_confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

pd.set_option('display.max_rows', 100)

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [63]:
house_data = pd.read_csv("../data/Housing_data/housing-classification-iter6.csv", index_col='Id')

X = house_data.drop(columns=['Expensive'])
y = house_data['Expensive']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=8)#, stratify=categoric_features)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [60]:
def plot_predictions(y_true, y_pred): 
    print(
        f"""
        RMSLE: {mean_squared_log_error(y_true, y_pred)**0.5}
        MSE: {mean_squared_error(y_true, y_pred)}
        RMSE: {mean_squared_error(y_true, y_pred)**0.5}
        MAE: {mean_absolute_error(y_true, y_pred)}
        """
    )
    max_preds = min([max(y_pred.tolist()), max(y_true.tolist())])
    min_preds = max([min(y_pred.tolist()), min(y_true.tolist())])
    print(max_preds, min_preds)
    # plot
    plt.figure(figsize=(8,8))
    sns.scatterplot(x=y_pred, y=y_true)
    sns.lineplot(x=[min_preds,max_preds], y=[min_preds, max_preds], color='red')
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()

    errors = y_pred - y_true
    plt.subplots(figsize=(12, 8))
    sns.histplot(errors)
    plt.vlines(x=0, ymin=0, ymax=150, color='red')
    plt.show()

    p_df = (
        pd.DataFrame({'y_true':y_true, 'y_pred':y_pred})
        .assign(error = lambda x: x['y_pred'] - x['y_true'])
        .sort_values(by='y_true')
        )

    plt.subplots(figsize=(12, 8))
    sns.scatterplot(data=p_df, x='y_true', y='error')
    plt.hlines(y=0, xmin=0, xmax=max(p_df['y_true']), color='red')
    plt.show()

In [64]:
# Ordered categorical columns
ordinal_categorical_features  = ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC", "KitchenQual", 
                                 "FireplaceQu", "GarageQual", "GarageCond", "PoolQC","LotShape", "LandSlope",
                                 "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Functional", "GarageFinish", 
                                 "PavedDrive", "Utilities", "Electrical", "Fence"] 

categorical_features = list(X_train.select_dtypes(include=["object"]))
# Unordered categorical columns
nominative_categorical_features = list(set(categorical_features) - set(ordinal_categorical_features))

numerical_features = list(X_train.select_dtypes(exclude=["object"]))

ordinal_categories = [
    ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    ["None", "Po", "Fa", "TA", "Gd", "Ex"],
    ["None", "Reg", "IR1", "IR2", "IR3"],
    ["None", "Sev", "Mod", "Gtl"],
    ["None", "No", "Mn", "Av", "Gd"],
    ["None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    ["None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    ["None", "Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    ["None", "Unf", "RFn", "Fin"],
    ["None", "N", "P", "Y"],
    ["None", "NoSeWa", "NoSewr", "AllPub"],
    ["None", "Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    ["None", "MnWw", "GdWo", "MnPrv", "GdPrv"]
]

In [65]:
# Create the scaling pipelines for categorical and numerical data
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', MinMaxScaler())
])
one_hot_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy="constant", fill_value="None")),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])
ordinal_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy="constant", fill_value="None")),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories)),
    ('scale', MinMaxScaler())
])

# Apply the scaling pipeline to both categorical and numerical columns
full_processor = ColumnTransformer(transformers=[
    ('numerical', numeric_pipeline, numerical_features), 
    ('one_hot', one_hot_pipeline, nominative_categorical_features), 
    ('ordinal', ordinal_pipeline, ordinal_categorical_features)
])

### KNeighbors

In [78]:
knn_pipeline = Pipeline(steps=[
    ('processor', full_processor), 
    ('model', KNeighborsClassifier(n_neighbors=6))
])

_ = knn_pipeline.fit(X_train, y_train)

training_predictions = knn_pipeline.predict(X_train)
# Plot the training data predictions
# plot_predictions(y_train, training_predictions)
print(f'RMSLE: {mean_squared_log_error(y_train, training_predictions)**0.5}')

RMSLE: 0.1756445469887147


### LogisticRegression

In [77]:
lr_pipeline = Pipeline(steps=[
    ('processor', full_processor), 
    ('model', LogisticRegression(random_state=0))
])

_ = lr_pipeline.fit(X_train, y_train)

training_predictions = knn_pipeline.predict(X_train)
# Plot the training data predictions
# plot_predictions(y_train, training_predictions)
print(f'RMSLE: {mean_squared_log_error(y_train, training_predictions)**0.5}')

RMSLE: 0.1756445469887147


### RandomForest

In [76]:
rf_pipeline = Pipeline(steps=[
    ('processor', full_processor), 
    ('model', RandomForestClassifier(max_depth=2, random_state=0))
])

_ = rf_pipeline.fit(X_train, y_train)

training_predictions = rf_pipeline.predict(X_train)
# Plot the training data predictions
# plot_predictions(y_train, training_predictions)
print(f'RMSLE: {mean_squared_log_error(y_train, training_predictions)**0.5}')

RMSLE: 0.20180022073989282


### Support Vector Machine

In [80]:
svc_pipeline = Pipeline(steps=[
    ('processor', full_processor), 
    ('model', SVC(gamma='auto'))
])

_ = svc_pipeline.fit(X_train, y_train)

training_predictions = svc_pipeline.predict(X_train)
# Plot the training data predictions
# plot_predictions(y_train, training_predictions)
print(f'RMSLE: {mean_squared_log_error(y_train, training_predictions)**0.5}')

# log_training_predictions = svc_pipeline.predict_log_proba(X_train)
# print(f'RMSLE: {mean_squared_log_error(y_train, log_training_predictions)**0.5}')

RMSLE: 0.20683341819717507
