In this notebook I try to combine the predictions of classical machine learning (ML) models and dense neural networks (DNNs).
## Plan
1. Prepare data
2. Get models
3. Try averaging predictions
4. Use a model to predict based on the predictions made by both the ML and the DNN models

In [99]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import get_custom_objects
from xgboost import XGBRegressor

In [81]:
# 1. Load the data.
#    - dataset_new_features: cleaned dataset with the new features used earlier with the ML models
#    - dataset_cleaned:      cleaned dataset without the new features
#    - train_raw / test_raw: the files under data\raw
# DATA_DIR is the single place to edit when the project lives in another location.
DATA_DIR = Path(r"C:\Users\Adam\Desktop\main\programming\machine learning\house prices regression\data")

dataset_new_features = pd.read_csv(DATA_DIR / "preprocessed" / "dataset_with_new_features_rep.csv")
dataset_cleaned = pd.read_csv(DATA_DIR / "preprocessed" / "entire_prep_dataset.csv")
train_raw = pd.read_csv(DATA_DIR / "raw" / "train.csv")
test_raw = pd.read_csv(DATA_DIR / "raw" / "test.csv")

In [82]:
# pipeline
def my_pipeline(df):
    """
    Encodes non-numeric columns using OneHotEncoder and scales numeric columns using StandardScaler.

    Both transformers are fitted on ``df`` itself, so the scaling statistics and
    the one-hot categories are derived from exactly the rows that are passed in.

    Parameters:
        df (pd.DataFrame): Input dataset as a pandas DataFrame.

    Returns:
        pd.DataFrame: Transformed dataset with encoded and scaled features.
        The scaled numeric columns come first (keeping their original names),
        followed by the one-hot columns; the index of ``df`` is preserved.
    """
    # Identify numeric and categorical columns
    numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = df.select_dtypes(exclude=[np.number]).columns.tolist()

    # Define transformers for numeric and categorical features
    numeric_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

    # Combine transformers in a ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    # Apply transformations
    transformed_data = preprocessor.fit_transform(df)

    # Get column names for transformed data.
    # When there are no categorical columns the encoder is never fitted, and asking
    # it for feature names would raise, so an all-numeric frame is handled explicitly.
    if categorical_features:
        encoder = preprocessor.named_transformers_['cat']
        cat_columns = list(encoder.get_feature_names_out(categorical_features))
    else:
        cat_columns = []
    all_columns = numeric_features + cat_columns

    # Create a DataFrame with transformed data
    transformed_df = pd.DataFrame(transformed_data, columns=all_columns, index=df.index)

    return transformed_df

In [83]:
# Encode and scale both full datasets (feature-engineered first, then plain cleaned).
dataset_nf_proc, dataset_cl_proc = (
    my_pipeline(frame) for frame in (dataset_new_features, dataset_cleaned)
)

In [84]:
# Shape of the feature-engineered dataset vs. the processed DNN training rows.
# `train_proc_dnn` is only created in a later cell, so the same slice is taken
# directly from `dataset_cl_proc`; this keeps the cell working on a fresh
# Restart & Run All instead of raising NameError.
dataset_new_features.shape, dataset_cl_proc[:train_raw.shape[0]].shape

((2919, 84), (1460, 286))

In [97]:
# The first `train_raw.shape[0]` rows of each processed frame are used as training rows
# (assumes the preprocessed files stack the train rows before the test rows — TODO confirm).
n_train_rows = train_raw.shape[0]
target = train_raw['SalePrice'].to_numpy()

# The same split settings are used for both feature sets, so both splits pick the same rows.
split_options = dict(test_size=0.2, random_state=42)

# DNNs: split, then cast every part to float32
train_proc_dnn = dataset_cl_proc.iloc[:n_train_rows]
X_train_dnn, X_val_dnn, y_train_dnn, y_val_dnn = (
    part.astype(np.float32)
    for part in train_test_split(train_proc_dnn, target, **split_options)
)

# ML
train_proc_ml = dataset_nf_proc.iloc[:n_train_rows]
X_train_ml, X_val_ml, y_train_ml, y_val_ml = train_test_split(
    train_proc_ml, target, **split_options
)

In [102]:
# Everything after the first `train_raw.shape[0]` rows is kept as the test portion
# of each processed frame (DNN features first, then ML features).
test_proc_dnn, test_proc_ml = (
    processed.iloc[train_raw.shape[0]:]
    for processed in (dataset_cl_proc, dataset_nf_proc)
)

In [87]:
class XGBRegressorWrapper(RegressorMixin, BaseEstimator):
    """Scikit-learn style estimator that forwards all work to an inner ``XGBRegressor``.

    The wrapped model is stored on ``self.model`` and is built from whatever
    keyword arguments are passed to the constructor.

    ``BaseEstimator`` discovers hyperparameters from the explicit arguments of
    ``__init__`` and ignores ``**kwargs``. Without the ``get_params`` /
    ``set_params`` overrides below, the wrapper would report no parameters at
    all, and ``sklearn.base.clone`` (used by meta-estimators such as
    ``VotingRegressor``) would rebuild it with default XGBoost settings.
    """

    def __init__(self, **kwargs):
        self.model = XGBRegressor(**kwargs)

    def get_params(self, deep=True):
        """Return the hyperparameters of the wrapped ``XGBRegressor``."""
        return self.model.get_params(deep=deep)

    def set_params(self, **params):
        """Set hyperparameters on the wrapped ``XGBRegressor`` and return ``self``."""
        self.model.set_params(**params)
        return self

    def fit(self, X, y):
        """Fit the wrapped model on ``X``/``y`` and return ``self``."""
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self.model.predict(X)

    def score(self, X, y):
        """Return the wrapped model's own ``score(X, y)``."""
        return self.model.score(X, y)

In [88]:
def rmse(y_true, y_preds):
    """Root-mean-squared error between targets and predictions.

    Both inputs are cast to float32; the result is the square root of the mean
    of the squared differences, taken over all elements.
    """
    targets = K.cast(y_true, dtype='float32')
    outputs = K.cast(y_preds, dtype='float32')
    squared_error = K.square(targets - outputs)
    return K.sqrt(K.mean(squared_error))

In [89]:
# 2. Get models
# MODELS_DIR is the single place to edit when the saved models live in another location.
MODELS_DIR = Path(r"C:\Users\Adam\Desktop\main\programming\machine learning\house prices regression\models")

# a) ML models
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load files from a trusted source.
xgb = joblib.load(MODELS_DIR / "xgb_model_v1.pkl")
rf = joblib.load(MODELS_DIR / "rf_model_v1.pkl")
ridge = joblib.load(MODELS_DIR / "ridge_model_v1.pkl")
lasso = joblib.load(MODELS_DIR / "lasso_model_v1.pkl")
lr = joblib.load(MODELS_DIR / "lr_model_v1.pkl")

# b) DNN models
# The five saved networks differ only by the number in the file name, so they are
# loaded in order 1..5 with the same custom `rmse` object and unpacked into the
# same variable names as before.
model_dnn_1, model_dnn_2, model_dnn_3, model_dnn_4, model_dnn_5 = (
    load_model(str(MODELS_DIR / f"model_dnn_{i}.h5"), custom_objects={'rmse': rmse})
    for i in range(1, 6)
)



In [90]:
# Compile the DNN models so that the rmse metric is built.
# Each model gets its own 'adam' optimizer and its own MeanSquaredError loss instance.
for dnn_model in (model_dnn_1, model_dnn_2, model_dnn_3, model_dnn_4, model_dnn_5):
    dnn_model.compile(optimizer='adam', loss=MeanSquaredError(), metrics=[rmse])
print('Models succesfuly compiled.')

Models succesfuly compiled.


In [91]:
# Shape of the processed test rows that the DNN models predict on.
test_proc_dnn.shape

(1459, 286)

In [129]:
# 3. Averaging: predictions are needed from both the DNNs and the ML models.
# a) predictions
dnn_models = {
    'model_dnn_1': model_dnn_1,
    'model_dnn_2': model_dnn_2,
    'model_dnn_3': model_dnn_3,
    'model_dnn_4': model_dnn_4,
    'model_dnn_5': model_dnn_5,
}

# Training settings shared by every DNN.
fit_options = dict(
    validation_data=(X_val_dnn, y_val_dnn),
    epochs=80,
    batch_size=64,
    verbose=0,
)

# Fit each model on the training split, then collect its predictions for the test rows.
predictions_dnn = []
for dnn_name, dnn_model in dnn_models.items():
    history = dnn_model.fit(X_train_dnn, y_train_dnn, **fit_options)
    predictions_dnn.append(dnn_model.predict(test_proc_dnn))



[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 995us/step
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [130]:
# Element-wise average over the models: the list of per-model predictions becomes
# one array with a leading model axis, which is then reduced away.
predictions_dnn_mean = np.mean(np.asarray(predictions_dnn), axis=0)

In [131]:
# b) Combine the loaded ML models in a voting ensemble and fit it on the ML training split.
# NOTE(review): scikit-learn's VotingRegressor fits clones of the given estimators,
# so the fitted state loaded from disk is presumably not reused here — confirm this is intended.
ml_estimators = [
    ('xgb', xgb),
    ('rf', rf),
    ('lasso', lasso),
    ('ridge', ridge),
    ('lr', lr),
]
voting_regressor = VotingRegressor(estimators=ml_estimators)
voting_regressor.fit(X_train_ml, y_train_ml)

In [132]:
# Voting-ensemble predictions of the ML models for the processed test rows.
y_preds = voting_regressor.predict(test_proc_ml)

In [133]:
# Compare the shapes of the ML predictions and the averaged DNN predictions before combining them.
y_preds.shape, predictions_dnn_mean.shape

((1459,), (1459, 1))

In [134]:
# Inspect the averaged DNN predictions.
predictions_dnn_mean

array([[101555.445],
       [132004.08 ],
       [155925.3  ],
       ...,
       [145568.55 ],
       [ 90838.69 ],
       [185946.47 ]], shape=(1459, 1), dtype=float32)

In [139]:
# Give the 1-D ML predictions a trailing axis so they match the column-shaped DNN mean,
# then take the plain average of the two prediction sets.
preds = y_preds[:, np.newaxis]
mean_preds = (preds + predictions_dnn_mean) / 2

In [145]:
# Average the ML and DNN predictions and attach them to the test Ids as the submission frame.
preds = np.reshape(y_preds, (-1, 1))
mean_preds = (predictions_dnn_mean + preds) / 2
submission_mean_dnn_ml = test_raw[['Id']].assign(SalePrice=mean_preds.flatten())

In [147]:
# Write the averaged predictions to a CSV file (without the index column), relative to the current working directory.
submission_mean_dnn_ml.to_csv('submission_mean_dnn_ml.csv',index=False)