In [10]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
import xgboost as xgb

import numpy as np
import pandas as pd

import os

In [11]:
# Evaluation metrics, keyed by the label under which each one is reported.
# Every entry is a callable invoked as metric(y_true, y_pred).
# Heads-up on the labels: 'absoluteSE' is scikit-learn's mean absolute error
# and 'statistical correlation' is the R^2 score.
error_metrics = {}
error_metrics['MSE'] = mean_squared_error
# Root of the mean squared error.
error_metrics['rMSE'] = lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))
# Mean absolute error relative to the true value, scaled by 100.
error_metrics['relative'] = lambda y_true, y_pred: np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Mean squared error relative to the true value, scaled by 100.
error_metrics['relativeSE'] = lambda y_true, y_pred: np.mean(np.square((y_true - y_pred) / y_true)) * 100
error_metrics['absoluteSE'] = mean_absolute_error
error_metrics['statistical correlation'] = r2_score

In [12]:
def perform_grid_search(model, param_grid, X_train, y_train, X_test, y_test, model_name):
    """Tune ``model`` with a 5-fold grid search and score the winner on the test split.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor. It is used as a template
        only and is left untouched.
    param_grid : dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Data used for the cross-validated search and the final refit.
    X_test, y_test
        Hold-out data used to report the mean squared error.
    model_name : str
        Label used in the printed messages.

    Returns
    -------
    tuple
        ``(best_model, mse, y_pred)``: the estimator refitted on the whole
        training split with the best parameters, its test MSE, and its test
        predictions.
    """
    search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
                          scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
    search.fit(X_train, y_train)
    print(f"Best Parameters for {model_name}:", search.best_params_)

    # GridSearchCV defaults to refit=True, so best_estimator_ is already a
    # clone of `model` trained on all of X_train with the best parameters.
    # Using it avoids mutating the caller's estimator and fitting a second time.
    best_model = search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error for {model_name} (Best Model):", mse)

    return best_model, mse, y_pred

In [13]:
def model_selection(models, X_train, y_train, X_test, y_test, error_metrics):
    """Fit every model on the training split and score it on the test split.

    ``models`` maps a display name to an estimator exposing ``fit`` and
    ``predict``; ``error_metrics`` maps a metric label to a callable invoked
    as ``metric(y_test, y_pred)``.

    Returns a nested dict ``{model_name: {metric_label: value}}`` that keeps
    the iteration order of both input mappings.
    """
    def _score(estimator):
        # Train, predict once, then evaluate every metric on that prediction.
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        return {label: metric(y_test, predictions)
                for label, metric in error_metrics.items()}

    return {name: _score(estimator) for name, estimator in models.items()}

In [14]:
def _average_predictions(fitted_models, X):
    """Return the unweighted mean of each fitted model's predictions on ``X``."""
    return sum(model.predict(X) for model in fitted_models) / len(fitted_models)


def _print_ensemble_errors(label, y_true, y_pred):
    """Print every metric of the module-level ``error_metrics`` for ``y_pred``."""
    for metric_name, metric in error_metrics.items():
        print(f"{label} {metric_name}:", metric(y_true, y_pred))


def _save_predictions(ids, scores, path):
    """Write an ``id``/``score`` table to ``path`` as CSV without the index."""
    pd.DataFrame({'id': ids, 'score': scores}).to_csv(path, index=False)


def run_final(X_train, X_test, y_train, y_test, validation, validation_ids, dir_path):
    """Tune four regressors, report their errors and write averaged predictions.

    Steps:

    1. Grid-search an SVR, an XGBoost regressor, a random forest and a ridge
       regression with ``perform_grid_search``.
    2. Score the tuned models with ``model_selection`` and save the table to
       ``model_errors.csv`` (one row per metric, one column per model).
    3. Build three simple-average ensembles (all four models, the three with
       the lowest test MSE, the two with the lowest test MSE), print their
       test errors and write their predictions for ``validation`` to
       ``ensemble.csv``, ``ensemble_3.csv`` and ``ensemble_2.csv``.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Hold-out features and target used for every reported error.
    validation
        Features to predict for the output files.
    validation_ids
        Identifiers written to the ``id`` column of the output files.
    dir_path : str
        Existing directory that receives the CSV files.

    Returns
    -------
    None
    """
    # Estimator template and hyper-parameter grid for every candidate model.
    candidates = {
        'SVM': (SVR(), {'C': [0.7, 0.75, 0.8]}),
        'XGBoost': (
            xgb.XGBRegressor(random_state=42),
            {
                'n_estimators': [30, 35, 40, 50],
                'max_depth': [2, 3, 4],
                'subsample': [0.7, 0.8, 0.9, 1],
            },
        ),
        'Random Forest': (
            RandomForestRegressor(random_state=42),
            {
                'max_depth': [2, 6, 8, 10, None],
                'min_samples_leaf': [3, 4, 5, 6, 7],
            },
        ),
        'Ridge Regression': (
            Ridge(),
            {'alpha': [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 1, 2, 4, 5, 6, 10]},
        ),
    }

    models = {}
    for model_name, (estimator, param_grid) in candidates.items():
        # Only the tuned estimator is needed; the test MSE is printed by the
        # search itself and all metrics are recomputed by model_selection.
        models[model_name], _, _ = perform_grid_search(
            estimator, param_grid, X_train, y_train, X_test, y_test, model_name)

    # model_selection fits every model on the training split, so they are
    # ready for prediction afterwards and need no further refitting.
    model_errors = model_selection(models, X_train, y_train,
                                   X_test, y_test, error_metrics)

    # The metric names are the index of this frame; keep them in the file,
    # otherwise the rows cannot be told apart.
    pd.DataFrame(model_errors).to_csv(os.path.join(dir_path, 'model_errors.csv'),
                                      index_label='metric')

    # Model names ordered from lowest to highest test MSE.
    ranked = sorted(model_errors, key=lambda name: model_errors[name]['MSE'])

    # Plain averaging ensembles: report label -> (output file, member models).
    ensembles = {
        'Ensemble': ('ensemble.csv', list(models.values())),
        'Ensemble_3': ('ensemble_3.csv', [models[name] for name in ranked[:3]]),
        'Ensemble_2': ('ensemble_2.csv', [models[name] for name in ranked[:2]]),
    }
    for label, (filename, members) in ensembles.items():
        _print_ensemble_errors(label, y_test, _average_predictions(members, X_test))
        _save_predictions(validation_ids,
                          _average_predictions(members, validation),
                          os.path.join(dir_path, filename))

In [15]:
import pandas as pd

from sklearn.pipeline import Pipeline

from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import SparsePCA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import FactorAnalysis

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split

In [16]:
def dataset_transform(X, y, validation, test_size=0.2, random_state=42, preprocessing='StandardScaler', preprocessing_params=None, dim_reduction=None,
                      dim_reduction_params=None):
    """Split the labelled data and apply a scaler and/or dimensionality reducer.

    ``X`` and ``y`` are merged on their ``id`` column, ``score`` is taken as
    the target, and the rows are split into train and test parts. The
    preprocessing pipeline is fitted on the training part only and then
    applied to the test part and to ``validation``.

    Parameters
    ----------
    X, y : pandas.DataFrame
        Feature and target tables sharing an ``id`` column; the merged table
        must contain a ``score`` column.
    validation : pandas.DataFrame
        Features to transform for later prediction; must contain ``id``.
    test_size, random_state
        Forwarded to ``train_test_split``.
    preprocessing : str or None
        One of ``StandardScaler``, ``MinMaxScaler``, ``RobustScaler``,
        ``Normalizer``, ``QuantileTransformer``, ``PowerTransformer``,
        ``PolynomialFeatures``. Any other value skips this step.
    preprocessing_params : dict or None
        Keyword arguments for the selected preprocessing class.
    dim_reduction : str or None
        One of ``PCA``, ``KernelPCA``, ``SparsePCA``, ``TruncatedSVD``,
        ``FactorAnalysis``. Any other value skips this step.
    dim_reduction_params : dict or None
        Keyword arguments for the selected reduction class.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, validation, validation_id,
        dir_path)`` where ``dir_path`` is
        ``results/<preprocessing>_<dim_reduction>/``.

    Raises
    ------
    ValueError
        If neither ``preprocessing`` nor ``dim_reduction`` names a known
        transformer.
    """
    # Map names to classes rather than instances: only the selected
    # transformer is constructed, so parameters that are specific to it
    # (e.g. ``feature_range``) no longer blow up in an unrelated constructor.
    scaler_classes = {
        'StandardScaler': StandardScaler,
        'MinMaxScaler': MinMaxScaler,
        'RobustScaler': RobustScaler,
        'Normalizer': Normalizer,
        'QuantileTransformer': QuantileTransformer,
        'PowerTransformer': PowerTransformer,
        'PolynomialFeatures': PolynomialFeatures,
    }
    reducer_classes = {
        'PCA': PCA,
        'KernelPCA': KernelPCA,
        'SparsePCA': SparsePCA,
        'TruncatedSVD': TruncatedSVD,
        'FactorAnalysis': FactorAnalysis,
    }

    preprocessing_steps = []
    scaler_cls = scaler_classes.get(preprocessing)
    if scaler_cls is not None:
        preprocessing_steps.append(('scaler', scaler_cls(**(preprocessing_params or {}))))
    reducer_cls = reducer_classes.get(dim_reduction)
    if reducer_cls is not None:
        preprocessing_steps.append(('reducer', reducer_cls(**(dim_reduction_params or {}))))

    # Fail before touching the data when nothing usable was requested.
    if not preprocessing_steps:
        raise ValueError("No valid preprocessing or dimensionality reduction method provided")

    data = pd.merge(X, y, on='id').drop(['id'], axis=1)

    validation_id = validation['id']
    validation = validation.drop(['id'], axis=1)

    features = data.drop(['score'], axis=1)
    target = data['score']

    X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                        test_size=test_size,
                                                        random_state=random_state)

    # Fit on the training split only, then reuse the fitted pipeline.
    pipeline = Pipeline(steps=preprocessing_steps)
    X_train = pipeline.fit_transform(X_train)
    X_test = pipeline.transform(X_test)
    validation = pipeline.transform(validation)

    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"y_test shape: {y_test.shape}")
    dir_path = f"results/{preprocessing}_{dim_reduction}/"
    return X_train, X_test, y_train, y_test, validation, validation_id, dir_path

In [17]:
def main():
    """Load the CSV inputs, preprocess them and run the model comparison.

    Reads ``pc_X_train.csv``, ``pc_y_train.csv`` and ``pc_X_test.csv`` from
    the working directory, transforms them with ``dataset_transform``, makes
    sure the results directory exists and hands everything to ``run_final``.
    """
    X_train = pd.read_csv('pc_X_train.csv')
    y_train = pd.read_csv('pc_y_train.csv')
    validation = pd.read_csv('pc_X_test.csv')

    X_train, X_test, y_train, y_test, validation, validation_id, dir_path = dataset_transform(
        X=X_train,
        y=y_train,
        validation=validation,
        test_size=0.2,
        random_state=42,
        preprocessing='PowerTransformer', # StandardScaler, MinMaxScaler, RobustScaler, Normalizer, QuantileTransformer, PowerTransformer, PolynomialFeatures
        preprocessing_params={},
        dim_reduction='SparsePCA',  # PCA, KernelPCA, SparsePCA, TruncatedSVD, FactorAnalysis
        dim_reduction_params={},
    )

    # dir_path is nested ("results/<name>/"): makedirs also creates the
    # missing parent, and exist_ok avoids the check-then-create race.
    os.makedirs(dir_path, exist_ok=True)

    run_final(X_train, X_test, y_train, y_test, validation, validation_id, dir_path)

In [18]:
# Entry point of the notebook: execute the pipeline defined in main().
main()

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


X_train shape: (1580, 468)
X_test shape: (396, 468)
y_train shape: (1580,)
y_test shape: (396,)
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameters for SVM: {'C': 0.75}
Mean Squared Error for SVM (Best Model): 0.40683627500306063
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters for XGBoost: {'max_depth': 2, 'n_estimators': 30, 'subsample': 1}
Mean Squared Error for XGBoost (Best Model): 0.40544378264792474
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best Parameters for Random Forest: {'max_depth': 8, 'min_samples_leaf': 6}
Mean Squared Error for Random Forest (Best Model): 0.4000606175939326
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters for Ridge Regression: {'alpha': 10}
Mean Squared Error for Ridge Regression (Best Model): 0.4070181115297032
Ensemble MSE: 0.3828287347967014
Ensemble rMSE: 0.6187315530960914
Ensemble relative: 15.651057319527395
Ensemble relativeSE: 7.0185947492904255