In [10]:
import numpy as np
import os

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
import xgboost as xgb
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

In [11]:
# Error metrics: display name -> callable(y_true, y_pred) returning a scalar.
# The display names are written verbatim into the report files, so they are
# kept as-is even where they differ from the usual metric name
# ('absoluteSE' is the mean absolute error, 'statistical correlation' is R^2).
def _root_mean_squared_error(y_true, y_pred):
    """Square root of the mean squared error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def _mean_relative_error_pct(y_true, y_pred):
    """Mean of |(y_true - y_pred) / y_true|, scaled by 100."""
    ratio = (y_true - y_pred) / y_true
    return np.mean(np.abs(ratio)) * 100


def _mean_squared_relative_error_pct(y_true, y_pred):
    """Mean of ((y_true - y_pred) / y_true) ** 2, scaled by 100."""
    ratio = (y_true - y_pred) / y_true
    return np.mean(np.square(ratio)) * 100


error_metrics = {
    'MSE': mean_squared_error,
    'rMSE': _root_mean_squared_error,
    'relative': _mean_relative_error_pct,
    'relativeSE': _mean_squared_relative_error_pct,
    'absoluteSE': mean_absolute_error,
    'statistical correlation': r2_score,
}

In [12]:
def perform_grid_search(model, param_grid, X_train, y_train, X_test, y_test, model_name):
    """Tune ``model`` with a 5-fold grid search and score the winner on the test split.

    Parameters
    ----------
    model : estimator
        Unfitted estimator used as the template for the search. It is cloned by
        ``GridSearchCV`` and is NOT modified or fitted by this function.
    param_grid : dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Data used for cross-validation and for the final refit.
    X_test, y_test
        Hold-out data used only to report the mean squared error.
    model_name : str
        Label used in the printed progress messages.

    Returns
    -------
    tuple
        ``(best_model, mse, y_pred)``: the estimator refit on the full training
        data with the best parameters, its MSE on the test split, and its
        test-split predictions.
    """
    search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error',
                          verbose=2, n_jobs=-1)
    search.fit(X_train, y_train)
    print(f"Best Parameters for {model_name}:", search.best_params_)

    # refit=True is the GridSearchCV default, so the best configuration has
    # already been retrained on the whole training set. Reusing that estimator
    # avoids a second (potentially expensive) fit and avoids mutating the
    # caller's template estimator via set_params().
    best_model = search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error for {model_name} (Best Model):", mse)

    return best_model, mse, y_pred

In [13]:
def model_selection(models, X_train, y_train, X_test, y_test, error_metrics):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
    X_train, y_train
        Data each estimator is fitted on.
    X_test, y_test
        Data each estimator is evaluated on.
    error_metrics : dict
        Maps a metric name to a callable ``(y_true, y_pred) -> value``.

    Returns
    -------
    dict
        ``{model_name: {metric_name: value}}`` in the iteration order of
        ``models`` and ``error_metrics``.
    """
    def _evaluate(estimator):
        # Fit first, then compute every metric from one set of predictions.
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)
        return {metric: scorer(y_test, predicted) for metric, scorer in error_metrics.items()}

    return {label: _evaluate(estimator) for label, estimator in models.items()}

In [14]:
def _average_predictions(models, X):
    """Return the unweighted mean of the predictions of ``models`` on ``X``."""
    predictions = [model.predict(X) for model in models]
    return sum(predictions) / len(predictions)


def _write_ensemble_report(path, y_true, y_pred, metrics):
    """Print every metric for an ensemble prediction and write the same lines to ``path``."""
    with open(path, 'w') as f:
        for error_name, error_func in metrics.items():
            error_rate = error_func(y_true, y_pred)
            print(f"Ensemble {error_name}:", error_rate)
            f.write(f"Ensemble {error_name}: {error_rate}\n")


def _save_predictions(path, ids, scores):
    """Write an ``id,score`` CSV, pairing ids and scores by position."""
    # Positional pairing: an index-based join would misalign rows (or produce
    # NaN) if ``ids`` did not carry a default 0..n-1 index.
    frame = pd.DataFrame({'id': np.asarray(ids), 'score': np.asarray(scores)})
    frame.to_csv(path, index=False)


def run_models(X_train, X_test, y_train, y_test, validation, validation_ids, dir_path):
    """Grid-search every configured model, evaluate them, and write predictions.

    Steps:
      1. Tune each estimator with ``perform_grid_search``.
      2. Refit and score every tuned estimator via ``model_selection`` using
         the module-level ``error_metrics``; print the scores.
      3. Evaluate two ensembles on the test split (mean of all models, mean of
         the three models with the lowest test MSE) and write
         ``ensemble_error.txt`` / ``ensemble_best_error.txt``.
      4. Predict ``validation`` with every model and both ensembles and write
         ``<name>_pred.csv``, ``<name>_error.txt``, ``ensemble_pred.csv`` and
         ``ensemble_best_pred.csv``.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Hold-out features and target used for every reported metric.
    validation
        Unlabelled features to predict.
    validation_ids
        Identifiers written next to the validation predictions (same length and
        order as ``validation``).
    dir_path : str
        Output prefix; file names are appended directly, so it is expected to
        end with a path separator. The directory must already exist.

    Returns
    -------
    None
    """
    # (display name, estimator template, hyper-parameter grid). The order
    # fixes both the order of the printed report and the summation order of
    # the full ensemble.
    # NOTE(review): KMeans and MeanShift are clustering estimators, so their
    # predict() output is a cluster index rather than a score. Their MSE in the
    # recorded run is an order of magnitude worse than the regressors and they
    # drag the full ensemble down. They are kept so the set of output files is
    # unchanged -- confirm whether they should be removed.
    model_specs = [
        ('KNN', KNeighborsRegressor(),
         {'n_neighbors': [5, 17, 18, 19], 'weights': ['uniform', 'distance'],
          'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'leaf_size': [5, 10, 15],
          'p': [1, 2]}),
        ('Decision Tree', DecisionTreeRegressor(random_state=42),
         {'max_depth': [None, 4, 5, 6, 8], 'min_samples_split': [2, 14, 15, 16, 17],
          'min_samples_leaf': [1, 8, 10, 12]}),
        ('SVM', SVR(),
         {'C': [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 1], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}),
        ('XGBoost', xgb.XGBRegressor(random_state=42),
         {'n_estimators': [40, 50, 65, 75, 100, 200],
          'max_depth': [2, 3, 5, 7, 9],
          'subsample': [0.7, 0.8, 0.9, 1],
          'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1]}),
        # n_estimators and min_samples_split are intentionally not searched here.
        ('Random Forest', RandomForestRegressor(random_state=42),
         {'max_depth': [5, 10, 15, None],
          'min_samples_leaf': [1, 2, 4]}),
        ('AdaBoost', AdaBoostRegressor(random_state=42),
         {'n_estimators': [50, 100, 150, 200, 500, 1000],
          'learning_rate': [0.01, 0.05, 0.1, 0.5, 1]}),
        ('Bayesian Ridge', BayesianRidge(),
         {'max_iter': [50, 100, 200, 300, 400, 500], 'tol': [0.01, 0.002, 1e-3, 1e-4, 1e-5, 1e-6]}),
        ('Linear Regression', LinearRegression(),
         {'fit_intercept': [True, False], 'copy_X': [True, False]}),
        ('Ridge Regression', Ridge(),
         {'alpha': [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 1, 2, 5], 'fit_intercept': [True, False],
          'copy_X': [True, False]}),
        ('Lasso Regression', Lasso(),
         {'alpha': [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 1], 'fit_intercept': [True, False],
          'copy_X': [True, False]}),
        ('K-Means', KMeans(),
         {'n_clusters': [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20]}),
        ('Mean Shift', MeanShift(), {}),
        ('Neural Network', MLPRegressor(),
         {'hidden_layer_sizes': [(300,), (400,), (300, 200), (400, 300), (200, 300)],
          'activation': ['relu', 'tanh', 'logistic'], 'solver': ['adam', 'sgd'],
          'learning_rate': ['constant', 'invscaling', 'adaptive']}),
    ]

    # Hyper-parameter search, one model at a time.
    models = {}
    for model_name, estimator, param_grid in model_specs:
        best_model, _, _ = perform_grid_search(estimator, param_grid, X_train, y_train, X_test, y_test,
                                               model_name)
        models[model_name] = best_model

    # Perform model selection (refits each tuned model and scores it on the test split)
    model_errors = model_selection(models, X_train, y_train, X_test, y_test, error_metrics)

    # Print model errors
    for model_name, errors in model_errors.items():
        print(f"{model_name}:")
        for error_name, error_value in errors.items():
            print(f"  {error_name}: {error_value}")
        print()

    # Ensemble of all models, evaluated on the test split
    _write_ensemble_report(dir_path + 'ensemble_error.txt', y_test,
                           _average_predictions(list(models.values()), X_test), error_metrics)

    # Ensemble of the three models with the lowest test MSE. The models were
    # already fitted by model_selection, so they are used as-is: refitting here
    # would only repeat work and, for unseeded estimators, would score a
    # different model than the one reported above.
    best_names = sorted(model_errors, key=lambda name: model_errors[name]['MSE'])[:3]
    _write_ensemble_report(dir_path + 'ensemble_best_error.txt', y_test,
                           _average_predictions([models[name] for name in best_names], X_test),
                           error_metrics)

    # Validation predictions. Everything written to the *_pred.csv files comes
    # from this dict, so every model (including the neural network) is paired
    # with predictions made on ``validation`` rather than on the test split.
    validation_preds = {name: model.predict(validation) for name, model in models.items()}
    ensemble_y_pred = sum(validation_preds.values()) / len(validation_preds)
    ensemble_y_pred_best = sum(validation_preds[name] for name in best_names) / len(best_names)

    for algo_name, preds in validation_preds.items():
        _save_predictions(dir_path + f'{algo_name}_pred.csv', validation_ids, preds)
        with open(dir_path + f'{algo_name}_error.txt', 'w') as f:
            for error_name, error_value in model_errors[algo_name].items():
                f.write(f"{error_name} for {algo_name}: {error_value}\n")

    _save_predictions(dir_path + 'ensemble_pred.csv', validation_ids, ensemble_y_pred)
    _save_predictions(dir_path + 'ensemble_best_pred.csv', validation_ids, ensemble_y_pred_best)


In [15]:
import pandas as pd

from sklearn.pipeline import Pipeline

from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import SparsePCA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import FactorAnalysis

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split

In [16]:
def dataset_transform(X, y, validation, test_size=0.2, random_state=42, preprocessing='StandardScaler', preprocessing_params=None, dim_reduction=None,
                      dim_reduction_params=None):
    """Split the labelled data and apply the selected preprocessing pipeline.

    ``X`` and ``y`` are merged on their ``id`` column, split into train/test
    parts, and then passed through a pipeline made of an optional scaler
    followed by an optional dimensionality reducer. The pipeline is fitted on
    the training split only and applied to the test split and ``validation``.

    Parameters
    ----------
    X : pandas.DataFrame
        Features with an ``id`` column.
    y : pandas.DataFrame
        Targets with ``id`` and ``score`` columns.
    validation : pandas.DataFrame
        Unlabelled features with an ``id`` column.
    test_size, random_state
        Forwarded to ``train_test_split``.
    preprocessing : str or None
        One of 'StandardScaler', 'MinMaxScaler', 'RobustScaler', 'Normalizer',
        'QuantileTransformer', 'PowerTransformer', 'PolynomialFeatures'.
        Any other value means no scaling step.
    preprocessing_params : dict or None
        Keyword arguments for the selected scaler only.
    dim_reduction : str or None
        One of 'PCA', 'KernelPCA', 'SparsePCA', 'TruncatedSVD',
        'FactorAnalysis'. Any other value means no reduction step.
    dim_reduction_params : dict or None
        Keyword arguments for the selected reducer only.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, validation, validation_id,
        dir_path)`` where ``dir_path`` is ``"results/<preprocessing>_<dim_reduction>/"``.

    Raises
    ------
    ValueError
        If neither ``preprocessing`` nor ``dim_reduction`` names a known step.
    """
    data = pd.merge(X, y, on='id')
    data = data.drop(['id'], axis=1)

    validation_id = validation['id']
    validation = validation.drop(['id'], axis=1)

    features = data.drop(['score'], axis=1)
    target = data['score']

    X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                        test_size=test_size,
                                                        random_state=random_state)

    # Map names to classes and instantiate only the selected one. Building
    # every candidate with the same **params would raise TypeError as soon as a
    # parameter specific to one estimator (e.g. PolynomialFeatures' ``degree``)
    # is supplied, because the other constructors do not accept it.
    scaler_classes = {
        'StandardScaler': StandardScaler,
        'MinMaxScaler': MinMaxScaler,
        'RobustScaler': RobustScaler,
        'Normalizer': Normalizer,
        'QuantileTransformer': QuantileTransformer,
        'PowerTransformer': PowerTransformer,
        'PolynomialFeatures': PolynomialFeatures,
    }
    reducer_classes = {
        'PCA': PCA,
        'KernelPCA': KernelPCA,
        'SparsePCA': SparsePCA,
        'TruncatedSVD': TruncatedSVD,
        'FactorAnalysis': FactorAnalysis,
    }

    preprocessing_steps = []

    scaler_cls = scaler_classes.get(preprocessing)
    if scaler_cls is not None:
        preprocessing_steps.append(('scaler', scaler_cls(**(preprocessing_params or {}))))

    reducer_cls = reducer_classes.get(dim_reduction)
    if reducer_cls is not None:
        preprocessing_steps.append(('reducer', reducer_cls(**(dim_reduction_params or {}))))

    if not preprocessing_steps:
        raise ValueError("No valid preprocessing or dimensionality reduction method provided")

    # Fit on the training split only so no information from the test or
    # validation data reaches the fitted transformers.
    pipeline = Pipeline(steps=preprocessing_steps)
    X_train = pipeline.fit_transform(X_train)
    X_test = pipeline.transform(X_test)
    validation = pipeline.transform(validation)

    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"y_test shape: {y_test.shape}")
    dir_path = f"results/{preprocessing}_{dim_reduction}/"
    return X_train, X_test, y_train, y_test, validation, validation_id, dir_path

In [17]:
def main():
    """Load the CSV inputs, preprocess them, and run the full model comparison.

    Reads ``pc_X_train.csv``, ``pc_y_train.csv`` and ``pc_X_test.csv`` from the
    working directory, builds the train/test/validation arrays with
    ``dataset_transform``, makes sure the output directory exists, and hands
    everything to ``run_models``.
    """
    # Raw inputs get their own names so they are not confused with the
    # transformed splits returned below.
    features_raw = pd.read_csv('pc_X_train.csv')
    targets_raw = pd.read_csv('pc_y_train.csv')
    validation_raw = pd.read_csv('pc_X_test.csv')

    X_train, X_test, y_train, y_test, validation, validation_id, dir_path = dataset_transform(
        X=features_raw,
        y=targets_raw,
        validation=validation_raw,
        test_size=0.2,
        random_state=42,
        preprocessing='PowerTransformer', # StandardScaler, MinMaxScaler, RobustScaler, Normalizer, QuantileTransformer, PowerTransformer, PolynomialFeatures
        preprocessing_params={},
        dim_reduction='SparsePCA',  # PCA, KernelPCA, SparsePCA, TruncatedSVD, FactorAnalysis
        dim_reduction_params={},
    )

    # dir_path is nested ("results/<preprocessing>_<dim_reduction>/"), so the
    # parent directory may be missing on a fresh checkout; makedirs creates the
    # whole chain and tolerates an existing directory.
    os.makedirs(dir_path, exist_ok=True)

    run_models(X_train, X_test, y_train, y_test, validation, validation_id, dir_path)

In [18]:
# Run the whole pipeline: read the CSV inputs, preprocess them, grid-search
# every model and write the reports and predictions to the results directory.
main()

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


X_train shape: (1580, 468)
X_test shape: (396, 468)
y_train shape: (1580,)
y_test shape: (396,)
Fitting 5 folds for each of 192 candidates, totalling 960 fits
Best Parameters for KNN: {'algorithm': 'auto', 'leaf_size': 5, 'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
Mean Squared Error for KNN (Best Model): 0.4306189724143329
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters for Decision Tree: {'max_depth': 4, 'min_samples_leaf': 12, 'min_samples_split': 2}
Mean Squared Error for Decision Tree (Best Model): 0.5142714439970607
Fitting 5 folds for each of 28 candidates, totalling 140 fits
Best Parameters for SVM: {'C': 0.7, 'kernel': 'rbf'}
Mean Squared Error for SVM (Best Model): 0.4066157560135386
Fitting 5 folds for each of 720 candidates, totalling 3600 fits
Best Parameters for XGBoost: {'colsample_bytree': 0.9, 'max_depth': 2, 'n_estimators': 40, 'subsample': 1}
Mean Squared Error for XGBoost (Best Model): 0.3992076013410574
Fitting 5 folds for each 

  super()._check_params_vs_input(X, default_n_init=10)


Best Parameters for K-Means: {'n_clusters': 8}


  super()._check_params_vs_input(X, default_n_init=10)


Mean Squared Error for K-Means (Best Model): 7.127525252525253
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters for Mean Shift: {}
Mean Squared Error for Mean Shift (Best Model): 14.738636363636363
Fitting 5 folds for each of 90 candidates, totalling 450 fits




Best Parameters for Neural Network: {'activation': 'logistic', 'hidden_layer_sizes': (300,), 'learning_rate': 'adaptive', 'solver': 'sgd'}




Mean Squared Error for Neural Network (Best Model): 0.4266798732192248


  super()._check_params_vs_input(X, default_n_init=10)


KNN:
  MSE: 0.4306189724143329
  rMSE: 0.6562156447497521
  relative: 16.64961631487319
  relativeSE: 8.242944048742777
  absoluteSE: 0.5015972494414369
  statistical correlation: 0.5564987437483924

Decision Tree:
  MSE: 0.5142714439970607
  rMSE: 0.7171272160482133
  relative: 18.368724430519404
  relativeSE: 10.011709343773733
  absoluteSE: 0.5496771143516019
  statistical correlation: 0.47034374684362334

SVM:
  MSE: 0.4066157560135386
  rMSE: 0.6376642972705455
  relative: 16.365033109957213
  relativeSE: 8.376035371979142
  absoluteSE: 0.4831021986902062
  statistical correlation: 0.5812200340532436

XGBoost:
  MSE: 0.3992076013410574
  rMSE: 0.6318287753347875
  relative: 15.866879224073507
  relativeSE: 7.045204457879708
  absoluteSE: 0.48508832460702067
  statistical correlation: 0.5888498091310364

Random Forest:
  MSE: 0.393091856197531
  rMSE: 0.6269703790431658
  relative: 15.678937271260487
  relativeSE: 7.009036889425538
  absoluteSE: 0.476193562563964
  statistical corr