In [16]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import itertools as it
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression as mir
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder as onehot
from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
import xgboost as xgb
import lightgbm as lgb
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [17]:
x_train = pd.read_csv('data/final_dataset/x_train.csv')
y_train = pd.read_csv('data/y_train_ZAN5mwg.csv')
x_test = pd.read_csv('data/final_dataset/x_test.csv')
pd.set_option('display.max_columns', None)

In [18]:
def metric_train(output, test):
    # Calculate MSE and MAE for the benchmark
    mse = mean_squared_error(output, test)
    mae = mean_absolute_error(output, test)
    spearman_corr = spearmanr(output, test).correlation
    return mse, mae, spearman_corr

In [19]:
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=88, eta=0.3, gamma=15, max_depth=5)
xgb_model.fit(x_train, y_train['TARGET'])
xgb_test_output = xgb_model.predict(x_train)
print('Spearman correlation for XGB: {:.1f}%'.format(100 * metric_train(xgb_test_output, y_train['TARGET'])[2]))
print('MSE for XGB: {:.1f}'.format(metric_train(xgb_test_output, y_train['TARGET'])[0]))
print('MAE for XGB: {:.1f}'.format(metric_train(xgb_test_output, y_train['TARGET'])[1]))

Spearman correlation for XGB: 27.1%
MSE for XGB: 1.0
MAE for XGB: 0.6


In [23]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 1, 5],
    'eta': [0.3, 0.1, 0.01],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

def custom_spearman_scorer(estimator, X, y):
    predictions = estimator.predict(X)
    spearman_corr, _ = spearmanr(predictions, y)
    return spearman_corr

xgb_model = xgb.XGBRegressor(objective="reg:absoluteerror", random_state=88)

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring=custom_spearman_scorer)
grid_search.fit(x_train, y_train['TARGET'])

best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print("Best parameters:", best_params)
print("Best score (negative spearman correlation):", best_score)



Best parameters: {'colsample_bytree': 1.0, 'eta': 0.3, 'gamma': 5, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best score (negative spearman correlation): -0.24466535023596386


In [24]:
Best_parameters= {'colsample_bytree': 1.0, 'eta': 0.3, 'gamma': 5, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}

best_model = xgb.XGBRegressor(objective="reg:absoluteerror", random_state=88, **Best_parameters)
best_model.fit(x_train, y_train['TARGET'])
xgb_test_output = best_model.predict(x_train)
print('Spearman correlation for XGB: {:.1f}%'.format(100 * metric_train(xgb_test_output, y_train['TARGET'])[2]))
print('MSE for XGB: {:.1f}'.format(metric_train(xgb_test_output, y_train['TARGET'])[0]))
print('MAE for XGB: {:.1f}'.format(metric_train(xgb_test_output, y_train['TARGET'])[1]))

Spearman correlation for XGB: 78.6%
MSE for XGB: 0.9
MAE for XGB: 0.4


In [22]:
# Submission
xgb_test_output = best_model.predict(x_test)
submission = pd.DataFrame({'ID': x_test['ID'], 'TARGET': xgb_test_output})
submission.to_csv('data/submission/submission_3.csv', index=False)