# Decision Tree - QRT ENS Data Challenge


## Libraries

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor


## Loading data


In [None]:
# Read the three challenge CSV files from the working directory, in order:
# training features, training targets, test features.
X_train, y_train, X_test = (
    pd.read_csv(csv_name)
    for csv_name in ('X_train.csv', 'y_train.csv', 'X_test.csv')
)

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,ID,DAY_ID,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,DE_FR_EXCHANGE,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_NET_IMPORT,...,FR_RESIDUAL_LOAD,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET
0,1054,206,FR,0.210099,-0.427458,-0.606523,0.606523,,0.69286,,...,-0.444661,-0.17268,-0.556356,-0.790823,-0.28316,-1.06907,-0.063404,0.339041,0.124552,-0.002445
1,2049,501,FR,-0.022399,-1.003452,-0.022063,0.022063,-0.57352,-1.130838,0.57352,...,-1.183194,-1.2403,-0.770457,1.522331,0.828412,0.437419,1.831241,-0.659091,0.047114,-0.490365
2,1924,687,FR,1.395035,1.978665,1.021305,-1.021305,-0.622021,-1.682587,0.622021,...,1.947273,-0.4807,-0.313338,0.431134,0.487608,0.684884,0.114836,0.535974,0.743338,0.204952
3,297,720,DE,-0.983324,-0.849198,-0.839586,0.839586,-0.27087,0.56323,0.27087,...,-0.976974,-1.114838,-0.50757,-0.499409,-0.236249,0.350938,-0.417514,0.911652,-0.296168,1.073948
4,1101,818,FR,0.143807,-0.617038,-0.92499,0.92499,,0.990324,,...,-0.526267,-0.541465,-0.42455,-1.088158,-1.01156,0.614338,0.729495,0.245109,1.526606,2.614378


In [None]:
# Preview the first five rows of the training targets.
y_train.head(n=5)

Unnamed: 0,ID,TARGET
0,1054,0.028313
1,2049,-0.112516
2,1924,-0.18084
3,297,-0.260356
4,1101,-0.071733


## Decision Tree and train score

In [None]:
# Build the training matrices and fit an unconstrained decision tree.

# Features: every column except the categorical COUNTRY label, missing values set to 0.
# NOTE(review): the identifier columns ID and DAY_ID stay in the feature matrix so
# that the test-set cells, which build X_test_clean the same way, remain compatible
# with the fitted tree. Consider dropping them consistently in every cell.
X_train_clean = X_train.drop(['COUNTRY'], axis=1).fillna(0)

# Target: look each row's TARGET up by its ID instead of relying on X_train and
# y_train being stored in the same row order. A left merge keeps the row order of
# X_train; validate='one_to_one' raises if an ID is duplicated on either side.
y_train_clean = X_train[['ID']].merge(
    y_train[['ID', 'TARGET']], on='ID', how='left', validate='one_to_one'
)['TARGET']
# Fail early with a clear message if some training row has no target.
assert y_train_clean.notna().all(), 'some X_train IDs have no TARGET in y_train'

# Decision tree with default (unlimited) depth and a fixed random seed
dt = DecisionTreeRegressor(random_state=42)

# Fit the decision tree on the data
dt.fit(X_train_clean, y_train_clean)

# In-sample predictions, used for the train score
output_train_dt = dt.predict(X_train_clean)

# Define the metric function
def metric_train(output, y_true=None):
    """Return the Spearman rank correlation between predictions and targets.

    Parameters
    ----------
    output : array-like
        Predicted values.
    y_true : array-like, optional
        Reference values, same length as ``output``. When omitted, the
        notebook-level ``y_train_clean`` is used, which is the original
        behaviour of this function.

    Returns
    -------
    float
        Spearman correlation coefficient as returned by
        ``scipy.stats.spearmanr``.
    """
    if y_true is None:
        # Resolved at call time so the default follows the current training target.
        y_true = y_train_clean
    return spearmanr(output, y_true).correlation

# Report the in-sample Spearman correlation as a percentage
train_score_pct = 100 * metric_train(output_train_dt)
print('Spearman correlation for the train set using decision tree: {:.1f}%'.format(train_score_pct))


Spearman correlation for the train set using decision tree: 100.0%


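The Spearman correlation obtained with our model on the training set is 100%. This is expected: a decision tree with no depth limit can keep splitting until it reproduces every training target, so this score says nothing about how well the model generalizes.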

## Generate the benchmark output

Next, we process the test set in the same way as the train set.

In [None]:
# Preview the first five rows of the test features.
X_test.head(n=5)

Unnamed: 0,ID,DAY_ID,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,DE_FR_EXCHANGE,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_NET_IMPORT,...,FR_RESIDUAL_LOAD,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET
0,1115,241,FR,0.340083,-0.433604,-0.423521,0.423521,0.165333,0.519419,-0.165333,...,-0.222525,-0.51318,-0.182048,-0.982546,-0.876632,0.880491,0.692242,0.569419,-0.029697,-0.929256
1,1202,1214,FR,0.803209,0.780411,0.60161,-0.60161,0.342802,0.555367,-0.342802,...,0.857739,-0.340595,-0.301094,-0.759816,-1.221443,-0.616617,-0.737496,0.251251,0.753646,0.664086
2,1194,1047,FR,0.79554,0.721954,1.179158,-1.179158,1.620928,0.666901,-1.620928,...,0.447967,0.796475,-0.367248,0.376055,-0.483363,0.865138,0.120079,-1.485642,-0.32645,-0.349747
3,1084,1139,FR,0.172555,-0.723427,-0.044539,0.044539,,-0.205276,,...,-0.561295,-0.542606,-0.013291,-0.791119,-0.894309,0.239153,0.457457,-0.746863,2.262654,0.642069
4,1135,842,FR,0.949714,0.420236,0.617391,-0.617391,0.608561,-0.240856,-0.608561,...,0.503567,-0.230291,-0.609203,-0.744986,-1.196282,0.176557,0.312557,-2.219626,-0.509272,-0.488341


In [None]:
# Same preprocessing as for training: drop COUNTRY, replace missing values by 0
X_test_clean = X_test.drop(columns=['COUNTRY']).fillna(0)

# Submission frame: the test IDs next to the default tree's predictions
y_test_submission = X_test[['ID']].assign(TARGET=dt.predict(X_test_clean))

# Write the submission file without the DataFrame index
y_test_submission.to_csv('DecisionTree.csv', index=False)


After submitting this DecisionTree.csv file, we obtain a public score of around -0.41%, indicating that the model overfits the training set. In the next step, we will try to reduce overfitting by tuning hyperparameters.

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_predict

In [None]:
# Define the custom scoring function using Spearman correlation
def custom_spearman_scorer(estimator, X, y):
    """Score a fitted estimator by the Spearman correlation of its predictions.

    Follows the ``scorer(estimator, X, y)`` callable signature, so it can be
    passed as ``scoring=`` to ``GridSearchCV``. Larger values are better.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict(X)``.
    X : array-like
        Features to predict on.
    y : array-like
        True target values for ``X``.

    Returns
    -------
    float
        Spearman correlation between ``estimator.predict(X)`` and ``y``, or
        0.0 when the correlation is undefined.
    """
    predictions = estimator.predict(X)
    spearman_corr, _ = spearmanr(predictions, y)
    # spearmanr yields NaN when either input is constant (e.g. a model that
    # predicts a single value). Report "no rank association" instead, so that
    # every candidate gets a finite, comparable score.
    if np.isnan(spearman_corr):
        return 0.0
    return float(spearman_corr)

In [None]:


# Hyperparameter candidates explored by the first grid search
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validated search, candidates ranked with the Spearman scorer
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring=custom_spearman_scorer,
    cv=5,
)
grid_search.fit(X_train_clean, y_train_clean)

# Keep the winning estimator returned by the search
tuned_dt = grid_search.best_estimator_

# In-sample predictions of the tuned tree
output_train_best_dt = tuned_dt.predict(X_train_clean)

# Report the selected parameters and the train-set Spearman correlation
print('Best Parameters:', grid_search.best_params_)
tuned_train_score_pct = 100 * metric_train(output_train_best_dt)
print('Spearman correlation for the train set using the best model: {:.1f}%'.format(tuned_train_score_pct))


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Spearman correlation for the train set using the best model: 37.9%


In [None]:
# Same preprocessing as for training: drop COUNTRY, replace missing values by 0
X_test_clean = X_test.drop(columns=['COUNTRY']).fillna(0)

# Submission frame: the test IDs next to the tuned tree's predictions
y_test_submission = X_test[['ID']].assign(TARGET=tuned_dt.predict(X_test_clean))

# Write the submission file without the DataFrame index
y_test_submission.to_csv('DecisionTree_tuned.csv', index=False)

After submitting the CSV file to the challenge, we obtained a public score of around -0.53%, which is not an improvement over the default parameters. Therefore, we will also tune `criterion`, increase the values of `min_samples_split` and `min_samples_leaf` in the grid search, and decrease `max_depth`.

In [None]:
# Second grid: shallower trees, larger split/leaf sizes, and the split criterion
param_grid = {
    'max_depth': [5, 10, 12],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [5, 10, 20],
    'criterion': ['absolute_error', 'friedman_mse', 'squared_error'],
}

# 5-fold cross-validated search, candidates ranked with the Spearman scorer
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring=custom_spearman_scorer,
    cv=5,
)
grid_search.fit(X_train_clean, y_train_clean)

# Keep the winning estimator returned by the search
best_dt = grid_search.best_estimator_

# In-sample predictions of the selected tree
output_train_best_dt = best_dt.predict(X_train_clean)

# Report the selected parameters and the train-set Spearman correlation
print('Best Parameters:', grid_search.best_params_)
best_train_score_pct = 100 * metric_train(output_train_best_dt)
print('Spearman correlation for the train set using the best model: {:.1f}%'.format(best_train_score_pct))

Best Parameters: {'criterion': 'absolute_error', 'max_depth': 12, 'min_samples_leaf': 20, 'min_samples_split': 5}
Spearman correlation for the train set using the best model: 49.9%


After tuning, the best set of parameters is `criterion='absolute_error'`, `max_depth=12`, `min_samples_leaf=20`, `min_samples_split=5`.
The Spearman correlation on the train set using this model is 49.9%.

In [None]:
# Same preprocessing as for training: drop COUNTRY, replace missing values by 0
X_test_clean = X_test.drop(columns=['COUNTRY']).fillna(0)

# Submission frame: the test IDs next to the predictions of the second tuned tree
y_test_submission = X_test[['ID']].assign(TARGET=best_dt.predict(X_test_clean))

# Write the submission file without the DataFrame index
y_test_submission.to_csv('DecisionTree_tune2.csv', index=False)

After submitting the CSV file to the challenge, we obtained a public score of around 13.91%, which is a significant improvement over the previous set of parameters.