In [1]:
#Mounting on drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

def load_datasets(x_train_path, y_train_path, x_test_path):
    # Loading datasets
    x_train = pd.read_csv(x_train_path)
    y_train = pd.read_csv(y_train_path)
    x_test = pd.read_csv(x_test_path)

    # Replace missing values with 0
    x_train.replace(' ', 0, inplace=True)
    y_train.replace(' ', 0, inplace=True)
    x_test.replace(' ', 0, inplace=True)


    # Filling missing values with 0
    x_train.fillna(0, inplace=True)
    y_train.fillna(0, inplace=True)
    x_test.fillna(0, inplace=True)


    # Describing the datasets to check for missing values
    dataframes_info = {
        'X_train': x_train.describe(include='all'),
        'Y_train': y_train.describe(include='all'),
        'X_test': x_test.describe(include='all')
    }

    # Output the descriptions
    for name, df_info in dataframes_info.items():
        print(f'{name} Summary:\n', df_info)
        print(f'{name} Missing Values:\n', df_info.isnull().sum())
    x_train.to_csv('filled_x_train.csv', index=False)
    y_train.to_csv('filled_y_train.csv', index=False)
    x_test.to_csv('filled_x_test.csv', index=False)
    return x_train, y_train, x_test

#Paths to the files
x_train_path = "/content/drive/MyDrive/ensemble/project/X_train_NHkHMNU.csv"
y_train_path = "/content/drive/MyDrive/ensemble/project/y_train_ZAN5mwg.csv"
x_test_path = "/content/drive/MyDrive/ensemble/project/X_test_final.csv"


#Loading datasets
x_train, y_train, x_test = load_datasets(x_train_path, y_train_path, x_test_path)


X_train Summary:
                  ID       DAY_ID COUNTRY  DE_CONSUMPTION  FR_CONSUMPTION  \
count   1494.000000  1494.000000    1494     1494.000000     1494.000000   
unique          NaN          NaN       2             NaN             NaN   
top             NaN          NaN      FR             NaN             NaN   
freq            NaN          NaN     851             NaN             NaN   
mean    1072.759036   591.861446     NaN        0.427442       -0.020032   
std      618.013179   345.065043     NaN        0.673412        0.918995   
min        0.000000     0.000000     NaN       -2.265563       -1.462350   
25%      540.250000   292.250000     NaN       -0.037421       -0.716771   
50%     1077.500000   591.000000     NaN        0.357061       -0.394166   
75%     1597.500000   885.750000     NaN        0.922057        0.650533   
max     2146.000000  1215.000000     NaN        2.033851        3.300640   

        DE_FR_EXCHANGE  FR_DE_EXCHANGE  DE_NET_EXPORT  FR_NET_EXPORT 

In [3]:
from sklearn.model_selection import train_test_split
# Split dataset into train and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

class Boosting:
    def __init__(self, base_models):
        self.base_models = base_models
        self.models = []

    def fit(self, X, y):
        for base_model in self.base_models:
            model = base_model
            # Using only the second column of y
            model.fit(X, y.iloc[:, 1])
            self.models.append(model)

    def predict(self, X):
        predictions = np.zeros((len(X), len(self.models)))
        for i, model in enumerate(self.models):
            predictions[:, i] = model.predict(X)
        # sum of predictions for regression
        return np.sum(predictions, axis=1)


In [5]:
# Drop the original 'country' column from X_train and X_val
X_train.drop(columns=['COUNTRY'], inplace=True)
X_val.drop(columns=['COUNTRY'], inplace=True)

In [6]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

Applying grid search method to find best parameters

In [69]:
from sklearn.model_selection import GridSearchCV

# parameter grid for RandomForestRegressor
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [ 5,10],
    'min_samples_leaf': [1, 2, 4]
}

#  parameter grid for DecisionTreeRegressor
param_grid_dt = {
    'max_depth': [None, 10],
    'min_samples_split': [ 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Performing GridSearch on RandomForestRegressor
grid_search_rf = GridSearchCV(RandomForestRegressor(), param_grid_rf, cv=5, scoring='neg_mean_squared_error')
grid_search_rf.fit(X_train, Y_train.iloc[:, 1])

# Perform GridSearch on DecisionTreeRegressor
grid_search_dt = GridSearchCV(DecisionTreeRegressor(), param_grid_dt, cv=5, scoring='neg_mean_squared_error')
grid_search_dt.fit(X_train, Y_train.iloc[:, 1])

# Best parameters and best score for RandomForestRegressor
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_

# Best parameters and best score for DecisionTreeRegressor
best_params_dt = grid_search_dt.best_params_
best_score_dt = grid_search_dt.best_score_

print("Best parameters for RandomForestRegressor:", best_params_rf)
print("Best score for RandomForestRegressor:", best_score_rf)
print("Best parameters for DecisionTreeRegressor:", best_params_dt)
print("Best score for DecisionTreeRegressor:", best_score_dt)


Best parameters for RandomForestRegressor: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best score for RandomForestRegressor: -1.0427244029427645
Best parameters for DecisionTreeRegressor: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best score for DecisionTreeRegressor: -1.234198844075998


In [70]:
#rf_max_depth = 10
#dt_max_depth = 5
rf_model = RandomForestRegressor(**best_params_rf)
dt_model = DecisionTreeRegressor(**best_params_dt)


#Combining best parameters
base_models = [rf_model, dt_model]

In [71]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

boosting = Boosting(base_models=base_models)

# Train the ensemble model obtained using randomforest and decision tree
boosting.fit(X_train, Y_train)
y_pred_train = boosting.predict(X_train)

# Make predictions on the validation set
y_pred_val = boosting.predict(X_val)

# Mean Absolute Error (MAE) on training data
train_mae = mean_absolute_error(Y_train.iloc[:, 1], y_pred_train)

print("Training Mean Absolute Error:", train_mae)
# Mean Squared Error (MSE)  on training data
train_mse = mean_squared_error(Y_train.iloc[:, 1], y_pred_train)
print("Training Mean Squared Error:", train_mse)
#Mean Absolute Error (MAE) on validation data
val_mae = mean_absolute_error(Y_val.iloc[:, 1], y_pred_val)

print("Validation Mean Absolute Error:", val_mae)
# Mean Squared Error (MSE) on validation data
val_mse = mean_squared_error(Y_val.iloc[:, 1], y_pred_val)
print("Validation Mean Squared Error:", val_mse)



Training Mean Absolute Error: 0.44552083233946854
Training Mean Squared Error: 0.5091368284902642
Validation Mean Absolute Error: 0.8217926138335472
Validation Mean Squared Error: 1.8255792342578128


Finding cross validation scores

In [72]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import cross_val_score
class BoostingWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, base_models):
        self.base_models = base_models
        self.models = []

    def fit(self, X, y):
        for base_model in self.base_models:
            model = base_model
            model.fit(X, y)
            self.models.append(model)

    def predict(self, X):
        predictions = np.zeros((len(X), len(self.models)))
        for i, model in enumerate(self.models):
            predictions[:, i] = model.predict(X)
        return np.sum(predictions, axis=1)

X_full_train = pd.concat([X_train, X_val])
Y_full_train = pd.concat([Y_train, Y_val])

# Using BoostingWrapper for cross-validation
boosting_wrapper = BoostingWrapper(base_models)

# Perform cross-validation
cv_scores = cross_val_score(boosting_wrapper, X_full_train, Y_full_train.iloc[:, 1], cv=5, scoring='neg_mean_absolute_error')

# Convert scores to positive since cross_val_score returns negative values for MAE
cv_scores = -cv_scores

# Print the cross-validation scores
print("Cross-Validation Scores:", cv_scores)

# Calculate the mean cross-validation score
mean_cv_score = cv_scores.mean()
print("Mean Cross-Validation Score:", mean_cv_score)


Cross-Validation Scores: [0.79396484 0.74909851 0.80145244 0.76595311 0.81393416]
Mean Cross-Validation Score: 0.7848806117580791


In [75]:
from sklearn.model_selection import KFold

# Define the number of folds
n_folds = 5

# Initialize KFold object
kf = KFold(n_splits=n_folds)

# Initialize an empty list to store cross-validation scores
cv_scores_val = []

# Loop over the folds
for train_index, val_index in kf.split(X_val):
    # No need to split the validation data further, use X_val and Y_val directly
    X_val_fold, Y_val_fold = X_val.iloc[val_index], Y_val.iloc[val_index]

    # Train the model on the training data
    boosting.fit(X_train, Y_train)

    # Make predictions on the validation fold
    y_pred_val_fold = boosting.predict(X_val_fold)

    # Calculate mean absolute error on the validation fold
    val_mae_fold = mean_absolute_error(Y_val_fold.iloc[:, 1], y_pred_val_fold)

    # Append the validation fold score to the list of cross-validation scores
    cv_scores_val.append(val_mae_fold)

# Convert scores to positive since cross_val_score returns negative values for MAE
cv_scores_val = np.array(cv_scores_val)

# Print the cross-validation scores on validation data
print("Cross-Validation Scores on Validation Data:", cv_scores_val)

# Calculate the mean cross-validation score on validation data
mean_cv_score_val = cv_scores_val.mean()
print("Mean Cross-Validation Score on Validation Data:", mean_cv_score_val)


Cross-Validation Scores on Validation Data: [1.38124976 1.64341639 2.22874624 2.66154269 3.21543169]
Mean Cross-Validation Score on Validation Data: 2.2260773548611015


In [73]:
from scipy.stats import spearmanr

# Compute Spearman's correlation coefficient for training data
#spearman_corr_train, _ = spearmanr(Y_train.iloc[:, 1], y_pred_train)


spearman_coefficient, _ = spearmanr(Y_full_train.iloc[:, 1], boosting_wrapper.predict(X_full_train))
print("Spearman's correlation coefficient for training data:", spearman_coefficient)
# Compute Spearman's correlation coefficient for validation data
spearman_corr_val, _ = spearmanr(Y_val.iloc[:, 1], y_pred_val)
print("Spearman's correlation coefficient for validation data:", spearman_corr_val)


Spearman's correlation coefficient for training data: 0.7449966118576165
Spearman's correlation coefficient for validation data: 0.2150214361069336


Predicting on test data

In [None]:

x_test.drop(columns=['COUNTRY'], inplace=True)
# Predictions on the test data
y_pred_test = boosting.predict(x_test)

# Saving predictions to a CSV file
pd.DataFrame(y_pred_test, columns=['predicted_price_variation']).to_csv("predicted_price_variation.csv", index=False)
