# Ensemble learning: QRT Challenge
# Data cleaning

## Load libraries

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from prophet.diagnostics import cross_validation
from prophet import Prophet
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from sklearn.impute import KNNImputer

## Load files

In [29]:
# Google Drive access (Google Colab only): the import below is provided by the
# Colab runtime, and the call mounts the drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
# Folder (on the mounted Google Drive) that holds the challenge CSV files.
# Change this single constant to read the data from another location.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/ensemble_learning"

# Training features, test features and training target, all read from DATA_DIR.
X_train = pd.read_csv(f"{DATA_DIR}/X_train_NHkHMNU.csv")
X_test = pd.read_csv(f"{DATA_DIR}/X_test_final.csv")
y_train = pd.read_csv(f"{DATA_DIR}/y_train_ZAN5mwg.csv")

## Data cleaning

In [31]:
# Missing values: percentage of null entries per column.
# Each dataset is normalised by its OWN number of rows (the test percentages
# were previously divided by the length of the training set).
missing_values_count_train = X_train.isnull().sum()/len(X_train)*100
missing_values_count_test = X_test.isnull().sum()/len(X_test)*100

print("Missing Values per Variable in %:")
print(missing_values_count_train)

Missing Values per Variable in %:
ID                  0.000000
DAY_ID              0.000000
COUNTRY             0.000000
DE_CONSUMPTION      0.000000
FR_CONSUMPTION      0.000000
DE_FR_EXCHANGE      1.673360
FR_DE_EXCHANGE      1.673360
DE_NET_EXPORT       8.299866
FR_NET_EXPORT       4.685408
DE_NET_IMPORT       8.299866
FR_NET_IMPORT       4.685408
DE_GAS              0.000000
FR_GAS              0.000000
DE_COAL             0.000000
FR_COAL             0.000000
DE_HYDRO            0.000000
FR_HYDRO            0.000000
DE_NUCLEAR          0.000000
FR_NUCLEAR          0.000000
DE_SOLAR            0.000000
FR_SOLAR            0.000000
DE_WINDPOW          0.000000
FR_WINDPOW          0.000000
DE_LIGNITE          0.000000
DE_RESIDUAL_LOAD    0.000000
FR_RESIDUAL_LOAD    0.000000
DE_RAIN             6.291834
FR_RAIN             6.291834
DE_WIND             6.291834
FR_WIND             6.291834
DE_TEMP             6.291834
FR_TEMP             6.291834
GAS_RET             0.000000
COAL_RET 

In [32]:
def _find_high_corr_columns(corr_matrix, threshold):
    """Pick the columns that are redundant with an earlier column.

    Walks the lower triangle of ``corr_matrix``: column ``i`` is selected when
    its absolute correlation with some earlier column ``j`` is above
    ``threshold`` and column ``j`` has not itself been selected. Each column
    is reported at most once, in matrix order.
    """
    names = list(corr_matrix.columns)
    values = corr_matrix.to_numpy()
    selected = []
    for i, name in enumerate(names):
        # NaN correlations (e.g. constant columns) compare False and are ignored.
        if any(abs(values[i, j]) > threshold and names[j] not in selected
               for j in range(i)):
            selected.append(name)
    return selected


def preprocessing(df, value, corr_threshold=0.9):
    """Clean a feature table and return the result as a new DataFrame.

    Steps, in order:
      1. impute missing values with ``KNNImputer(n_neighbors=5)``, fitted on
         the columns of ``df`` that contain at least one null;
      2. replace ``COUNTRY`` with the 0/1 indicators ``FRANCE`` / ``DEUTSCHLAND``;
      3. drop every column whose absolute correlation with an earlier column
         exceeds ``corr_threshold``;
      4. add a ``DATE`` column equal to 2020-01-01 plus ``DAY_ID - 1`` days.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ID``, ``DAY_ID`` and ``COUNTRY``.
        It is NOT modified; all work happens on a copy.
    value : str
        ``'train'``: correlations are computed on ``df`` inner-joined with the
        notebook-level ``y_train`` on ``ID`` (skipped if ``y_train`` is None).
        ``'test'``: correlations are computed on ``df`` alone.
        Any other value skips the correlation filter.
    corr_threshold : float, default 0.9
        Absolute-correlation level above which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy of ``df``.

    Notes
    -----
    NOTE(review): the imputer is fitted, and the drop list derived, separately
    for each call, so the 'train' and 'test' outputs are not guaranteed to
    contain the same columns. Confirm downstream code aligns them.
    """
    # Work on a copy so the caller's frame keeps its raw values and columns.
    df = df.copy()

    ## FIND COLUMNS WITH MISSING VALUES
    columns_with_missing_values = df.columns[df.isnull().any()].tolist()

    ## IMPUTE MISSING VALUES WITH KNNIMPUTER
    # Skipped when nothing is missing: fitting on an empty column selection fails.
    if columns_with_missing_values:
        knn_imputer = KNNImputer(n_neighbors=5)
        df[columns_with_missing_values] = knn_imputer.fit_transform(df[columns_with_missing_values])

    ## DUMMY ENCODING
    df['FRANCE'] = (df['COUNTRY'] == 'FR').astype('int64')
    df['DEUTSCHLAND'] = (df['COUNTRY'] == 'DE').astype('int64')
    df = df.drop(columns=['COUNTRY'])

    ## DERIVE DATES
    # Computed before the correlation filter because DAY_ID itself may be
    # dropped there; the column is attached at the very end.
    start_date = pd.to_datetime('2020-01-01')
    dates = start_date + pd.to_timedelta(df['DAY_ID'] - 1, unit='D')

    ## DROP HIGH CORRELATION COLUMNS
    corr_matrix = None
    if value == 'train' and y_train is not None:
        corr_matrix = df.merge(y_train, how='inner', on='ID').corr()
    elif value == 'test':
        corr_matrix = df.corr()

    if corr_matrix is not None:
        high_corr_columns = _find_high_corr_columns(corr_matrix, corr_threshold)
        # In the 'train' case the matrix also holds the y_train columns, which
        # do not exist in df; only real feature columns may be dropped.
        df = df.drop(columns=[col for col in high_corr_columns if col in df.columns])

    ## ADD DATE COLUMN
    df['DATE'] = dates

    return df

# Clean both splits; `value` selects which correlation branch runs inside preprocessing.
X_train_cleaned = preprocessing(df=X_train, value='train')
X_test_cleaned = preprocessing(df=X_test, value='test')

## New CSVs after cleaning data

In [34]:
#Save cleaned datasets to new CSV files locally
X_train_cleaned.to_csv("X_train_cleaned.csv", index=False)
X_test_cleaned.to_csv("X_test_cleaned.csv", index=False)
y_train.to_csv("y_train.csv", index=False)