# Fill missing values with XGBoost

In this Jupyter notebook, XGBoost is used as a data imputer to see whether it is relevant for filling missing data.

In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', None)
import xgboost as xgb

from time import time

## Loading the Dataset and features preparation

### Dataset loading

Loading the CSV file of the dataset.

In [None]:
# Load the PJME hourly dataset; the relative path is resolved against the working directory.
original_data = pd.read_csv("PJME_hourly.csv")

Rename the columns of the dataset to make them simpler to manipulate, and sort the rows by date.

In [None]:
# Parse the timestamps, then switch to short column names and chronological order.
original_data["Datetime"] = pd.to_datetime(original_data["Datetime"])
original_data = original_data.rename(columns={"Datetime": "date", "PJME_MW": "out"})
original_data = original_data.sort_values("date", ascending=True, ignore_index=True)

Remove duplicated values in the dataset (when two values share the same date, the first one is kept).

In [None]:
# Index by timestamp so that duplicated hours can be detected on the index.
original_data.set_index('date', inplace=True)

# All rows whose timestamp occurs more than once (every occurrence), kept in a
# variable for inspection. The previous bare pd.concat(...) expression was
# neither displayed nor stored, so its result was simply thrown away.
duplicated_rows = original_data[original_data.index.duplicated(keep=False)]

# Keep the first value of every duplicated timestamp, then restore 'date' as a column.
original_data = original_data[~original_data.index.duplicated(keep='first')]
original_data.reset_index(inplace=True)

Set the date as index.

In [None]:
# Separate frame indexed by timestamp; original_data keeps its 'date' column.
data_features = original_data.set_index('date')

### Features extraction

Extraction of the features from the date.

In [None]:
def get_features(df):
    """Return a copy of ``df`` with calendar columns derived from its index.

    ``df`` must be indexed by a DatetimeIndex. The input frame is not modified.
    The columns written are hour, day, month, year, quarter, dayofyear,
    dayofmonth (same values as day) and weekofyear (ISO week, as int64).
    """
    enriched = df.copy()
    stamps = enriched.index

    # Insertion order of this dict is the order in which columns are written.
    calendar_parts = {
        "hour": stamps.hour,
        "day": stamps.day,
        "month": stamps.month,
        "year": stamps.year,
        "quarter": stamps.quarter,
        "dayofyear": stamps.dayofyear,
        "dayofmonth": stamps.day,
    }
    for column, values in calendar_parts.items():
        enriched[column] = values

    iso_week = stamps.isocalendar().week
    enriched["weekofyear"] = iso_week.astype(np.int64)
    return enriched

# Add the calendar columns derived from the timestamp index.
data_features = get_features(data_features)

### Adding the N previous points for each point

For each point, the data of the N previous points is added to improve the prediction results (see the first experiment on XGBoost prediction).

In [None]:
def get_colums_names(column_names, N):
    """Build the column labels of a frame holding N lagged copies of the columns.

    The result lists every name suffixed with N, then with N-1, ... down to 1,
    and finally the unsuffixed names themselves.
    """
    base_names = list(column_names)
    lagged_names = [
        base + str(lag)
        for lag in range(N, 0, -1)
        for base in base_names
    ]
    return lagged_names + base_names

# 'date' has to be a regular column here so that it is lagged like the others.
data_features.reset_index(inplace=True)
all_available_features = list(data_features.columns)

N = 2  # Number of previous points attached to every row
data_multiple = data_features.copy()

# Each pass drops the last row of the frame built so far and places next to it
# the original rows starting `shift` positions later. After N passes a row
# holds N + 1 consecutive points, oldest on the left.
for shift in range(1, N + 1):
    older_points = data_multiple.iloc[:-1].reset_index(drop=True)
    newer_points = data_features.iloc[shift:].reset_index(drop=True)
    data_multiple = pd.concat([older_points, newer_points], axis=1)

# Oldest copy gets suffix N, the most recent copy keeps the plain names.
data_multiple.columns = get_colums_names(all_available_features, N)
data_multiple.set_index("date", inplace=True)

data_features.set_index("date", inplace=True)

### Training features

Creation of the training features list.

In [None]:
# Every column of the lagged frame: plain names plus their suffixed lag copies.
all_features = data_multiple.columns

# Base names accepted as model inputs; suffixed variants are matched against
# these names by is_training_feature.
training_features_list = ['hour', 'day', 'month', 'year', 'quarter', 'dayofyear',
       'dayofmonth', 'weekofyear', 'out']

def is_training_feature(feature, training_features):
    """Tell whether column ``feature`` may be used as a model input.

    A column qualifies when it is one of ``training_features``, optionally
    followed by a purely numeric suffix (the lag number). The plain "out"
    column is always rejected, since it is the prediction target.
    """
    if feature == "out":
        return False

    for base in training_features:
        if not feature.startswith(base):
            continue
        suffix = feature[len(base):]
        if suffix == "" or suffix.isnumeric():
            return True
    return False

# Columns accepted by is_training_feature, in their original column order.
training_features = [
    column
    for column in all_features
    if is_training_feature(column, training_features_list)
]

target = "out"

## Missing data imputation

In this part we generate random missing data in the dataset and try to fill those using XGBoost.

### Missing data generation

Getting the range of all the dates that should be in the dataset.

In [None]:
# First and last timestamps of the lagged frame, read from its index.
start_date = data_multiple.index[0]
end_date = data_multiple.index[-1]

# Every hourly timestamp between the two bounds, both included.
dateRange = pd.date_range(start=start_date, end=end_date, freq='1h')
# dateRange

In [None]:
# Independent copy, so that removing rows from it leaves data_multiple intact.
data_gap = data_multiple.copy()

Create missing data by removing random rows.

In [None]:
# One-hour step used to convert row offsets into timestamps.
h = pd.Timedelta("1h")
# Number of rows available before any removal.
row_number = len(data_gap)

def index_to_date(index_list):
    """Convert hour offsets into timestamps.

    Each offset is multiplied by the module-level step ``h`` and added to the
    module-level ``start_date``. A new list is returned; the argument is left
    untouched.
    """
    return [start_date + offset * h for offset in list(index_list)]

# Number of random draws; positions are drawn with replacement, so the number
# of distinct rows actually removed can be lower.
N_missing_data = 30000

# Draw positions from N upwards (the same lower bound seg_to_index uses) so
# that the first rows are never removed. The filling loop rebuilds a missing
# row from the row one hour earlier; if the very first row were dropped there
# would be nothing to rebuild it from and the prediction step would fail.
index = index_to_date(np.random.randint(N, row_number, size=N_missing_data))
# errors='ignore': some computed timestamps may not exist in the frame.
data_gap.drop(index=index, inplace=True, errors='ignore')

data_gap_filled = data_gap.copy()

### Training the model

Partition generation.

In [None]:
from sklearn.model_selection import train_test_split

# shuffle=False keeps the row order, so `test` is the trailing part of
# data_multiple. Note that the split is made on the complete frame, not on the
# frame with gaps that is used for fitting below.
train, test = train_test_split(data_multiple, shuffle=False)
# train

Creating and training the model.

In [None]:
# Hyper-parameters forwarded to the regressor.
args = {"n_estimators": 1200, "learning_rate": 0.01}

# NOTE(review): recent XGBoost releases deprecate tree_method="gpu_hist" in
# favour of tree_method="hist" with device="cuda" - confirm against the
# installed version.
reg = xgb.XGBRegressor(tree_method="gpu_hist", **args)

t = time()
# The rows used for fitting are also passed as the evaluation set, so the
# metric printed every 100 rounds is a training metric.
fit_inputs = data_gap_filled[training_features]
fit_target = data_gap_filled[target]
reg.fit(fit_inputs, fit_target, eval_set=[(fit_inputs, fit_target)], verbose=100)
print(time() - t)

### Missing data filling

Fill the missing data using the XGBoost model.

In [None]:
import re

# Maps every lag-suffixed column to the column one step closer to the present
# (e.g. "out2" -> "out1", "out1" -> "out"). "date1" is left out because "date"
# is the index of the frame rather than a column.
mapping_dict = {
    feature + str(lag): (feature + str(lag - 1) if lag > 1 else feature)
    for feature in all_available_features
    for lag in range(N, 0, -1)
    if not (feature == "date" and lag == 1)
}
# mapping_dict

In [None]:
t = time()

# Timestamps expected in the hourly range but absent from the frame, in
# chronological order.
missing_dates = dateRange.difference(data_gap_filled.index)
known_dates = data_gap_filled.index

# Generated rows are collected in a list and concatenated once after the loop.
# Calling pd.concat on a growing frame at every iteration copies all earlier
# rows each time, which is quadratic in the number of gaps.
filled_rows = []
previous_filled = None  # last generated row, needed inside multi-hour gaps

for dt in missing_dates:
    previous_date = dt - h
    if previous_date in known_dates:
        last_row = data_gap_filled.loc[previous_date:previous_date]
    elif previous_filled is not None and previous_filled.index[0] == previous_date:
        last_row = previous_filled
    else:
        # Without the row one hour earlier there is nothing to build from.
        raise ValueError(f"No row available at {previous_date} to fill {dt}")

    new_row = last_row.copy()

    # Shift every lag column by one step, always reading from the untouched
    # previous row so the result does not depend on the iteration order.
    for k in mapping_dict:
        new_row[k] = last_row[mapping_dict[k]]

    # The previous timestamp becomes lag 1, then the row moves one hour ahead.
    new_row['date1'] = new_row.index
    new_row.index += h
    new_row = get_features(new_row)
    new_row['out'] = reg.predict(new_row[training_features])[0]

    filled_rows.append(new_row)
    previous_filled = new_row

# new_df holds only the generated rows (empty, with the same columns, if none).
new_df = pd.concat(filled_rows) if filled_rows else data_gap_filled.iloc[0:0].copy()
data_gap_filled = pd.concat([data_gap_filled, new_df])

print(time() - t)
data_gap_filled.loc[data_gap_filled['out'] == 0]

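Score of the model on the training and testing partitions. Note that `reg.score` returns the coefficient of determination (R²); the RMSE (Root Mean Square Error) has to be computed separately from the predictions.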

In [None]:
preds_train = reg.predict(train[training_features])
preds_test = reg.predict(test[training_features])

# reg.score reports the coefficient of determination (R^2), not an error.
# NOTE: train and test are both parts of data_multiple, and the model was
# fitted on rows taken from that same frame, so neither figure is a true
# held-out score.
print("Training score:", reg.score(train[training_features], train[target]))
print("Testing score:", reg.score(test[training_features], test[target]))

# RMSE computed from the predictions above (previously they were never used).
rmse_train = np.sqrt(np.mean((train[target] - preds_train) ** 2))
rmse_test = np.sqrt(np.mean((test[target] - preds_test) ** 2))
print("Training RMSE:", rmse_train)
print("Testing RMSE:", rmse_test)

Visualizing the model performance.

In [None]:
# Reindex onto a regular hourly grid; hours still absent from the index become NaN rows.
data_gap_filled = data_gap_filled.asfreq('1h')

In [None]:
period = '2018 05'

# Rows of the selected month in the filled frame and in the complete frame.
filled_month = data_gap_filled.loc[period]
reference_month = data_multiple.loc[period]

fig, ax = plt.subplots(figsize=(15, 5))
ax.set_title('Testing Data/Missing value generated')

# Filled series in blue, reference series in red, in this order for the legend.
ax.plot(filled_month.index, filled_month['out'], alpha=0.7, color="blue")
ax.plot(reference_month.index, reference_month['out'], alpha=0.7, color="red")
ax.legend(['Prediction', 'Testing Set'])

plt.show()

## Missing gaps

Previously, the missing data were created by removing rows at random indices, but in a real dataset the missing values may form several contiguous segments. So now the positions are still random, but each one starts a segment of random size.

Creating the partition for the training and testing.

### Gap generation

Generating segment gaps

In [None]:
# Start again from a fresh copy of the complete lagged frame.
data_gap = data_multiple.copy()
data_gap

# Number of rows available before any removal.
row_number = len(data_gap)

def seg_to_index(N_seg, Min, Max):
    """Return the row offsets covered by ``N_seg`` random contiguous segments.

    Each segment has a random length in ``[Min, Max)`` and a random start that
    is at least the module-level ``N``. Segments are kept inside
    ``[N, row_number)`` (``row_number`` is module-level as well): the start is
    drawn so the whole segment fits, and a segment longer than the available
    range is truncated. Segments may overlap, so offsets can be repeated.
    """
    index_list = []

    for _ in range(N_seg):
        size = np.random.randint(Min, Max)
        # Latest start that still keeps the whole segment inside the data;
        # previously a segment could run past the last row.
        last_start = max(row_number - size, N)
        start = np.random.randint(N, last_start + 1)
        stop = min(start + size, row_number)
        index_list.extend(range(start, stop))

    return index_list

# Number of segments to remove and bounds of their length, in rows
# (the upper bound is passed to np.random.randint, hence exclusive).
N_missing_segment = 30
Min_size_segment = 50
Max_size_segment = 300

# Row offsets of every removed point, then the matching timestamps.
index = seg_to_index(N_missing_segment, Min_size_segment, Max_size_segment)
date_index = index_to_date(index)
# errors='ignore': timestamps that are not in the frame are skipped silently.
data_gap.drop(index=date_index, inplace=True, errors='ignore')

# Working copy that will receive the predicted rows.
data_gap_filled = data_gap.copy()
data_gap_filled
data_gap_filled.info()

### Training the model

In [None]:
from time import time

# Hyper-parameters forwarded to the regressor.
args = {
    "n_estimators": 1000,
    "base_score": 0.5,
    "max_depth": 6,
    "learning_rate": 0.01,
}

# NOTE(review): recent XGBoost releases deprecate tree_method="gpu_hist" in
# favour of tree_method="hist" with device="cuda" - confirm against the
# installed version.
reg = xgb.XGBRegressor(tree_method="gpu_hist", **args)

t = time()
# The rows used for fitting are also passed as the evaluation set, so the
# metric printed every 100 rounds is a training metric.
fit_inputs = data_gap_filled[training_features]
fit_target = data_gap_filled[target]
reg.fit(fit_inputs, fit_target, eval_set=[(fit_inputs, fit_target)], verbose=100)
print(time() - t)

### Predict the missing data

In [None]:
import re

# Maps every lag-suffixed column to the column one step closer to the present
# (e.g. "out2" -> "out1", "out1" -> "out"). "date1" is left out because "date"
# is the index of the frame rather than a column.
mapping_dict = {
    feature + str(lag): (feature + str(lag - 1) if lag > 1 else feature)
    for feature in all_available_features
    for lag in range(N, 0, -1)
    if not (feature == "date" and lag == 1)
}
# mapping_dict

In [None]:
# First and last timestamps of the complete lagged frame, read from its index.
startDate = data_multiple.index[0]
endDate = data_multiple.index[-1]

# Every hourly timestamp between the two bounds, both included.
dateRange = pd.date_range(start=startDate, end=endDate, freq='1h')

# Timestamps of the range that are absent from the frame with gaps.
dateRange.difference(data_gap_filled.index)

In [None]:
h = pd.Timedelta("1h")

from time import time
t = time()

# Timestamps expected in the hourly range but absent from the frame, in
# chronological order.
missing_dates = dateRange.difference(data_gap_filled.index)
known_dates = data_gap_filled.index

# Generated rows are collected in a list and concatenated once after the loop.
# Calling pd.concat on a growing frame at every iteration copies all earlier
# rows each time, which is quadratic in the number of gaps.
filled_rows = []
previous_filled = None  # last generated row, needed inside multi-hour gaps

for dt in missing_dates:
    previous_date = dt - h
    if previous_date in known_dates:
        last_row = data_gap_filled.loc[previous_date:previous_date]
    elif previous_filled is not None and previous_filled.index[0] == previous_date:
        last_row = previous_filled
    else:
        # Without the row one hour earlier there is nothing to build from.
        raise ValueError(f"No row available at {previous_date} to fill {dt}")

    new_row = last_row.copy()

    # Shift every lag column by one step, reading from the previous row.
    for k in mapping_dict:
        new_row[k] = last_row[mapping_dict[k]]

    # The previous timestamp becomes lag 1, then the row moves one hour ahead.
    new_row['date1'] = new_row.index
    new_row.index += h
    new_row = get_features(new_row)
    new_row['out'] = reg.predict(new_row[training_features])[0]

    filled_rows.append(new_row)
    previous_filled = new_row

# new_df holds only the generated rows (empty, with the same columns, if none).
new_df = pd.concat(filled_rows) if filled_rows else data_gap_filled.iloc[0:0].copy()
data_gap_filled = pd.concat([data_gap_filled, new_df])

print(time() - t)
data_gap_filled.loc[data_gap_filled['out'] == 0]

In [None]:
# Reindex onto a regular hourly grid, then make sure the index is in chronological order.
data_gap_filled = data_gap_filled.asfreq('1h')
data_gap_filled.sort_index(inplace=True)

date_index contains dates where the data has been removed by a segment.

In [None]:
# First 120 removed timestamps, in the order they were generated.
period = date_index[0:120]

# Same timestamps looked up in the filled frame and in the complete frame.
filled_part = data_gap_filled.loc[period]
reference_part = data_multiple.loc[period]

fig, ax = plt.subplots(2, 1, figsize=(15, 10))
top_axis, bottom_axis = ax[0], ax[1]

# Upper panel: predicted values (green) against the original values (red).
top_axis.set_title('Testing Data/Missing value generated')
top_axis.plot(filled_part.index, filled_part['out'], alpha=0.7, color="green")
top_axis.plot(reference_part.index, reference_part['out'], alpha=0.7, color="red")
top_axis.legend(['Prediction', 'Dataset'])

# Lower panel: absolute difference between prediction and original value.
absolute_gap = np.abs(filled_part['out'] - reference_part['out'])
bottom_axis.set_title('gap between pred and real value')
bottom_axis.plot(filled_part.index, absolute_gap)
bottom_axis.legend(['gap length'])

plt.show()