In [1]:
# Register the warning filter BEFORE the third-party imports. A filter that is
# installed after an import cannot silence warnings raised while that import
# runs; the saved output of this cell shows such an import-time warning
# leaking through (presumably from xgboost's pandas-compat import).
# NOTE(review): a blanket 'ignore' hides every warning for the rest of the
# session, including ones that may matter; consider narrowing by category.
import warnings
warnings.filterwarnings('ignore')

import sys
from datetime import datetime, date

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

sys.path.append("..") # in order to be able to use modules in src package

from src.data.load import load_data
from src.data.format import undummify, train_test, output_format
from src.metrics.error import apply_metrics

  from pandas import MultiIndex, Int64Index


## Data preprocessing

In [2]:
# Load the dataset through the project's loader (src.data.load.load_data) and
# display it. The rendered output shows a Year column, a Count column and many
# numeric / indicator columns (Age*, Sex*, Country*_...).
df = load_data()
df

Unnamed: 0,Year,Count,Age0,Age1,Age2,Age3,Age4,Sex0,Sex1,Sex2,...,Country4_Prussia (Poland),Country4_Russia,Country4_Russian Empire (Ukraine),Country4_South Africa,Country4_Spain,Country4_Sweden,Country4_Switzerland,Country4_United Kingdom,Country4_United States of America,Country4_W&uuml;rttemberg (Germany)
0,1901,1,47.0,47.0,47.0,47.0,47.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,1902,1,45.0,45.0,45.0,45.0,45.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1903,1,42.0,42.0,42.0,42.0,42.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1904,1,55.0,55.0,55.0,55.0,55.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
4,1905,1,61.0,61.0,61.0,61.0,61.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,2012,3,79.0,50.0,50.0,79.0,50.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
103,2013,5,63.0,64.0,64.0,57.0,57.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
104,2014,3,75.0,51.0,52.0,75.0,51.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
105,2015,3,85.0,80.0,84.0,85.0,80.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


## Model

In [3]:
# Split the frame into inputs (x) and targets (y) for fitting and evaluation.
# NOTE(review): how train_test pairs inputs with targets and where it places
# the train/test boundary is defined in src.data.format — confirm there.
train_x, train_y, test_x, test_y = train_test(df)

In [None]:
# MultiOutputRegressor wraps a single-target estimator and fits one copy of it
# per target column, so each output gets its own gradient-boosted model.
# Hyperparameters are listed one per line so they are easy to scan and tune.
model = MultiOutputRegressor(
    XGBRegressor(
        objective='reg:squarederror',
        max_depth=5,
        learning_rate=0.05,
        n_estimators=500,
    )
)
model.fit(train_x, train_y)

In [None]:
# Predict the targets for the held-out inputs; y_hat is compared against
# test_y in the Errors section.
y_hat = model.predict(test_x)

## Errors

In [None]:
# Score the test-set predictions against the true targets with the project's
# metric helper and display the result.
# NOTE(review): df.columns is passed along, presumably to label per-column
# results — confirm in src.metrics.error how the columns are used.
errors = apply_metrics(y_hat, test_y, df.columns)
errors

## Predictions

In [None]:
# Show the ten most recent rows (all columns) — the rows the Predictions
# section uses as model input.
df.tail(10)

In [None]:
# Build the model input for the Predictions section: drop the leading column
# (presumably Year, per the table rendered above — confirm), keep the last ten
# rows, and convert to a plain NumPy array.
hist = df.iloc[:, 1:].tail(10).to_numpy()
hist

In [None]:
# Run the fitted model on the `hist` rows and convert the raw prediction
# array into a labelled frame with the project's output_format helper.
preds = output_format(model.predict(hist), df.columns)

# Label the rows with consecutive years starting at 2017. The range is sized
# from len(preds) rather than hard-coded to ten years, so the labels stay in
# step with the number of rows in `hist`; a fixed range would make insert()
# raise on a length mismatch as soon as the `hist` slice changes.
# NOTE(review): the data shown above ends in 2015 while labels start at 2017,
# and each row is predicted from a single historical row — confirm the
# intended year alignment against train_test in src.data.format.
preds.insert(0, 'Year', list(range(2017, 2017 + len(preds))))
preds