In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, date
from dateutil.relativedelta import relativedelta
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, accuracy_score

import sys
# Append the parent directory to the module search path so the `src` package
# imported below can be found. The relative entry ".." is resolved against the
# kernel's current working directory.
sys.path.append("..") # in order to be able to use modules in src package

from src.data.load import load_data
from src.data.format import undummify, train_test, output_format
from src.metrics.error import apply_metrics

import warnings
# Install a blanket 'ignore' filter for warnings raised from this point on.
# The third-party imports earlier in this cell have already executed, so any
# warnings they emit at import time are not suppressed by this line.
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


## Data preprocessing

In [2]:
# Load the dataset for the 'Literature' category through the project helper
# `load_data`, then display it as the cell output.
# NOTE(review): the exact schema is defined in src.data.load -- confirm there.
df = load_data(category='Literature')
df

Unnamed: 0,Year,Count,Age0,Age1,Age2,Age3,Age4,Sex0,Sex1,Sex2,...,Country4_Spain,Country4_Sweden,Country4_Switzerland,Country4_Trinidad,Country4_Turkey,Country4_Tuscany (Italy),Country4_Ukraine,Country4_Union of Soviet Socialist Republics (Russia),Country4_United Kingdom,Country4_United States of America
0,1901,1,62.0,62.0,62.0,62.0,62.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1902,1,85.0,85.0,85.0,85.0,85.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1903,1,71.0,71.0,71.0,71.0,71.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1904,2,74.0,72.0,74.0,72.0,74.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1905,1,59.0,59.0,59.0,59.0,59.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,2012,1,57.0,57.0,57.0,57.0,57.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
105,2013,1,82.0,82.0,82.0,82.0,82.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
106,2014,1,69.0,69.0,69.0,69.0,69.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
107,2015,1,67.0,67.0,67.0,67.0,67.0,1.0,1.0,1.0,...,0,0,0,0,0,0,1,0,0,0


## Model

In [3]:
# Split `df` into four pieces with the project helper `train_test`; the helper
# returns them in the order (train_x, train_y, test_x, test_y).
# NOTE(review): how the split is made (ratio, chronological or shuffled) is
# decided inside src.data.format.train_test -- confirm there.
train_x, train_y, test_x, test_y = train_test(df)

In [None]:
# Base learner: gradient-boosted trees trained with squared-error loss.
base_regressor = XGBRegressor(
    objective='reg:squarederror',
    max_depth=5,
    learning_rate=0.05,
    n_estimators=500,
)

# MultiOutputRegressor wraps the base learner so that a separate copy is
# fitted for each target column of train_y.
model = MultiOutputRegressor(base_regressor)
model.fit(train_x, train_y)

In [None]:
# Predict on the held-out inputs with the fitted model; `y_hat` is compared
# against `test_y` in the error cell that follows.
y_hat = model.predict(test_x)

## Errors

In [None]:
# Score the predictions against the held-out targets with the project helper
# `apply_metrics`, passing df's column labels along, then display the result.
# NOTE(review): which metrics are computed and how the column labels are used
# is defined in src.metrics.error -- confirm there.
errors = apply_metrics(y_hat, test_y, df.columns)
errors

## Predictions

In [None]:
# Display the final 10 rows of df with every column.
df.tail(10)

In [None]:
# Take the final 10 rows of df, drop the column at position 0, and convert
# what remains to a NumPy array; show the array as the cell output.
recent_rows = df.tail(10)
hist = recent_rows.iloc[:, 1:].to_numpy()
hist

In [None]:
# Run the fitted model on the `hist` rows and hand the raw predictions, along
# with df's column labels, to the project helper `output_format`.
# NOTE(review): assumes `output_format` returns a pandas DataFrame (the code
# relies on DataFrame-style `insert`) -- confirm in src.data.format.
preds = output_format(model.predict(hist), df.columns)

# Label each prediction row with consecutive years starting at
# FIRST_FORECAST_YEAR. The number of years is taken from the number of
# prediction rows, so the labels always match the size of `hist` and `insert`
# cannot fail with a length mismatch when the history window is resized.
# NOTE(review): the df preview appears to end at 2015 -- confirm that 2017 is
# the intended first forecast year.
FIRST_FORECAST_YEAR = 2017
forecast_years = list(range(FIRST_FORECAST_YEAR, FIRST_FORECAST_YEAR + len(preds)))
preds.insert(0, 'Year', forecast_years)
preds