# Example Usage of the TimeSeriesImputer class

Prepare the environment

In [2]:
import pandas as pd
# load the class
from ts_imputer import TimeSeriesImputer

Let's read some time series data, such as the Freedom House scores. After a bit of preprocessing, we get yearly country data.

In [3]:
# Load the Freedom House workbook and reshape it into a tidy yearly
# country panel with harmonised column names.
fp_fh = 'Aggregate_Category_and_Subcategory_Scores_FIW_2003-2022.xlsx'

# Source column -> name used in the rest of the notebook.
rename_cols = {
    'Country/Territory': 'country',
    'Edition': 'year',  # edition year; shifted by -1 below
    'PR Rating': 'fh_political_rights_rating',
    'CL Rating': 'fh_civil_liberties_rating',
    'A': 'fh_electoral_process',
    'B': 'fh_pol_pluralism_participation',
    'C': 'fh_gov_functioning',
    'PR': 'fh_political_rights_score',  # 40 max
    'D': 'fh_freedom_expression',
    'E': 'fh_assoc_org_rights',
    'F': 'fh_rule_of_law',
    'G': 'fh_individual_rights',
    'CL': 'fh_civil_liberties_score',  # 60 max
    'Total': 'fh_score'
}
wanted_cols = set(rename_cols.values())

# Read the sheet, discard completely empty columns and apply the new names.
df_free = (
    pd.read_excel(fp_fh, sheet_name='FIW06-22')
    .dropna(axis=1, how='all')
    .rename(columns=rename_cols)
)

# Everything that was not explicitly renamed is not needed downstream.
unused_cols = [name for name in df_free.columns if name not in wanted_cols]

df_free = (
    df_free
    .drop(columns=unused_cols)
    # The data carries the FH edition, which reports on the previous year.
    .assign(year=lambda frame: frame['year'] - 1)
    .sort_values(['country', 'year'])
    # Stamp every observation with the last day of its year.
    .assign(time=lambda frame: frame['year'].apply(lambda yr: pd.Timestamp(yr, 12, 31)))
)

df_free

Unnamed: 0,country,year,fh_political_rights_rating,fh_civil_liberties_rating,fh_electoral_process,fh_pol_pluralism_participation,fh_gov_functioning,fh_political_rights_score,fh_freedom_expression,fh_assoc_org_rights,fh_rule_of_law,fh_individual_rights,fh_civil_liberties_score,fh_score,time
3343,Abkhazia,2005,5,5,5,5,3,13,8,4,4,5,21,34,2005-12-31
3136,Abkhazia,2006,5,5,5,5,3,13,8,4,4,5,21,34,2006-12-31
2929,Abkhazia,2007,5,5,5,6,4,15,8,4,4,5,21,36,2007-12-31
2720,Abkhazia,2008,5,5,5,5,4,14,8,4,4,5,21,35,2008-12-31
2512,Abkhazia,2009,5,5,5,5,4,14,7,5,4,5,21,35,2009-12-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1047,Zimbabwe,2017,6,5,2,6,2,10,8,4,3,5,20,30,2017-12-31
838,Zimbabwe,2018,5,5,3,6,3,12,8,4,2,5,19,31,2018-12-31
629,Zimbabwe,2019,5,5,3,6,3,12,7,3,2,5,17,29,2019-12-31
419,Zimbabwe,2020,6,5,3,5,3,11,7,3,2,5,17,28,2020-12-31


Let's say we want to build a monthly model using Freedom House data. First, create a monthly data structure, assigning each observation to the last month of its year. In my eyes, this is the logical choice for data that is usually published after the year is over. For other choices, the imputer might have to be adapted somewhat.

Since the imputer works based on location and time information in the index, we need to create an appropriate multiindex in the process.

In [22]:
# Build a monthly (month-end) panel per country and stack the panels into one
# frame indexed by (time, country). Each yearly observation lands on the
# 31 December row of its year; all other months stay NaN for the imputer.
first_year, last_year = 2005, 2021
n_years_expected = last_year - first_year + 1  # 17 yearly observations per country

# The empty monthly skeleton is identical for every country, so build it once.
# An offset object is used instead of the 'M' string alias, which pandas 2.2
# deprecates in favour of 'ME'; MonthEnd() works on old and new versions alike.
df_empty = pd.DataFrame(
    index=pd.date_range(
        pd.Timestamp(first_year, 1, 31),
        pd.Timestamp(last_year, 12, 31),
        freq=pd.offsets.MonthEnd(),
        name='time',
    )
)

df_list = []
# sort=False keeps the countries in their order of first appearance in df_free.
for country, df_obs in df_free.groupby('country', sort=False):
    # we run into some issues with inconsistent country naming over time, so we just don't use those for this example :)
    if len(df_obs) != n_years_expected:
        continue
    df_country = df_empty.merge(df_obs.set_index('time'), how='left', left_index=True, right_index=True)
    # The left merge leaves the country label empty on non-December rows.
    df_country['country'] = df_country['country'].bfill().ffill()
    df_list.append(df_country)

df = pd.concat(df_list).drop(columns='year').set_index('country', append=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,fh_political_rights_rating,fh_civil_liberties_rating,fh_electoral_process,fh_pol_pluralism_participation,fh_gov_functioning,fh_political_rights_score,fh_freedom_expression,fh_assoc_org_rights,fh_rule_of_law,fh_individual_rights,fh_civil_liberties_score,fh_score
time,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2005-01-31,Abkhazia,,,,,,,,,,,,
2005-02-28,Abkhazia,,,,,,,,,,,,
2005-03-31,Abkhazia,,,,,,,,,,,,
2005-04-30,Abkhazia,,,,,,,,,,,,
2005-05-31,Abkhazia,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-31,Zimbabwe,,,,,,,,,,,,
2021-09-30,Zimbabwe,,,,,,,,,,,,
2021-10-31,Zimbabwe,,,,,,,,,,,,
2021-11-30,Zimbabwe,,,,,,,,,,,,


Now that we have our dataframe with a location/time MultiIndex, let's see how the imputer works. First: simply backfilling the values for the whole year. In case of NaNs as the most recent values, which we don't have in this example, the imputer uses the last seen value.

In [23]:
# Back-fill imputer: 'country' and 'time' name the index levels that hold
# the location and the time information.
bfill_settings = dict(location_index='country', time_index='time', method='bfill')
imp_bfill = TimeSeriesImputer(**bfill_settings)

# Fit and transform in one call, then display the imputed frame.
df_imputed_bfill = imp_bfill.fit_transform(df)
df_imputed_bfill

Unnamed: 0_level_0,Unnamed: 1_level_0,fh_political_rights_rating,fh_civil_liberties_rating,fh_electoral_process,fh_pol_pluralism_participation,fh_gov_functioning,fh_political_rights_score,fh_freedom_expression,fh_assoc_org_rights,fh_rule_of_law,fh_individual_rights,fh_civil_liberties_score,fh_score
time,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2005-01-31,Abkhazia,5.0,5.0,5.0,5.0,3.0,13.0,8.0,4.0,4.0,5.0,21.0,34.0
2005-02-28,Abkhazia,5.0,5.0,5.0,5.0,3.0,13.0,8.0,4.0,4.0,5.0,21.0,34.0
2005-03-31,Abkhazia,5.0,5.0,5.0,5.0,3.0,13.0,8.0,4.0,4.0,5.0,21.0,34.0
2005-04-30,Abkhazia,5.0,5.0,5.0,5.0,3.0,13.0,8.0,4.0,4.0,5.0,21.0,34.0
2005-05-31,Abkhazia,5.0,5.0,5.0,5.0,3.0,13.0,8.0,4.0,4.0,5.0,21.0,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-31,Zimbabwe,6.0,5.0,3.0,5.0,3.0,11.0,7.0,3.0,2.0,5.0,17.0,28.0
2021-09-30,Zimbabwe,6.0,5.0,3.0,5.0,3.0,11.0,7.0,3.0,2.0,5.0,17.0,28.0
2021-10-31,Zimbabwe,6.0,5.0,3.0,5.0,3.0,11.0,7.0,3.0,2.0,5.0,17.0,28.0
2021-11-30,Zimbabwe,6.0,5.0,3.0,5.0,3.0,11.0,7.0,3.0,2.0,5.0,17.0,28.0


Second: The class also implements an interpolation option. For this, we can pass arguments to define the interpolation method (see the pandas `DataFrame.interpolate` documentation for the available methods) and the behavior at the tails, i.e. whether to extrapolate or to fill with the last seen value. The tail behavior can be passed as a string, which is applied to both ends, or as a tuple/list of length 2 to treat the two ends separately.

(Scroll down to Afghanistan to see the behavior in the output; there is no change in the Abkhazia data between 2005 and 2006, so interpolation results in the same values as filling.)

In [24]:
# Interpolating imputer: same index levels as before, but gaps are filled by
# 'slinear' interpolation and both tails are extrapolated.
interp_settings = dict(
    location_index='country',
    time_index='time',
    method='interpolate',
    interp_method='slinear',
    interp_tails='extrapolate',  # alternative: e.g. ['extrapolate', 'fill']
)
imp_interp = TimeSeriesImputer(**interp_settings)

# Fit and transform in one call, then display the imputed frame.
df_imputed_interp = imp_interp.fit_transform(df)
df_imputed_interp

Unnamed: 0_level_0,Unnamed: 1_level_0,fh_political_rights_rating,fh_civil_liberties_rating,fh_electoral_process,fh_pol_pluralism_participation,fh_gov_functioning,fh_political_rights_score,fh_freedom_expression,fh_assoc_org_rights,fh_rule_of_law,fh_individual_rights,fh_civil_liberties_score,fh_score
time,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2005-01-31,Abkhazia,5.0,5.0,5.0,5.0,3.0,13.0,8.0,4.0,4.0,5.0,21.0,34.0
2005-02-28,Abkhazia,5.0,5.0,5.0,5.0,3.0,13.0,8.0,4.0,4.0,5.0,21.0,34.0
2005-03-31,Abkhazia,5.0,5.0,5.0,5.0,3.0,13.0,8.0,4.0,4.0,5.0,21.0,34.0
2005-04-30,Abkhazia,5.0,5.0,5.0,5.0,3.0,13.0,8.0,4.0,4.0,5.0,21.0,34.0
2005-05-31,Abkhazia,5.0,5.0,5.0,5.0,3.0,13.0,8.0,4.0,4.0,5.0,21.0,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-31,Zimbabwe,6.0,5.0,3.0,5.0,3.0,11.0,7.0,3.0,2.0,5.0,17.0,28.0
2021-09-30,Zimbabwe,6.0,5.0,3.0,5.0,3.0,11.0,7.0,3.0,2.0,5.0,17.0,28.0
2021-10-31,Zimbabwe,6.0,5.0,3.0,5.0,3.0,11.0,7.0,3.0,2.0,5.0,17.0,28.0
2021-11-30,Zimbabwe,6.0,5.0,3.0,5.0,3.0,11.0,7.0,3.0,2.0,5.0,17.0,28.0


Lastly: The TimeSeriesImputer can simply be used as part of sklearn pipelines, as illustrated in this (nonsensical) example with a random target.

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Fix the randomness so that Restart & Run All reproduces the same target,
# the same forest and therefore the same score.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Features are the un-imputed monthly frame; imputation happens inside the pipeline.
X = df
# Random dummy target, aligned with the (time, country) index of X.
y = pd.Series(rng.choice([0, 1], size=len(df)), index=df.index, name='target')

pipe = Pipeline(
    [('impute', imp_interp),
     ('model', RandomForestClassifier(random_state=RANDOM_SEED))]
)

pipe.fit(X, y)

Just for fun, let's check our AUROC performance:

In [26]:
from sklearn.metrics import roc_auc_score

# Score the same data the pipeline was fitted on (in-sample), using the
# second predict_proba column (index 1) as the prediction.
class_probabilities = pipe.predict_proba(X)
y_hat = class_probabilities[:, 1]
roc_auc_score(y, y_hat)

0.9021860111092829

Hopelessly overfitted ;)

Anyway, hope this helps someone sometime!