In [1]:
from library.common import Database, Core
from pathlib import Path
import pandas as pd

# --- Tunable settings ---------------------------------------------------
# Upper bounds handed to the ARIMA order search (max_p / max_d / max_q)
max_p, max_d, max_q = 20, 20, 20
# Behaviour switches for the model-building cells
score = False
plot = False
save_models = True
# Most recent year of data to keep
last_year = 2019

# Observations are only held out of sample when scoring is requested
out_of_sample_size = 10 if score else 0

# --- Output locations (relative to the parent of the working directory) --
cwd = Path.cwd()
project_root = cwd.parent
model_dir = project_root / 'models'
param_dir = project_root / 'data/processed'

# --- Load the raw table ---------------------------------------------------
db = Database()
db_table = 'owid_co2_greenhouse_gas_emissions'
owid_df = pd.read_sql(db_table, db.config)
print("Data loaded")

core = Core()
# Clean up data:
#   * rows with a NULL ISO code are regional data
#   * ISO codes listed in core.excluded_features are trimmed as well, to
#     remove regional and world aggregates
has_iso_code = owid_df['iso_code'].notna()
is_excluded = owid_df['iso_code'].isin(core.excluded_features)
owid_df = owid_df[has_iso_code & ~is_excluded].copy()


Data loaded


In [2]:
def plot_ts(df, order, feature):
    """Plot cross-validated ARIMA forecasts on top of the observed series.

    An ARIMA estimator with the given ``order`` is evaluated with a
    sliding-window forecast cross-validation (pmdarima defaults for window,
    step and horizon), and the resulting predictions are drawn over the tail
    of the original series.

    Parameters
    ----------
    df : object indexable as ``df[feature]``
        Container holding the series to model (a DataFrame in this notebook).
    order : tuple
        ARIMA order, forwarded unchanged to ``pm.ARIMA(order=...)``.
    feature : str
        Column to model; also used in the plot title and y-axis label.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Author: Taylor Smith <taylor.smith@alkaline-ml.com>

    import numpy as np
    import pmdarima as pm
    from pmdarima import model_selection
    from matplotlib import pyplot as plt

    print("pmdarima version: %s" % pm.__version__)

    # Select the series and set up the estimator / cross-validation strategy
    y = df[feature]
    est = pm.ARIMA(order=order,
                   suppress_warnings=True)
    # Library defaults are used for window_size, step and h
    cv = model_selection.SlidingWindowForecastCV()
    predictions = model_selection.cross_val_predict(
        est, y, cv=cv, verbose=2, averaging="median")

    # Plot the predictions over the original series; the forecasts are
    # aligned with the last ``n_test`` observations
    x_axis = np.arange(y.shape[0])
    n_test = predictions.shape[0]
    plt.figure(figsize=(20, 10))
    plt.plot(x_axis, y, alpha=0.75, c='b', label='Observed')
    plt.plot(x_axis[-n_test:], predictions, alpha=0.5, c='r',
             label='Cross-validated forecast')
    plt.title(f"Cross-validated {feature}")
    plt.xlabel("Observation index")
    plt.ylabel(feature)
    plt.legend()

    plt.show()

In [3]:
import pmdarima as pm
import pickle

# Location of the pickle file that stores the forecast parameters
full_path = (param_dir / 'params_forecast_params.pkl').as_posix()

# Columns needed for the time series analysis
selected_features = core.regression_features

# Keep only the rows up to and including the configured last year
up_to_last_year = owid_df.loc[owid_df['year'] <= last_year]
forecast_param = dict(last_period=up_to_last_year['year'].max())
selected_columns = up_to_last_year[selected_features]
print(selected_features)

# Persist the forecast parameters to the pickle file
with open(full_path, 'wb') as pkl:
    pickle.dump(forecast_param, pkl)

# Fill NAs with zero
dataset = selected_columns.fillna(0)

['iso_code', 'year', 'co2', 'consumption_co2', 'trade_co2', 'coal_co2', 'cement_co2', 'flaring_co2', 'gas_co2', 'oil_co2', 'other_industry_co2', 'methane', 'nitrous_oxide', 'population', 'gdp', 'primary_energy_consumption', 'co2_growth_prct', 'co2_growth_abs', 'co2_per_capita', 'consumption_co2_per_capita', 'share_global_co2', 'cumulative_co2', 'share_global_cumulative_co2', 'co2_per_gdp', 'consumption_co2_per_gdp', 'co2_per_unit_energy', 'cement_co2_per_capita', 'coal_co2_per_capita', 'flaring_co2_per_capita', 'gas_co2_per_capita', 'oil_co2_per_capita', 'other_co2_per_capita', 'trade_co2_share', 'share_global_cement_co2', 'share_global_coal_co2', 'share_global_flaring_co2', 'share_global_gas_co2', 'share_global_oil_co2', 'share_global_other_co2', 'cumulative_cement_co2', 'cumulative_coal_co2', 'cumulative_flaring_co2', 'cumulative_gas_co2', 'cumulative_oil_co2', 'cumulative_other_co2', 'share_global_cumulative_cement_co2', 'share_global_cumulative_coal_co2', 'share_global_cumulative_f

In [4]:
def get_optimal_param(timeseries):
    """Run a non-seasonal, stepwise ``pm.auto_arima`` search on ``timeseries``.

    The search bounds (``max_p``, ``max_d``, ``max_q``) and the hold-out size
    (``out_of_sample_size``) are read from the notebook-level settings.
    Fitting errors are ignored and warnings suppressed, as requested through
    the ``error_action`` and ``suppress_warnings`` options.

    Returns whatever ``pm.auto_arima`` returns for the given series.
    """
    search_options = dict(
        max_p=max_p,
        max_d=max_d,
        max_q=max_q,
        seasonal=False,
        error_action='ignore',
        suppress_warnings=True,
        trace=False,
        out_of_sample_size=out_of_sample_size,
        stepwise=True,  # stepwise search rather than a full grid
    )
    return pm.auto_arima(timeseries, **search_options)

regions = core.regions
world = core.world

# The first two selected features are ISO_CODE and Year - these identify the
# rows and are not features for the time series
feature_names = list(selected_features[2:])

if save_models:
    # Make sure the output directory exists before the fitting loop starts,
    # so the first pickle write cannot fail on a fresh checkout
    model_dir.mkdir(parents=True, exist_ok=True)

for region in core.list_of_regions:
    region_members = regions.get(region)
    if region_members == [world]:
        # Regional grouping is the world, then take the entire dataset
        regional_df = dataset.copy()
    else:
        # Take the countries specified in the dictionary
        regional_df = dataset[dataset['iso_code'].isin(region_members)].copy()

    # Aggregate the member countries per year; only the modelled feature
    # columns are summed so the string iso_code column is left out
    regional_df = (
        regional_df.groupby('year')[feature_names]
        .sum()
        .reset_index(drop=False)
    )

    # Create optimised model for each feature
    for feature in feature_names:
        series = regional_df[feature].to_numpy()
        optimal = get_optimal_param(series)
        model_name = f'model-{region}-{feature}.pkl'

        if save_models:
            full_path = (model_dir / model_name).as_posix()
            with open(full_path, 'wb') as pkl:
                pickle.dump(optimal, pkl)

        print(model_name, 'COMPLETED')
        if score:
            print('SCORE')
            print(optimal.get_params().get('order'))
            print(optimal.scoring)
            print(optimal.predict(out_of_sample_size))
        elif plot:
            # Plotting only happens when scoring is switched off
            plot_ts(regional_df, optimal.get_params().get('order'), feature)
print("Timeseries models creation COMPLETED")



model-global-co2.pkl COMPLETED
model-global-consumption_co2.pkl COMPLETED
model-global-trade_co2.pkl COMPLETED
model-global-coal_co2.pkl COMPLETED
model-global-cement_co2.pkl COMPLETED
model-global-flaring_co2.pkl COMPLETED
model-global-gas_co2.pkl COMPLETED
model-global-oil_co2.pkl COMPLETED
model-global-other_industry_co2.pkl COMPLETED
model-global-methane.pkl COMPLETED
model-global-nitrous_oxide.pkl COMPLETED
model-global-population.pkl COMPLETED
model-global-gdp.pkl COMPLETED
model-global-primary_energy_consumption.pkl COMPLETED
model-global-co2_growth_prct.pkl COMPLETED
model-global-co2_growth_abs.pkl COMPLETED
model-global-co2_per_capita.pkl COMPLETED
model-global-consumption_co2_per_capita.pkl COMPLETED
model-global-share_global_co2.pkl COMPLETED
model-global-cumulative_co2.pkl COMPLETED
model-global-share_global_cumulative_co2.pkl COMPLETED
model-global-co2_per_gdp.pkl COMPLETED
model-global-consumption_co2_per_gdp.pkl COMPLETED
model-global-co2_per_unit_energy.pkl COMPLETED
mo