<u>Module imports</u>

In [350]:
# Import modules

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import joblib
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, HistGradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from modAL.models import ActiveLearner

<u>Data import</u>

In [351]:
# Read data into DataFrames

# These two groups of data will be used for label making
# Emissions data
nrg_emi_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="CO2 Emissions from Energy", header=2, index_col=0)
# The sheet called "Natural Gas Flaring" is already a part of the calculations for the sheet called "CO2 from Flaring"
flar_emi_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="CO2 from Flaring", header=2, index_col=0)
equi_emi_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="CO2e Methane, Process emissions", header=2, index_col=0)

# Renewable energy production data
hydro_pro_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Hydro Generation - TWh", header=2, index_col=0)
solar_pro_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Solar Generation - TWh", header=2, index_col=0)
wind_pro_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Wind Generation - TWh", header=2, index_col=0)
geo_pro_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Geo Biomass Other - TWh", header=2, index_col=0)
bio_pro_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Biofuels production - PJ", header=2, index_col=0, nrows=47)

# These three groups of data will be used for the feature sets
# Renewable energy consumption data
hydro_con_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Hydro Consumption - EJ", header=2, index_col=0)
solar_con_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Solar Consumption - EJ", header=2, index_col=0)
wind_con_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Wind Consumption - EJ", header=2, index_col=0)
geo_con_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Geo Biomass Other - EJ", header=2, index_col=0)
bio_con_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Biofuels consumption - PJ", header=2, index_col=0, nrows=47)

# Non-renewable energy consumption data
oil_con_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Oil Consumption - EJ", header=2, index_col=0)
gas_con_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Gas Consumption - EJ", header=2, index_col=0)
coal_con_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Coal Consumption - EJ", header=2, index_col=0)
nuc_con_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Nuclear Consumption - EJ", header=2, index_col=0)

# Total energy consumption data
tol_con_df = pd.read_excel(io="data/Statistical Review of World Energy Data.xlsx", sheet_name="Primary Energy Consumption", header=2, index_col=0)

<u>Programmatic data processing</u>

In [352]:
def processData(df:pd.DataFrame, flag=False):
    """
    Get an excel sheet ready for conversion to numpy arrays.

    Parameters:
    - df (pd.DataFrame): a dataframe containing an excel sheet
    - flag (boolean): an indicator to convert PJ to EJ instead of kWh
    """
    #------------------------------ 
    # Remove all irrelevant columns
    #------------------------------

    # Remove all data from before 1990
    # Find the index of the "1990" column
    drop_indx = list(df.columns).index(1990)
    # Get the column labels of all columns left of "1990"
    drop_cols = [df.columns[num] for num in np.arange(0, drop_indx)]
    df = df.drop(columns=drop_cols)

    # Remove data on growth-rate and share
    # Get the column labels of the target columns
    drop_cols = [df.columns[num] for num in [-3, -2, -1]]
    df = df.drop(columns=drop_cols)

    #---------------------------
    # Remove all irrelevant rows
    #---------------------------

    # Remove all rows with any empty cells
    # 0 doesn't make an empty cell
    df = df.dropna()

    # Remove all "Total" and "Other" rows
    # In addition, OECD, Non-OECD, the EU, and the USSR
    # In addition, 8 other countries because they only appear in excel sheets for
    # flaring emissions and nothing else. I can't make data samples for non-existent data
    # Rationale for removing "Other" rows - some countries in some excel sheets appear
    # individually, but are lumped into an "Other" row in other sheets.
    # There's no possible way for me to know which portions of an
    # "Other" row value belongs to which countries.
    drop_rows = []
    keywords = ["Total", "Other", "OECD", "European Union", "USSR", "Bolivia", 
                "Bahrain", "Syria", "Yemen", "Libya", "Nigeria", "Brunei", "Myanmar"]
    for row in df.index:
        # Mark a row for dropping if it contains any of the keywords
        if any(keyword in row for keyword in keywords):
            drop_rows.append(row)
    df = df.drop(index=drop_rows)

    # -----------------
    # Convert the units
    # -----------------

    # This section is only performed on emissions data, 
    # renewable energy production data, and consumed
    # biofuel energy data
    # All other dataframes have "Exajoules" as their name
    
    # All CO2 data is currently represented as millions of tonnes
    # Convert all produced renewable data to kilowatt-hour (kWh)
    # 1 kWh = 3600 kJ
    # 1 PJ = 1000000000000 kJ
    # 1 TWh = 1000000000 kWh

    if (df.index.name) == "Million tonnes of carbon dioxide":
        # Convert to single tonnes
        df = df * 1000000
    elif df.index.name == "Terawatt-hours":
        # Convert to kilowatt-hours
        df = df * 1000000000
    elif df.index.name == "Petajoules":
        if flag:
            # Convert to exajoules
            f = df * 0.001
        else:
            # Convert to kilowatt-hours
            df = df * (1000000000000/3600)

    return df

# tonnes = metric ton = 1000 kg


In [353]:
def rowIndices(df:pd.DataFrame):
    """
    Return the row labels of a pd.DataFrame

    Parameters:
    - df (pd.DataFrame): a dataframe containing an excel sheet
    """

    return [row for row in df.index]


In [354]:
# Process dataframes

# Unit: Tonnes
nrg_emi_df = processData(nrg_emi_df)
flar_emi_df = processData(flar_emi_df)
equi_emi_df = processData(equi_emi_df)

# Unit: Kilowatt-hours
hydro_pro_df = processData(hydro_pro_df)
solar_pro_df = processData(solar_pro_df)
wind_pro_df = processData(wind_pro_df)
geo_pro_df = processData(geo_pro_df)
bio_pro_df = processData(bio_pro_df)

# Unit: Exajoules
hydro_con_df = processData(hydro_con_df)
solar_con_df = processData(solar_con_df)
wind_con_df = processData(wind_con_df)
geo_con_df = processData(geo_con_df)
bio_con_df = processData(bio_con_df, True)

# Unit: Exajoules
oil_con_df = processData(oil_con_df)
gas_con_df = processData(gas_con_df)
coal_con_df = processData(coal_con_df)
nuc_con_df = processData(nuc_con_df)

# Unit: Exajoules
tol_con_df = processData(tol_con_df)

In [355]:
# Convert to numpy arrays

nrg_emi = nrg_emi_df.to_numpy()
flar_emi = flar_emi_df.to_numpy()
equi_emi = equi_emi_df.to_numpy()
hydro_p = hydro_pro_df.to_numpy()
solar_p = solar_pro_df.to_numpy()
wind_p = wind_pro_df.to_numpy()
geo_p = geo_pro_df.to_numpy()
bio_p = bio_pro_df.to_numpy()

hydro_c = hydro_con_df.to_numpy()
solar_c = solar_con_df.to_numpy()
wind_c = wind_con_df.to_numpy()
geo_c = geo_con_df.to_numpy()
bio_c = bio_con_df.to_numpy()
oil_c = oil_con_df.to_numpy()
gas_c = gas_con_df.to_numpy()
coal_c = coal_con_df.to_numpy()
nuc_c = nuc_con_df.to_numpy()

tol_c = tol_con_df.to_numpy()

# print(len(rowIndices(hydro_con_df)), len(rowIndices(solar_con_df)), len(rowIndices(wind_con_df)), len(rowIndices(geo_con_df)), len(rowIndices(bio_con_df)))
# print(len(rowIndices(oil_con_df)), len(rowIndices(gas_con_df)), len(rowIndices(coal_con_df)), len(rowIndices(nuc_con_df)))
# print(len(rowIndices(tol_con_df)))

# Get row indices of dataframes
# There are three unique indices/list of countries

# All of these dataframes (and their NDarray equivalents) have 83 indices.
# Their row indices are shown in nrg_emi_indices
#
# nrg_emi_df
# equi_emi_df
# hydro_pro_df
# solar_pro_df
# wind_pro_df
# geo_pro_df
# hydro_con_df
# solar_con_df
# wind_con_df
# geo_con_df
# oil_con_df
# gas_con_df
# coal_con_df
# nuc_con_df
# tol_con_df

# All of these dataframes (and their NDarray equivalents) have 41 indices
# Their row indices are shown in flar_emi_indices
# 
# flar_emi_df

# All of these dataframes (and their NDarray equivalents) have 24 indices
# Their row indices are shown in bio_indices
#
# bio_pro_df
# bio_con_df

# The rest of the dataframes share the same index list as nrg_emi_indices
nrg_emi_indices = rowIndices(nrg_emi_df)
flar_emi_indices = rowIndices(flar_emi_df)
bio_indices = rowIndices(bio_pro_df)

<b>nrg_emi_indices:</b>

['Canada', 'Mexico', 'US', 'Argentina', 'Brazil', 'Chile', 'Colombia', 'Ecuador', 'Peru', 'Trinidad & Tobago', 'Venezuela', 'Central America', 'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Netherlands', 'North Macedonia', 'Norway', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Turkey', 'Ukraine', 'United Kingdom', 'Azerbaijan', 'Belarus', 'Kazakhstan', 'Russian Federation', 'Turkmenistan', 'Uzbekistan', 'Iran', 'Iraq', 'Israel', 'Kuwait', 'Oman', 'Qatar', 'Saudi Arabia', 'United Arab Emirates', 'Algeria', 'Egypt', 'Morocco', 'South Africa', 'Eastern Africa', 'Middle Africa', 'Western Africa', 'Australia', 'Bangladesh', 'China', 'China Hong Kong SAR', 'India', 'Indonesia', 'Japan', 'Malaysia', 'New Zealand', 'Pakistan', 'Philippines', 'Singapore', 'South Korea', 'Sri Lanka', 'Taiwan', 'Thailand', 'Vietnam']

<b>flar_emi_indices:</b>

['Canada', 'Mexico', 'US', 'Argentina', 'Brazil', 'Colombia', 'Peru', 'Trinidad & Tobago', 'Venezuela', 'Denmark', 'Germany', 'Italy', 'Netherlands', 'Norway', 'Poland', 'Romania', 'Ukraine', 'United Kingdom', 'Azerbaijan', 'Kazakhstan', 'Russian Federation', 'Turkmenistan', 'Uzbekistan', 'Iran', 'Iraq', 'Kuwait', 'Oman', 'Qatar', 'Saudi Arabia', 'United Arab Emirates', 'Algeria', 'Egypt', 'Australia', 'Bangladesh', 'China', 'India', 'Indonesia', 'Malaysia', 'Pakistan', 'Thailand', 'Vietnam']

<b>biofuel_indices:</b>

['Canada', 'Mexico', 'US', 'Argentina', 'Brazil', 'Colombia', 'Austria', 'Belgium', 'Finland', 'France', 'Germany', 'Italy', 'Netherlands', 'Poland', 'Portugal', 'Spain', 'Sweden', 'United Kingdom', 'Australia', 'China', 'India', 'Indonesia', 'South Korea', 'Thailand']

<b>Shape of nrg_emi:</b>

(83, 33)

<b>Columns of every dataframe:</b>

Index([1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022],
      dtype='object')


In [356]:
# Massive 3D numpy array for label making
# 1st dimension - Years. 33 years from 1990-2022 (inclusive)
# 2nd dimension - Countries/Regions. 83 unique countries/regions
# 3rd dimension - Carbon Neutral features. 8 features (in this order): energy emissions, flaring emissions, CO2 equivalent emissions, 
# hydroelectric production, solar production, wind production, geothermal production, biofuel production
# (33, 83, 8)

# Find every unique country/region
# This is a bit redundant because every country in flar_emi_indices and bio_indices 
# is already in nrg_emi_indices
cotry_reg = list(set(nrg_emi_indices + flar_emi_indices + bio_indices))
cotry_reg.sort()
print(cotry_reg)

dim_1 = []
for year_indx in range(33):
    dim_2 = []
    for area in cotry_reg:
        # There's no area check for the upcoming data
        # because every area has this data
        indx = nrg_emi_indices.index(area)
        # Extract a float
        a_nrg_emi = nrg_emi[indx][year_indx]
        a_equi_emi = equi_emi[indx][year_indx]
        a_hydro = hydro_p[indx][year_indx]
        a_solar = solar_p[indx][year_indx]
        a_wind = wind_p[indx][year_indx]
        a_geo = geo_p[indx][year_indx]

        if area in flar_emi_indices:
            indx = flar_emi_indices.index(area)
            # Extract a float
            a_flar_emi = flar_emi[indx][year_indx]
        else:
            a_flar_emi = 0.

        if area in bio_indices:
            indx = bio_indices.index(area)
            # Extract a float
            a_biofuel = bio_p[indx][year_indx]
        else:
            a_biofuel = 0.

        # Is also a set of features
        dim_3 = [a_nrg_emi,
                a_flar_emi,
                a_equi_emi,
                a_hydro,
                a_solar,
                a_wind,
                a_geo,
                a_biofuel]
        dim_2.append(dim_3)
    dim_1.append(dim_2)

# Label Statistical Review 
# Full of floats
lsr = np.array(dim_1)
print(f"lsr shape: {lsr.shape}")


['Algeria', 'Argentina', 'Australia', 'Austria', 'Azerbaijan', 'Bangladesh', 'Belarus', 'Belgium', 'Brazil', 'Bulgaria', 'Canada', 'Central America', 'Chile', 'China', 'China Hong Kong SAR', 'Colombia', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Eastern Africa', 'Ecuador', 'Egypt', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Japan', 'Kazakhstan', 'Kuwait', 'Latvia', 'Lithuania', 'Luxembourg', 'Malaysia', 'Mexico', 'Middle Africa', 'Morocco', 'Netherlands', 'New Zealand', 'North Macedonia', 'Norway', 'Oman', 'Pakistan', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russian Federation', 'Saudi Arabia', 'Singapore', 'Slovakia', 'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Trinidad & Tobago', 'Turkey', 'Turkmenistan', 'US', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'Uzbekistan', 'Venezu

In [357]:
# Massive 3D numpy array for classification
# 1st dimension - Years. 33 years from 1990-2022 (inclusive)
# 2nd dimension - Countries/Regions. 83 unique countries/regions
# 3rd dimension - Energy Consumption features. 9 features (in this order): oil, gas, coal, nuclear, 
# hydroelectric, solar, wind, geothermal, biofuel
# (33, 83, 9)

print(cotry_reg)

dim_1 = []
for year_indx in range(33):
    dim_2 = []
    for area in cotry_reg:
        # There's no area check for the upcoming data
        # because every area has this data
        indx = nrg_emi_indices.index(area)
        # Extract a float
        tol_consume = tol_c[indx][year_indx]
        a_hydro = hydro_c[indx][year_indx] / tol_consume
        a_solar = solar_c[indx][year_indx] / tol_consume
        a_wind = wind_c[indx][year_indx] / tol_consume
        a_geo = geo_c[indx][year_indx] / tol_consume
        a_oil = oil_c[indx][year_indx] / tol_consume
        a_gas = gas_c[indx][year_indx] / tol_consume
        a_coal = coal_c[indx][year_indx] / tol_consume
        a_nuc = nuc_c[indx][year_indx] / tol_consume

        if area in bio_indices:
            indx = bio_indices.index(area)
            # Extract a float
            a_biofuel = bio_c[indx][year_indx] / tol_consume
        else:
            a_biofuel = 0.

        # Is also a set of features
        dim_3 = [a_oil, a_gas, a_coal, a_nuc,
                 a_hydro, a_solar, a_wind, a_geo, a_biofuel]
        dim_2.append(dim_3)
    dim_1.append(dim_2)

# Classification Statistical Review 
# Full of floats
csr = np.array(dim_1)
print(f"csr shape: {csr.shape}")

# Concatenated version
ccsr = np.concatenate(csr, axis=0)
print(f"clabels shape: {ccsr.shape}") 

['Algeria', 'Argentina', 'Australia', 'Austria', 'Azerbaijan', 'Bangladesh', 'Belarus', 'Belgium', 'Brazil', 'Bulgaria', 'Canada', 'Central America', 'Chile', 'China', 'China Hong Kong SAR', 'Colombia', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Eastern Africa', 'Ecuador', 'Egypt', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Japan', 'Kazakhstan', 'Kuwait', 'Latvia', 'Lithuania', 'Luxembourg', 'Malaysia', 'Mexico', 'Middle Africa', 'Morocco', 'Netherlands', 'New Zealand', 'North Macedonia', 'Norway', 'Oman', 'Pakistan', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russian Federation', 'Saudi Arabia', 'Singapore', 'Slovakia', 'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Trinidad & Tobago', 'Turkey', 'Turkmenistan', 'US', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'Uzbekistan', 'Venezu

In [358]:
def makeLabel(features):
    """
    Make a label for a sample

    - features (np.ndarray): set of 8 features
    """
    # Unit: Tonnes of Carbon Dioxide
    co2 = np.sum(features[:3])
    # Unit: Kilowatt-hours
    renewable = np.sum(features[3:])

    # Electricity reductions emission factor
    # 0.000709 tonnes CO2/kWh
    # Unit: Tonnes of Carbon Dioxide
    renewable *= 0.000709

    # Remaining co2 after being offset by renewable energy production
    rem_co2 = max(co2 - renewable, 0)

    # if rem_co2 == co2:
    #     return 6
    if rem_co2 == 0:
        return 0
    else:
        percent = (rem_co2/co2) * 100
        # Equivalent to np.floor(percent / 10)
        # The label is the tens place of the percentage
        # return int(np.floor(percent / 10))
        if percent > 0.0 and percent <= 20.0:
            label = 1
        elif percent > 20.0 and percent <= 40.0:
            label = 2
        elif percent > 40.0 and percent <= 60.0:
            label = 3
        elif percent > 60.0 and percent <= 80.0:
            label = 4
        elif percent > 80.0 and percent <= 100.0:
            label = 5
        else:
            label = 5
        return label


In [359]:
# Make labels for all of the data/samples/examples 
# An individual feature isn't a example, but a location in a particular year is
# Thus, there are 33 * 83 = 2739 examples

# There are 6 possible labels, 0-5
# 0 means carbon neutral is achieved
# 6 means the country is absolutely nowhere near carbon neutrality
labels = np.array([[makeLabel(location) for location in year] for year in lsr])
print(f"labels shape: {labels.shape}")

# Concatenated version
clabels = np.concatenate(labels, axis=0)
print(f"clabels shape: {clabels.shape}")

labels shape: (33, 83)
clabels shape: (2739,)


In [360]:
# Split the big dataset into a three subsets: labeled training, unlabeled training, and test

# Labeled training subset
# Years 1990-1991
lab_set = csr[0:2]
lab_set_label = labels[0:2]

# Unlabeled training subset
# aka Pool
# Years 1992-2020
unlab_set = csr[2:31]
unlab_set_label = labels[2:31]

# Test subset
# Years 2021-2022
test_set = csr[31::]
test_set_label = labels[31::]


# Concatenated versions
clab_set = np.concatenate(lab_set, axis=0)
clab_set_label = np.concatenate(lab_set_label, axis=0)

cunlab_set = np.concatenate(unlab_set, axis=0)
cunlab_set_label = np.concatenate(unlab_set_label, axis=0)

ctest_set = np.concatenate(test_set, axis=0)
ctest_set_label = np.concatenate(test_set_label, axis=0)


print("Labeled training subset, Years 1990-1991")
print(f"lab_set shape: {lab_set.shape}")
print(f"lab_set_label shape: {lab_set_label.shape}")
print(f"clab_set shape: {clab_set.shape}")
print(f"clab_set_label shape: {clab_set_label.shape}\n")

print("Unlabeled training subset, Years 1992-2020")
print(f"unlab_set shape: {unlab_set.shape}")
print(f"unlab_set_label shape: {unlab_set_label.shape}")
print(f"cunlab_set shape: {cunlab_set.shape}")
print(f"cunlab_set_label shape: {cunlab_set_label.shape}\n")

print("Test subset, Years 2021-2022")
print(f"test_set shape: {test_set.shape}")
print(f"test_set_label shape: {test_set_label.shape}")
print(f"ctest_set shape: {ctest_set.shape}")
print(f"ctest_set_label shape: {ctest_set_label.shape}")

Labeled training subset, Years 1990-1991
lab_set shape: (2, 83, 9)
lab_set_label shape: (2, 83)
clab_set shape: (166, 9)
clab_set_label shape: (166,)

Unlabeled training subset, Years 1992-2020
unlab_set shape: (29, 83, 9)
unlab_set_label shape: (29, 83)
cunlab_set shape: (2407, 9)
cunlab_set_label shape: (2407,)

Test subset, Years 2021-2022
test_set shape: (2, 83, 9)
test_set_label shape: (2, 83)
ctest_set shape: (166, 9)
ctest_set_label shape: (166,)


<u>Classification Plan</u>
- 1st Training phase: Train the classifier on the first three labeled years of the data
- 2nd Training phase: Use the classifier and batch active learning on the rest of the unlabeled data until 2021. Examples that would provide the most information will be chosen to get their true label. The remaining examples will get pseudo-labeled
- 1st Evaluate phase: Use the newly trained classifier to evaluate the data from 2021 and 2022
- Predict phase: Use time series forecasting (RNN) to predict a country's set of features until 2050
- 2nd Evaluate phase: Use the classifier to predict levels of CN

In [361]:
# Batch Active Learning
def batch_active_learning(classifier: ActiveLearner, unlab_data, unlab_lab, all_data, all_labels, n_queries):
    """Train a classifier using batch active learning
    
    Parameters:
    - classifier (ActiveLearner): a classifier from the scikit-learn (sklearn) module 
    - unlab_data (ndarray): the unlabeled dataset, shape=(29, 83, 9)
    - unlab_lab (ndarray): the unlabled dataset's labels, shape=(29, 83)
    - add_data (ndarray): all data in concatenated form, shape=(2739, 9)
    - all_labels (ndarray): all data labels in concatenated form, shape=(2739,)
    - n_queries (int): number of queries to make on each year of data
    """

    count = 1
    for year_data, year_label in zip(unlab_data, unlab_lab):
        # year_data.shape = (83, 9)
        # year_label.shape = (83,)
        for _ in range(n_queries):
            # Query based on uncertainty
            query_index, _ = classifier.query(year_data)

            # Retrieve the requested example and its label, and teach it to the classifier
            example = year_data[query_index].reshape(1, -1)
            example_label = year_label[query_index].reshape(1,)
            # example.shape = (1, 9)
            # example_label.shape = (9,)
            classifier.teach(X=example, y=example_label)

            # Remove the queried example and its label from the unlabeled datasets
            year_data = np.delete(year_data, query_index, axis=0)
            year_label = np.delete(year_label, query_index)

            accuracy = classifier.score(all_data, all_labels)
            print(f"Accuracy after query {count}: {round(accuracy, 4)}")

            count += 1
    

In [362]:
# Batch active learning hyperparameters aka model parameters
# These are different from real model parameters that are estimated by the model itself

n_estimators = 1000
criterion = "log_loss"
max_depth = 50
n_queries = 10

In [363]:
# 1st Training phase: Train a classifier on the first two labeled years of the data
# 2nd Training phase: Use the classifier and batch active learning on the rest of the unlabeled data until 2021. Examples that would provide the most 


# Gaussian Naive Bayes isn't an option because the data distribution isn't gaussian/normal due to lacking a "symmetric bell shape". 
# Most of the data labels are on the high end of the scale. Thus, the data's bell shape isn't symmetric
# Bernoulli Naive Bayes isn't an option because sample features must be binary-valued (Bernoulli, boolean)
# Multinomial, Complement, and Categorical aren't considered  due to data being classified moreso out of probability rather than certainty.

rf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth)

# classifier = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

# Start the classifier off by training it on the labeled dataset
classifier = ActiveLearner(estimator=rf, X_training=clab_set, y_training=clab_set_label)

batch_active_learning(classifier, np.copy(unlab_set), np.copy(unlab_set_label), ccsr, clabels, n_queries)

Accuracy after query 1: 0.8488
Accuracy after query 2: 0.8543
Accuracy after query 3: 0.8576
Accuracy after query 4: 0.8583
Accuracy after query 5: 0.8594
Accuracy after query 6: 0.8602
Accuracy after query 7: 0.8616
Accuracy after query 8: 0.8616
Accuracy after query 9: 0.862
Accuracy after query 10: 0.8631
Accuracy after query 11: 0.8602
Accuracy after query 12: 0.862
Accuracy after query 13: 0.8613
Accuracy after query 14: 0.8664
Accuracy after query 15: 0.8671
Accuracy after query 16: 0.8656
Accuracy after query 17: 0.8656
Accuracy after query 18: 0.8675
Accuracy after query 19: 0.8671
Accuracy after query 20: 0.8671
Accuracy after query 21: 0.8715
Accuracy after query 22: 0.8719
Accuracy after query 23: 0.8751
Accuracy after query 24: 0.8817
Accuracy after query 25: 0.8828
Accuracy after query 26: 0.8788
Accuracy after query 27: 0.877
Accuracy after query 28: 0.8781
Accuracy after query 29: 0.881
Accuracy after query 30: 0.8799
Accuracy after query 31: 0.8817
Accuracy after query 

In [364]:
joblib.dump(classifier, "models/al_rand_forest2.pkl")

['models/al_rand_forest2.pkl']

In [365]:
# 1st Evaluate phase: Use the newly trained classifier to evaluate the data from 2021 and 2022
clf = joblib.load("models/al_rand_forest2.pkl")
accuracy = clf.score(np.concatenate(test_set, axis=0), np.concatenate(test_set_label, axis=0))
accuracy

0.8493975903614458

In [166]:
print(set(labels[0]))
print(set(labels[1]))
print(set(labels[2]))

print(set(lab_set_label[0]))
print(set(lab_set_label[1]))
print(set(lab_set_label[2]))

{0, 1, 2, 3, 5, 6, 7, 8, 9, 10}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
{0, 1, 4, 5, 6, 7, 8, 9, 10}
{0, 1, 2, 3, 5, 6, 7, 8, 9, 10}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
{0, 1, 4, 5, 6, 7, 8, 9, 10}


In [53]:
# RNN classifier class
class RecurrentNeuralNetwork(nn.Module):
    def __init__(self):
        super(RecurrentNeuralNetwork, self).__init__()
        self.linear1 = nn.Linear(in_features=8, out_features=16)
        # nn.ReLU() doesn't need parameters in this case
        self.activation1 = nn.ReLU()
        self.linear2 = nn.Linear(in_features=16, out_features=16)
        self.activation2 = nn.ReLU()
        self.linear3 = nn.Linear(in_features=16, out_features=16)
        self.activation3 = nn.ReLU()
        # self.batchNorm = nn.BatchNorm1d()
        # self.flatten = nn.Flatten()
        # self.dropout1 = nn.Dropout()
        self.dense1 = nn.Linear(in_features=16, out_features=1)
        # self.dropout2 = nn.Dropout()
        # self.dense2 = nn.Linear()
        # self.dropout3 = nn.Dropout()
        # self.dense3 = nn.Linear()
        # self.softmax = nn.Softmax()

    def forward(self, x):
        x = self.linear1(x)
        x = self.activation1(x)
        x = self.linear2(x)
        x = self.activation2(x)
        x = self.linear3(x)
        x = self.activation3(x)
        # x = self.batchNorm(x)
        # x = self.flatten(x)
        # x = self.dropout1(x)
        x = self.dense1(x)
        # x = self.dropout2(x)
        # x = self.dense2(x)
        # x = self.dropout3(x)
        # x = self.dense3(x)

        return x

In [59]:
device = "cuda" if torch.cuda.is_available() else "cpu"
RNN = RecurrentNeuralNetwork().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(RNN.parameters(), lr=0.0001)

In [None]:
# Time series forecasting