<u>Module imports</u>

In [1068]:
# Import modules

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import joblib
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, HistGradientBoostingClassifier, AdaBoostClassifier

<u>Data import</u>

In [1069]:
# Load every worksheet used by the notebook from the Statistical Review workbook

_WORKBOOK = "data/Statistical Review of World Energy Data.xlsx"


def _read_sheet(sheet, **extra):
    """Read one worksheet of the workbook with header=2 and index_col=0."""
    return pd.read_excel(io=_WORKBOOK, sheet_name=sheet, header=2, index_col=0, **extra)


# --- Data used for label making ---
# Emissions
nrg_emi_df = _read_sheet("CO2 Emissions from Energy")
# "Natural Gas Flaring" is not read separately: it is already part of the
# calculations behind the "CO2 from Flaring" sheet
flar_emi_df = _read_sheet("CO2 from Flaring")
equi_emi_df = _read_sheet("CO2e Methane, Process emissions")

# Renewable energy production
hydro_pro_df = _read_sheet("Hydro Generation - TWh")
solar_pro_df = _read_sheet("Solar Generation - TWh")
wind_pro_df = _read_sheet("Wind Generation - TWh")
geo_pro_df = _read_sheet("Geo Biomass Other - TWh")
bio_pro_df = _read_sheet("Biofuels production - PJ", nrows=47)

# --- Data used for the feature sets ---
# Renewable energy consumption
hydro_con_df = _read_sheet("Hydro Consumption - EJ")
solar_con_df = _read_sheet("Solar Consumption - EJ")
wind_con_df = _read_sheet("Wind Consumption - EJ")
geo_con_df = _read_sheet("Geo Biomass Other - EJ")
bio_con_df = _read_sheet("Biofuels consumption - PJ", nrows=47)

# Non-renewable energy consumption
oil_con_df = _read_sheet("Oil Consumption - EJ")
gas_con_df = _read_sheet("Gas Consumption - EJ")
coal_con_df = _read_sheet("Coal Consumption - EJ")
nuc_con_df = _read_sheet("Nuclear Consumption - EJ")

# Total energy consumption
tol_con_df = _read_sheet("Primary Energy Consumption")

<u>Programmatic data processing</u>

In [1070]:
def processData(df:pd.DataFrame, flag=False):
    """
    Get an excel sheet ready for conversion to numpy arrays.

    The sheet is trimmed to the year columns from 1990 onwards (the three
    trailing growth-rate/share columns are removed), rows with empty cells
    and aggregate/excluded rows are dropped, and the values are rescaled
    according to the unit stored in the index name.

    Parameters:
    - df (pd.DataFrame): a dataframe containing an excel sheet
    - flag (boolean): an indicator to convert PJ to EJ instead of kWh

    Returns:
    - df (pd.DataFrame): the cleaned and unit-converted dataframe

    Raises:
    - ValueError: if the sheet has no column labelled 1990
    """
    #------------------------------
    # Remove all irrelevant columns
    #------------------------------

    # Remove all data from before 1990, i.e. every column left of "1990"
    first_year = list(df.columns).index(1990)
    df = df.drop(columns=list(df.columns[:first_year]))

    # Remove data on growth-rate and share (the last three columns)
    df = df.drop(columns=list(df.columns[-3:]))

    #---------------------------
    # Remove all irrelevant rows
    #---------------------------

    # Remove all rows with any empty cells
    # 0 doesn't make an empty cell
    df = df.dropna()

    # Remove all "Total" and "Other" rows
    # In addition, OECD, Non-OECD, the EU, and the USSR
    # In addition, 8 other countries because they only appear in excel sheets for
    # flaring emissions and nothing else, so no data samples can be made for them
    # Rationale for removing "Other" rows - some countries appear individually
    # in some excel sheets but are lumped into an "Other" row in other sheets,
    # and there is no way to know which portion of an "Other" value belongs
    # to which country
    keywords = ("Total", "Other", "OECD", "European Union", "USSR", "Bolivia",
                "Bahrain", "Syria", "Yemen", "Libya", "Nigeria", "Brunei", "Myanmar")
    # A row is dropped if its label contains any of the keywords
    drop_rows = [row for row in df.index
                 if any(keyword in row for keyword in keywords)]
    df = df.drop(index=drop_rows)

    # -----------------
    # Convert the units
    # -----------------

    # Only emissions data, renewable energy production data and biofuel
    # data are rescaled; any other index name is returned unchanged
    #
    # 1 kWh = 3600 kJ
    # 1 PJ = 1000000000000 kJ
    # 1 TWh = 1000000000 kWh
    unit = df.index.name
    if unit == "Million tonnes of carbon dioxide":
        # Convert to single tonnes
        df = df * 1000000
    elif unit == "Terawatt-hours":
        # Convert to kilowatt-hours
        df = df * 1000000000
    elif unit == "Petajoules":
        if flag:
            # Convert to exajoules
            # (the result must be assigned back to df, otherwise the
            # conversion is lost and the values stay in petajoules)
            df = df * 0.001
        else:
            # Convert to kilowatt-hours
            df = df * (1000000000000/3600)

    return df

# tonnes = metric ton = 1000 kg


In [1071]:
def rowIndices(df:pd.DataFrame):
    """
    Collect the row labels of a dataframe into a plain list.

    Parameters:
    - df (pd.DataFrame): a dataframe containing an excel sheet

    Returns:
    - (list): the labels of df.index, in their original order
    """
    return list(df.index)


In [1072]:
# Clean and rescale every dataframe with processData()

# Emissions -> unit: Tonnes
nrg_emi_df, flar_emi_df, equi_emi_df = (
    processData(sheet) for sheet in (nrg_emi_df, flar_emi_df, equi_emi_df)
)

# Renewable production -> unit: Kilowatt-hours
hydro_pro_df, solar_pro_df, wind_pro_df, geo_pro_df, bio_pro_df = (
    processData(sheet)
    for sheet in (hydro_pro_df, solar_pro_df, wind_pro_df, geo_pro_df, bio_pro_df)
)

# Renewable consumption -> unit: Exajoules
hydro_con_df, solar_con_df, wind_con_df, geo_con_df = (
    processData(sheet)
    for sheet in (hydro_con_df, solar_con_df, wind_con_df, geo_con_df)
)
# The biofuel consumption sheet is in petajoules; the flag requests the
# PJ -> EJ conversion instead of PJ -> kWh
bio_con_df = processData(bio_con_df, True)

# Non-renewable consumption -> unit: Exajoules
oil_con_df, gas_con_df, coal_con_df, nuc_con_df = (
    processData(sheet) for sheet in (oil_con_df, gas_con_df, coal_con_df, nuc_con_df)
)

# Total consumption -> unit: Exajoules
tol_con_df = processData(tol_con_df)

In [1073]:
# Turn every processed dataframe into a numpy array

# Emissions and renewable production (inputs for label making)
nrg_emi, flar_emi, equi_emi = (
    frame.to_numpy() for frame in (nrg_emi_df, flar_emi_df, equi_emi_df)
)
hydro_p, solar_p, wind_p, geo_p, bio_p = (
    frame.to_numpy()
    for frame in (hydro_pro_df, solar_pro_df, wind_pro_df, geo_pro_df, bio_pro_df)
)

# Consumption (inputs for the feature sets)
hydro_c, solar_c, wind_c, geo_c, bio_c = (
    frame.to_numpy()
    for frame in (hydro_con_df, solar_con_df, wind_con_df, geo_con_df, bio_con_df)
)
oil_c, gas_c, coal_c, nuc_c = (
    frame.to_numpy() for frame in (oil_con_df, gas_con_df, coal_con_df, nuc_con_df)
)

tol_c = tol_con_df.to_numpy()

# Debugging aid: len(rowIndices(<dataframe>)) shows how many rows each sheet kept

# Row labels of the dataframes
# There are three distinct lists of countries/regions:
#
# 1. nrg_emi_indices - 83 rows, shared by
#    nrg_emi_df, equi_emi_df,
#    hydro_pro_df, solar_pro_df, wind_pro_df, geo_pro_df,
#    hydro_con_df, solar_con_df, wind_con_df, geo_con_df,
#    oil_con_df, gas_con_df, coal_con_df, nuc_con_df, tol_con_df
#
# 2. flar_emi_indices - 41 rows, used only by flar_emi_df
#
# 3. bio_indices - 24 rows, shared by bio_pro_df and bio_con_df
#
# The same holds for the NDarray equivalents of these dataframes
nrg_emi_indices, flar_emi_indices, bio_indices = (
    rowIndices(frame) for frame in (nrg_emi_df, flar_emi_df, bio_pro_df)
)

<b>nrg_emi_indices:</b>

['Canada', 'Mexico', 'US', 'Argentina', 'Brazil', 'Chile', 'Colombia', 'Ecuador', 'Peru', 'Trinidad & Tobago', 'Venezuela', 'Central America', 'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Netherlands', 'North Macedonia', 'Norway', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Turkey', 'Ukraine', 'United Kingdom', 'Azerbaijan', 'Belarus', 'Kazakhstan', 'Russian Federation', 'Turkmenistan', 'Uzbekistan', 'Iran', 'Iraq', 'Israel', 'Kuwait', 'Oman', 'Qatar', 'Saudi Arabia', 'United Arab Emirates', 'Algeria', 'Egypt', 'Morocco', 'South Africa', 'Eastern Africa', 'Middle Africa', 'Western Africa', 'Australia', 'Bangladesh', 'China', 'China Hong Kong SAR', 'India', 'Indonesia', 'Japan', 'Malaysia', 'New Zealand', 'Pakistan', 'Philippines', 'Singapore', 'South Korea', 'Sri Lanka', 'Taiwan', 'Thailand', 'Vietnam']

<b>flar_emi_indices:</b>

['Canada', 'Mexico', 'US', 'Argentina', 'Brazil', 'Colombia', 'Peru', 'Trinidad & Tobago', 'Venezuela', 'Denmark', 'Germany', 'Italy', 'Netherlands', 'Norway', 'Poland', 'Romania', 'Ukraine', 'United Kingdom', 'Azerbaijan', 'Kazakhstan', 'Russian Federation', 'Turkmenistan', 'Uzbekistan', 'Iran', 'Iraq', 'Kuwait', 'Oman', 'Qatar', 'Saudi Arabia', 'United Arab Emirates', 'Algeria', 'Egypt', 'Australia', 'Bangladesh', 'China', 'India', 'Indonesia', 'Malaysia', 'Pakistan', 'Thailand', 'Vietnam']

<b>bio_indices:</b>

['Canada', 'Mexico', 'US', 'Argentina', 'Brazil', 'Colombia', 'Austria', 'Belgium', 'Finland', 'France', 'Germany', 'Italy', 'Netherlands', 'Poland', 'Portugal', 'Spain', 'Sweden', 'United Kingdom', 'Australia', 'China', 'India', 'Indonesia', 'South Korea', 'Thailand']

<b>Shape of nrg_emi:</b>

(83, 33)

<b>Columns of every dataframe:</b>

Index([1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022],
      dtype='object')


In [1074]:
# 3D numpy array used for label making, shape (33, 83, 8)
# Axis 0 - Years: 33 years, 1990-2022 (inclusive)
# Axis 1 - Countries/Regions: 83 unique countries/regions
# Axis 2 - Carbon Neutral features, 8 of them in this order:
#          energy emissions, flaring emissions, CO2 equivalent emissions,
#          hydroelectric production, solar production, wind production,
#          geothermal production, biofuel production

# Every unique country/region, in alphabetical order
# (the union is a bit redundant: every country in flar_emi_indices and
# bio_indices already appears in nrg_emi_indices)
cotry_reg = sorted(set(nrg_emi_indices + flar_emi_indices + bio_indices))
print(cotry_reg)

dim_1 = []
for year_indx in range(33):
    dim_2 = []
    for area in cotry_reg:
        # Every area has a row in these tables, so no membership check is needed
        indx = nrg_emi_indices.index(area)
        a_nrg_emi, a_equi_emi, a_hydro, a_solar, a_wind, a_geo = (
            table[indx][year_indx]
            for table in (nrg_emi, equi_emi, hydro_p, solar_p, wind_p, geo_p)
        )

        # Flaring and biofuel tables only cover some areas; missing areas count as 0
        a_flar_emi = (
            flar_emi[flar_emi_indices.index(area)][year_indx]
            if area in flar_emi_indices
            else 0.
        )
        a_biofuel = (
            bio_p[bio_indices.index(area)][year_indx]
            if area in bio_indices
            else 0.
        )

        # One sample's set of features, in the documented order
        dim_3 = [a_nrg_emi, a_flar_emi, a_equi_emi,
                 a_hydro, a_solar, a_wind, a_geo, a_biofuel]
        dim_2.append(dim_3)
    dim_1.append(dim_2)

# Label Statistical Review
# Full of floats
lsr = np.array(dim_1)
print(f"lsr shape: {lsr.shape}")


['Algeria', 'Argentina', 'Australia', 'Austria', 'Azerbaijan', 'Bangladesh', 'Belarus', 'Belgium', 'Brazil', 'Bulgaria', 'Canada', 'Central America', 'Chile', 'China', 'China Hong Kong SAR', 'Colombia', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Eastern Africa', 'Ecuador', 'Egypt', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Japan', 'Kazakhstan', 'Kuwait', 'Latvia', 'Lithuania', 'Luxembourg', 'Malaysia', 'Mexico', 'Middle Africa', 'Morocco', 'Netherlands', 'New Zealand', 'North Macedonia', 'Norway', 'Oman', 'Pakistan', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russian Federation', 'Saudi Arabia', 'Singapore', 'Slovakia', 'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Trinidad & Tobago', 'Turkey', 'Turkmenistan', 'US', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'Uzbekistan', 'Venezu

In [1075]:
# 3D numpy array used for classification, shape (33, 83, 9)
# Axis 0 - Years: 33 years, 1990-2022 (inclusive)
# Axis 1 - Countries/Regions: 83 unique countries/regions
# Axis 2 - Energy Consumption features, 9 of them in this order:
#          oil, gas, coal, nuclear, hydroelectric, solar, wind, geothermal, biofuel
# Each feature is divided by the area's total energy consumption for that year

print(cotry_reg)

dim_1 = []
for year_indx in range(33):
    dim_2 = []
    for area in cotry_reg:
        # Every area has a row in these tables, so no membership check is needed
        indx = nrg_emi_indices.index(area)
        tol_consume = tol_c[indx][year_indx]
        a_oil, a_gas, a_coal, a_nuc, a_hydro, a_solar, a_wind, a_geo = (
            table[indx][year_indx] / tol_consume
            for table in (oil_c, gas_c, coal_c, nuc_c, hydro_c, solar_c, wind_c, geo_c)
        )

        # The biofuel table only covers some areas; missing areas count as 0
        a_biofuel = (
            bio_c[bio_indices.index(area)][year_indx] / tol_consume
            if area in bio_indices
            else 0.
        )

        # One sample's set of features, in the documented order
        dim_3 = [a_oil, a_gas, a_coal, a_nuc,
                 a_hydro, a_solar, a_wind, a_geo, a_biofuel]
        dim_2.append(dim_3)
    dim_1.append(dim_2)

# Classification Statistical Review
# Full of floats
csr = np.array(dim_1)
print(f"csr shape: {csr.shape}")


['Algeria', 'Argentina', 'Australia', 'Austria', 'Azerbaijan', 'Bangladesh', 'Belarus', 'Belgium', 'Brazil', 'Bulgaria', 'Canada', 'Central America', 'Chile', 'China', 'China Hong Kong SAR', 'Colombia', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Eastern Africa', 'Ecuador', 'Egypt', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Japan', 'Kazakhstan', 'Kuwait', 'Latvia', 'Lithuania', 'Luxembourg', 'Malaysia', 'Mexico', 'Middle Africa', 'Morocco', 'Netherlands', 'New Zealand', 'North Macedonia', 'Norway', 'Oman', 'Pakistan', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russian Federation', 'Saudi Arabia', 'Singapore', 'Slovakia', 'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Trinidad & Tobago', 'Turkey', 'Turkmenistan', 'US', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'Uzbekistan', 'Venezu

In [1076]:
def makeLabel(features):
    """
    Make a label for a sample

    The first three features are summed as CO2 emissions and the remaining
    features are summed as renewable production, which is turned into a CO2
    offset. The label grades how much CO2 is left after the offset.

    Parameters:
    - features (np.ndarray): set of 8 features

    Returns:
    - label (int): 0 when nothing remains, 1-4 for each further 20% band of
      remaining CO2 (1 = up to 20%, ..., 4 = up to 80%), 5 otherwise
    """
    # Unit: Tonnes of Carbon Dioxide
    emitted = np.sum(features[:3])

    # Renewable production (kWh) converted with the electricity reductions
    # emission factor of 0.000709 tonnes CO2/kWh
    offset = np.sum(features[3:]) * 0.000709

    # CO2 still remaining once the renewable offset is subtracted, floored at 0
    remaining = max(emitted - offset, 0)
    if remaining == 0:
        return 0

    share = (remaining / emitted) * 100

    # Walk the band ceilings in ascending order; the first one that
    # contains the percentage decides the label
    for label, ceiling in enumerate((20.0, 40.0, 60.0, 80.0), start=1):
        if 0.0 < share <= ceiling:
            return label

    # Everything above 80% (and any value outside the bands) gets the top label
    return 5


In [1077]:
# Make labels for all of the data/samples/examples
# An individual feature isn't an example, but a location in a particular year is
# Thus, there are 33 * 83 = 2739 examples

# There are 6 possible labels, 0-5 (see makeLabel)
# 0 means carbon neutral is achieved
# 5 means more than 80% of the CO2 remains after the renewable offset,
# i.e. the country is nowhere near carbon neutrality
labels = np.array([[makeLabel(location) for location in year] for year in lsr])
# Print the message itself; wrapping the f-string in braces printed a set
print(f"labels shape: {labels.shape}")

{'labels shape: (33, 83)'}


In [1078]:
# Split the big dataset into three subsets: labeled training, unlabeled training, and test
# Axis 0 of csr/labels is the year, with index 0 = 1990

# Labeled training subset
# Years 1990-1992
lab_set = csr[0:3]
lab_set_label = labels[0:3]

# Unlabeled training subset
# Years 1993-2020
unlab_set = csr[3:31]
unlab_set_label = labels[3:31]

# Test subset
# Years 2021-2022
test_set = csr[31:]
test_set_label = labels[31:]

# The printed year ranges match the slices above
print("Labeled training subset, Years 1990-1992")
print(f"lab_set shape: {lab_set.shape}")
print(f"lab_set_label shape: {lab_set_label.shape}\n")
print("Unlabeled training subset, Years 1993-2020")
print(f"unlab_set shape: {unlab_set.shape}")
print(f"unlab_set_label shape: {unlab_set_label.shape}\n")
print("Test subset, Years 2021-2022")
print(f"test_set shape: {test_set.shape}")
print(f"test_set_label shape: {test_set_label.shape}")

Labeled training subset, Years 1990-1991
lab_set shape: (3, 83, 9)
lab_set_label shape: (3, 83)

Unlabeled training subset, Years 1992-2020
unlab_set shape: (28, 83, 9)
unlab_set_label shape: (28, 83)

Test subset, Years 2021-2022
test_set shape: (2, 83, 9)
test_set_label shape: (2, 83)


<u>Classification Plan</u>
- 1st Training phase: Train a classifier on the first three labeled years of the data (1990-1992)
- 2nd Training phase: Use the classifier and batch active learning on the unlabeled years (1993-2020). Examples that would provide the most information will be chosen to get their true label. The remaining examples will get pseudo-labeled
- 1st Evaluate phase: Use the newly trained classifier to evaluate the data from 2021 and 2022
- Predict phase: Use time series forecasting (RNN) to predict a country's set of features until 2050
- 2nd Evaluate phase: Use the classifier to predict levels of CN

In [1079]:
# accuracy_fn()
def accuracy_fn(pred_labels, true_labels):
    """Returns the accuracy of a set of predicted labels

    Labels are compared pairwise, position by position.

    Parameters:
    - pred_labels (ndarray): array of predicted labels, shape=(# of examples,)
    - true_labels (ndarray): array of true labels, shape=(# of examples,)

    Returns:
    - accuracy (float): fraction of predictions equal to their true label
    """
    # 1 for every matching pair, 0 for every mismatch
    hits = [1 if guess == truth else 0
            for guess, truth in zip(pred_labels, true_labels)]

    return np.sum(hits) / len(true_labels)
    

In [1080]:
# Scratch check of np.concatenate on a 3D array: joining a (3, 83, 8) array
# along axis 0 stacks its three leading slices into one (249, 8) array
x = np.random.random((3, 83, 8))
y = np.concatenate(x, axis=0)
y.shape

(249, 8)

In [1081]:
# Batch Active Learning
def train_labeled(classifier, lab_data, lab_label, epoch):
    """Train on the current labeled dataset

    All years are flattened into one training set because a single year's
    worth of data does not necessarily contain examples of every class. A
    classifier fitted on data covering only some of the classes would make
    predict_proba() return probabilities for those classes only.

    Parameters:
    - classifier (classifier type): classifier-in-training
    - lab_data (ndarray): current labeled data, shape=(# of years, 83, 9)
    - lab_label (ndarray): labels for current labeled data, shape=(# of years, 83)
    - epoch (int): cycles to run the training for

    Returns:
    - classifier (classifier type): trained classifier
    """
    # The flattened arrays are the same for every epoch, so build them once
    # labeled_examples.shape = (# of years * 83, 9)
    # flat_labels.shape = (# of years * 83,)
    labeled_examples = np.concatenate(lab_data, axis=0)
    flat_labels = np.concatenate(lab_label, axis=0)

    # NOTE(review): scikit-learn estimators normally restart training on every
    # fit() call unless warm_start is enabled, so epoch > 1 probably just
    # repeats the same fit - confirm before relying on it
    for _ in range(epoch):
        classifier.fit(labeled_examples, flat_labels)

    return classifier

def predict_unlabeled(classifier, batch_data):
    """Predict on the unlabeled data of a year

    Parameters:
    - classifier (classifier type): a trained classifier
    - batch_data (ndarray): a year's worth of unlabeled data, shape=(83, 9)

    Returns:
    - pred_class (ndarray): array of predicted classes, shape=(83,)
    - pred_proba (ndarray): array of per-class probabilities for every
      example, shape=(83, # of classes)
    """
    hard_labels = classifier.predict(batch_data)
    class_scores = classifier.predict_proba(batch_data)

    # Both results are copied into fresh ndarrays before being returned
    return np.array(hard_labels), np.array(class_scores)
    
def batch_active_learning(classifier, lab_data, lab_label, unlab_data, unlab_label, confident_threshold, epoch, batch_size=4):
    """Train a classifier using batch active learning

    Each episode retrains the classifier on everything labeled so far, then
    predicts on the next `batch_size` years of unlabeled data. Predictions
    whose highest class probability is below the threshold are replaced by
    the true label (a "label request"); the rest stay as pseudo-labels. The
    whole batch then joins the labeled dataset.

    Parameters:
    - classifier: a classifier from the scikit-learn (sklearn) module
    - lab_data (ndarray): the labeled dataset, initial shape=(3, 83, 9)
    - lab_label (ndarray): the labeled dataset's labels, initial shape=(3, 83)
    - unlab_data (ndarray): the unlabeled dataset, initial shape=(28, 83, 9)
    - unlab_label (ndarray): the unlabeled dataset's labels, initial shape=(28, 83)
    - confident_threshold (float): threshold for the algorithm to request labels
    - epoch (int): number of epochs training will last for
    - batch_size (int): years of unlabeled data consumed per episode (1 year
      is a batch); a shorter final episode is used when the number of years
      is not a multiple of batch_size

    Returns:
    - classifier (classifier type): trained classifier

    Raises:
    - ValueError: if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    episode = 1
    # Walk over the unlabeled years in steps of batch_size; the bound comes
    # from the data itself instead of a hard-coded number of years
    for index in range(0, len(unlab_data), batch_size):
        print(f"Episode {episode}: ")

        classifier = train_labeled(classifier, lab_data, lab_label, epoch)

        # Next years of unlabeled data and their true labels
        # batch_data.shape = (<= batch_size, 83, 9)
        # batch_label.shape = (<= batch_size, 83)
        batch_data = np.asarray(unlab_data[index:index + batch_size])
        batch_label = np.asarray(unlab_label[index:index + batch_size])

        # Predict on every year of the batch
        pred_class = []
        pred_proba = []
        for batch in batch_data:
            prediction_class, pred_probability = predict_unlabeled(classifier, batch)
            pred_class.append(prediction_class)
            pred_proba.append(pred_probability)
        print(f"score: {classifier.score(np.concatenate(batch_data, axis=0), np.concatenate(batch_label, axis=0))}")

        # Choose which examples to request a true label for, and replace
        # their predicted label with the true one. pred_class, pred_proba and
        # batch_label share the same ordering, so the same position in each
        # refers to the same example
        uncertain = 0
        for year_pred, year_proba, year_true in zip(pred_class, pred_proba, batch_label):
            # An example is uncertain when its most likely class is below the threshold
            needs_label = np.max(year_proba, axis=1) < confident_threshold
            year_pred[needs_label] = year_true[needs_label]
            uncertain += int(np.sum(needs_label))

        print(f"{uncertain} label request(s) made")

        # Add the newly pseudo-labeled, and any true-labeled, examples to the labeled data set
        lab_data = np.append(lab_data, batch_data, axis=0)
        lab_label = np.append(lab_label, pred_class, axis=0)

        episode += 1

    # Train one last time with all the passed examples, labeled and pseudo-labeled
    classifier = train_labeled(classifier, lab_data, lab_label, epoch)

    return classifier

In [1082]:
# Batch active learning hyperparameters aka model parameters
# These are chosen by hand, unlike the real model parameters that are estimated by the model itself

# Passed to the classifier constructor as n_estimators
n_estimators = 1000
# NOTE(review): not referenced by any code visible in this part of the
# notebook - presumably meant for HistGradientBoostingClassifier; confirm
max_iter = 1000
# Passed to the classifier constructor as learning_rate
learning_rate = 0.01
# Passed to the classifier constructor as max_depth
max_depth = 50
# Predictions whose highest class probability is below this value get their true label requested
confident_threshold = 0.70
# Number of fit cycles per training call in train_labeled()
epoch = 1
# NOTE(review): makeLabel() only produces the labels 0-5 (6 classes); 11
# looks like a leftover from an earlier labelling scheme - confirm before use
n_classes = 11

In [1083]:
# 1st Training phase: Train a classifier on the labeled years of the data (lab_set)
# 2nd Training phase: Use the classifier and batch active learning on the unlabeled years (unlab_set).
# Low-confidence examples get their true label; the remaining examples get pseudo-labeled


# Gaussian Naive Bayes isn't an option because the data distribution isn't gaussian/normal due to lacking a "symmetric bell shape".
# Most of the data labels are on the high end of the scale. Thus, the data's bell shape isn't symmetric
# Bernoulli Naive Bayes isn't an option because sample features must be binary-valued (Bernoulli, boolean)
# Multinomial, Complement, and Categorical aren't considered due to data being classified more out of probability than certainty.

# Alternative classifier that was tried:
# classifier = RandomForestClassifier(n_estimators=n_estimators, criterion="log_loss", max_depth=max_depth)


classifier = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

# Copies are passed so the original subsets are not shared with the learning loop
classifier = batch_active_learning(classifier, 
                                   np.copy(lab_set), np.copy(lab_set_label), 
                                   np.copy(unlab_set), np.copy(unlab_set_label), 
                                   confident_threshold, epoch)

Episode 1: 


score: 0.9457831325301205
7 label request(s) made
Episode 2: 
score: 0.9096385542168675
14 label request(s) made
Episode 3: 
score: 0.9006024096385542
17 label request(s) made
Episode 4: 
score: 0.9096385542168675
9 label request(s) made
Episode 5: 
score: 0.8614457831325302
15 label request(s) made
Episode 6: 
score: 0.8162650602409639
4 label request(s) made
Episode 7: 
score: 0.7349397590361446
12 label request(s) made


In [1084]:
# Save the trained classifier to disk; the evaluation cell reloads it from this path
joblib.dump(classifier, "models/grad_boost1.pkl")

['models/grad_boost1.pkl']

In [1085]:
# evaluate()

def evaluate(classifier, test_data, test_label):
    """Have a classifier evaluate test data and return the accuracy

    The accuracy is computed separately for each year, the per-year values
    are printed, and their mean is returned as a percentage.

    Parameters:
    - classifier: a classifier trained on labeled and pseudo-labeled data
    - test_data (ndarray): the test dataset, initial shape=(2, 83, 9)
    - test_label (ndarray): the test dataset's labels, initial shape=(2, 83)

    Returns:
    - accuracy (float): mean per-year accuracy of the classifier's predictions
      on the test dataset, as a percentage rounded to 2 decimals
    """
    # One accuracy per year: each year is a (examples, features) array with
    # a matching 1D array of true labels
    accuracies = [
        accuracy_fn(np.array(classifier.predict(year_examples)), year_labels)
        for year_examples, year_labels in zip(test_data, test_label)
    ]
    print(accuracies)

    return round(np.mean(accuracies) * 100, 2)
    

In [1086]:
# 1st Evaluate phase: Use the newly trained classifier to evaluate the data from 2021 and 2022
# The classifier is reloaded from the file written by the joblib.dump() cell
accuracy = evaluate(joblib.load("models/grad_boost1.pkl"), test_set, test_set_label)
# Display the accuracy (a percentage) as the cell output
accuracy

[0.6746987951807228, 0.7108433734939759]


69.28

In [166]:
# Which label classes occur in each of the first three years of the full label array
for year_labels in labels[:3]:
    print(set(year_labels))

# Same check on the labeled training subset
for year_labels in lab_set_label[:3]:
    print(set(year_labels))

{0, 1, 2, 3, 5, 6, 7, 8, 9, 10}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
{0, 1, 4, 5, 6, 7, 8, 9, 10}
{0, 1, 2, 3, 5, 6, 7, 8, 9, 10}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
{0, 1, 4, 5, 6, 7, 8, 9, 10}


In [53]:
# RNN classifier class
class RecurrentNeuralNetwork(nn.Module):
    """Small fully connected network: 8 inputs -> 16 -> 16 -> 16 -> 1 output.

    Three Linear + ReLU stages are followed by a final Linear layer.
    NOTE(review): despite the name, no recurrent layer is defined here -
    confirm whether one is still to be added for the forecasting phase.
    """

    def __init__(self):
        super().__init__()
        # Hidden stages; nn.ReLU() takes no parameters here
        self.linear1 = nn.Linear(in_features=8, out_features=16)
        self.activation1 = nn.ReLU()
        self.linear2 = nn.Linear(in_features=16, out_features=16)
        self.activation2 = nn.ReLU()
        self.linear3 = nn.Linear(in_features=16, out_features=16)
        self.activation3 = nn.ReLU()
        # Output layer producing a single value per example
        self.dense1 = nn.Linear(in_features=16, out_features=1)
        # Batch norm, flatten, dropout, further dense layers and softmax
        # were considered but are not part of the model

    def forward(self, x):
        """Run x through the three hidden stages and the output layer."""
        hidden_stages = (
            (self.linear1, self.activation1),
            (self.linear2, self.activation2),
            (self.linear3, self.activation3),
        )
        for layer, activation in hidden_stages:
            x = activation(layer(x))

        return self.dense1(x)

In [59]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Instantiate the network and move it to the chosen device
RNN = RecurrentNeuralNetwork().to(device)
# NOTE(review): the network ends in a single output (out_features=1), while
# nn.CrossEntropyLoss is normally fed one score per class - confirm the
# intended loss function / output size
loss_fn = nn.CrossEntropyLoss()
# Adam over all of the network's parameters
optimizer = optim.Adam(RNN.parameters(), lr=0.0001)

In [None]:
# Time series forecasting