<u>Classification Plan</u>
- 1st Training phase: Train a classifier on the first two labeled years of the data
- 2nd Training phase: Use the classifier and batch active learning on the remaining unlabeled data up to 2021. The examples the classifier is least confident about (the ones that would provide the most information) are chosen to receive their true label. The remaining examples are pseudo-labeled with the classifier's own predictions
- 1st Evaluation phase: Evaluate the newly trained classifier on the data from 2021 and 2022
- Predict phase: Use time series forecasting (RNN) to predict a country's set of features until 2050
- 2nd Evaluate phase: Use the classifier to predict levels of CN

In [None]:
# Batch Active Learning
def train_labeled(classifier, lab_data, lab_label, epoch):
    """Train on the current labeled dataset

    All years of labeled data are flattened into a single training set
    before fitting, because not every year's worth of data contains
    examples of every class.
    ex. If the classifier trains on data belonging to only 8/11 classes,
    predict_proba() will only return probabilities for these 8/11 classes
    and will ignore the possibility of the 3 others

    Parameters:
    - classifier (classifier type): classifier-in-training, must provide fit(X, y)
    - lab_data (ndarray): current labeled data, shape=(# of years, 83, 9)
    - lab_label (ndarray): labels for current labeled data, shape=(# of years, 83)
    - epoch (int): number of times fit() is called on the labeled data

    Returns:
    - classifier (classifier type): trained classifier (the same object that was passed in)
    """
    if epoch <= 0:
        # No training cycle requested: leave the classifier untouched
        return classifier

    # The training set is identical for every epoch, so build it once
    # labeled_examples.shape = (# of years * 83, 9)
    # labels.shape = (# of years * 83,)
    labeled_examples = np.concatenate(lab_data, axis=0)
    labels = np.concatenate(lab_label, axis=0)

    # NOTE(review): for scikit-learn estimators without warm_start, each
    # fit() call presumably retrains from scratch, so epoch > 1 would only
    # repeat the same work -- confirm before raising epoch
    for _ in range(epoch):
        classifier.fit(labeled_examples, labels)

    return classifier

def predict_unlabeled(classifier, batch_data):
    """Run a trained classifier over one year of unlabeled examples

    Parameters:
    - classifier (classifier type): a trained classifier providing predict() and predict_proba()
    - batch_data (ndarray): a year's worth of unlabeled data, shape=(83, 9)

    Returns:
    - pred_class (ndarray): predicted class of every example, shape=(83,)
    - pred_proba (ndarray): class probabilities of every example, one row per example
      (presumably one column per class seen during training -- TODO confirm the column count)
    """
    hard_labels = classifier.predict(batch_data)
    soft_labels = classifier.predict_proba(batch_data)

    # Both results are copied into fresh ndarrays before being handed back
    return np.array(hard_labels), np.array(soft_labels)
    
def batch_active_learning(classifier, lab_data, lab_label, unlab_data, unlab_label,
                          confident_threshold, epoch, batch_size=4):
    """Train a classifier using batch active learning

    Each episode:
    1. trains the classifier on the current labeled set,
    2. predicts the next `batch_size` years of unlabeled data,
    3. replaces the prediction with the true label for every example whose
       highest class probability is below `confident_threshold`
       (a "label request"),
    4. adds the whole batch, with its pseudo-labels and requested true
       labels, to the labeled set.

    The arrays passed in are not modified in place; the labeled set is
    grown through np.append(), which returns new arrays.

    Parameters:
    - classifier: a classifier from the scikit-learn (sklearn) module
    - lab_data (ndarray): the labeled dataset, initial shape=(3, 83, 9)
    - lab_label (ndarray): the labeled dataset's labels, initial shape=(3, 83)
    - unlab_data (ndarray): the unlabeled dataset, initial shape=(28, 83, 9)
    - unlab_label (ndarray): the unlabeled dataset's labels, initial shape=(28, 83)
    - confident_threshold (float): threshold for the algorithm to request labels
    - epoch (int): number of epochs training will last for
    - batch_size (int): number of years handled per episode (1 year is a batch);
      the last episode may hold fewer years if the data doesn't divide evenly

    Returns:
    - classifier (classifier type): trained classifier

    Raises:
    - ValueError: if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Walk over however many years were actually supplied instead of assuming 28
    episode_starts = range(0, len(unlab_data), batch_size)
    for episode, index in enumerate(episode_starts, start=1):
        print(f"Episode {episode}: ")

        classifier = train_labeled(classifier, lab_data, lab_label, epoch)

        # Slice out this episode's years
        # batch_data.shape = (years in episode, 83, 9)
        # batch_label.shape = (years in episode, 83)
        batch_data = np.asarray(unlab_data[index:index + batch_size])
        batch_label = np.asarray(unlab_label[index:index + batch_size])

        # Predict one year at a time, then stack the per-year results
        # pred_class.shape = (years in episode, 83)
        # pred_proba.shape = (years in episode, 83, # of predicted classes)
        predictions = [predict_unlabeled(classifier, year) for year in batch_data]
        pred_class = np.stack([year_class for year_class, _ in predictions])
        pred_proba = np.stack([year_proba for _, year_proba in predictions])

        score = classifier.score(np.concatenate(batch_data, axis=0),
                                 np.concatenate(batch_label, axis=0))
        print(f"score: {score}")

        # Choose which examples to request a true label for: those whose most
        # likely class is still below the confidence threshold.
        # pred_class, pred_proba, batch_data and batch_label share the same
        # example order, so one boolean mask indexes all of them consistently
        uncertain_mask = pred_proba.max(axis=-1) < confident_threshold
        uncertain = int(np.count_nonzero(uncertain_mask))
        pred_class[uncertain_mask] = batch_label[uncertain_mask]

        print(f"{uncertain} label request(s) made")

        # Add the newly pseudo-labeled, and any true-labeled, examples to the labeled data set
        lab_data = np.append(lab_data, batch_data, axis=0)
        lab_label = np.append(lab_label, pred_class, axis=0)

    # Train one last time with all the passed examples, labeled and pseudo-labeled
    classifier = train_labeled(classifier, lab_data, lab_label, epoch)

    return classifier

In [None]:
# Hand-picked settings for the batch active learning experiment.
# Unlike real model parameters, these are not estimated by the model itself.

# Classifier construction
n_estimators = 1_000
max_depth = 50
learning_rate = 0.01
max_iter = 1_000

# Active learning loop
confident_threshold = 0.7  # below this top probability, the true label is requested
epoch = 1
n_queries = 5

# Dataset
n_classes = 11

In [None]:
# 1st Training phase: Train a classifier on the first two labeled years of the data
# 2nd Training phase: Use the classifier and batch active learning on the rest of the
# unlabeled data until 2021. Examples that would provide the most information will be
# chosen to get their true label. The remaining examples will get pseudo-labeled
#
# Why no Naive Bayes variant is used:
# - Gaussian: the data distribution isn't gaussian/normal because it lacks a
#   "symmetric bell shape" (most of the data labels are on the high end of the scale)
# - Bernoulli: sample features must be binary-valued (Bernoulli, boolean)
# - Multinomial, Complement, Categorical: not considered because the data is
#   classified more out of probability than out of certainty

classifier = RandomForestClassifier(
    n_estimators=n_estimators,
    criterion="log_loss",
    max_depth=max_depth,
)

# Alternative classifier, currently disabled:
# classifier = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

# Copies are passed in so the original data sets are not shared with the training run
classifier = batch_active_learning(
    classifier,
    lab_data=np.copy(lab_set),
    lab_label=np.copy(lab_set_label),
    unlab_data=np.copy(unlab_set),
    unlab_label=np.copy(unlab_set_label),
    confident_threshold=confident_threshold,
    epoch=epoch,
)

In [None]:
# RNN classifier class
class RecurrentNeuralNetwork(nn.Module):
    """Small fully connected network mapping 8 input features to 1 output value

    Despite the class name, the layers defined here are three Linear + ReLU
    stages followed by a final Linear layer; no recurrent layer is used.
    Batch norm, flatten, dropout, additional dense layers and a softmax were
    sketched for this model but are currently disabled.
    """

    def __init__(self):
        super().__init__()
        # Hidden stack: 8 -> 16 -> 16 -> 16, each Linear followed by a ReLU
        # (nn.ReLU() doesn't need parameters in this case)
        self.linear1 = nn.Linear(8, 16)
        self.activation1 = nn.ReLU()
        self.linear2 = nn.Linear(16, 16)
        self.activation2 = nn.ReLU()
        self.linear3 = nn.Linear(16, 16)
        self.activation3 = nn.ReLU()
        # Output head: 16 -> 1, no activation applied afterwards
        self.dense1 = nn.Linear(16, 1)

    def forward(self, x):
        """Pass x through the three hidden stages and the output head"""
        hidden = self.activation1(self.linear1(x))
        hidden = self.activation2(self.linear2(hidden))
        hidden = self.activation3(self.linear3(hidden))
        return self.dense1(hidden)

In [None]:
# Scratch check of how np.transpose reorders the axes of a 3-D array.
# The values 0..29 are laid out as 5 blocks of 3 rows by 2 columns
a = np.arange(30).reshape(5, 3, 2)

print(a.shape)

# Swap the first two axes; the last axis stays where it is
a = a.transpose(1, 0, 2)

print(a.shape)

a

In [None]:
# Scratch check of column slicing: split a random 2x4 array into its
# first two columns (y) and its last two columns (z)
x = np.random.random((2, 4))
y, z = x[:, :2], x[:, 2:]
print(x, y, z, sep="\n")