# Active Learning Algorithm

In [4]:
import re


import nltk
import numpy

import warnings

from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from gensim import corpora, similarities
from gensim.models import TfidfModel
from gensim.corpora import Dictionary, MmCorpus
from gensim.similarities import Similarity
from nltk.stem import PorterStemmer


from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

import pandas as pd
import preprocessor as tp

#from imblearn.ensemble import BalancedBaggingClassifier
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

import random
import copy
import argparse
import scipy.sparse as sp
import numpy as np
import time
import math
from random import randint
import queue
import pickle
import os
from multiprocessing import Pool as ProcessPool
import itertools
from functools import partial

# Settings


In [5]:
# Download the NLTK 'punkt' package (text_preprocessing tokenizes with nltk.word_tokenize).
nltk.download('punkt')

# Suppress all Python warnings from this point on.
warnings.filterwarnings("ignore")

# Module-level stemmer instance; text_preprocessing stems every token with it.
ps = PorterStemmer()

# Lift pandas' display limits so DataFrames are shown without row/column truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

#tp.set_options(tp.OPT.URL, tp.OPT.MENTION)

# Seed NumPy's and Python's global random number generators with a fixed value.
# NOTE(review): np.random.seed() returns None, so `rng` is bound to None here,
# not to a generator object -- confirm nothing relies on `rng` being a generator.
rng = np.random.seed(5)
random.seed(5)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/brianllinas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Creation of Functions

## def active_learning_multi_processing

In [8]:
def calculate_entropy(p, q):
    """Return the entropy, in bits, of the two probabilities ``p`` and ``q``.

    A zero in either argument short-circuits to 0, which avoids taking
    the logarithm of zero.
    """
    if 0 in (p, q):
        return 0
    p_term = p * math.log(p, 2)
    q_term = q * math.log(q, 2)
    return -(p_term + q_term)

# The actual active learning loop for TREC runs here, for one particular topicID.
# It is run over either all documents in the collection
# or only the documents in the official qrels.

def _build_classifier(al_classifier):
    """Return a new, unfitted classifier for the short name ``al_classifier``.

    Supported names: 'LR', 'SVM', 'RF', 'RFN', 'RFN100', 'NB', 'Ada', 'Xgb',
    'BagLR', 'BagNB' and 'Vot'.

    Raises:
        ValueError: if ``al_classifier`` is not one of the supported names.
    """
    if al_classifier == 'LR':
        return LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                                  intercept_scaling=1, l1_ratio=None, max_iter=100,
                                  n_jobs=None, penalty='l2',
                                  random_state=None, solver='saga', tol=0.0001, verbose=0,
                                  warm_start=False)
    if al_classifier == 'SVM':
        return SVC(C=1.0, kernel='linear', degree=3, gamma='auto', probability=True)
    if al_classifier == 'RF':
        return RandomForestClassifier(n_estimators=10, max_depth=10, random_state=0)
    if al_classifier == 'RFN':
        return RandomForestClassifier(n_estimators=10, max_depth=None, random_state=0)
    if al_classifier == 'RFN100':
        return RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0)
    if al_classifier == 'NB':
        return MultinomialNB()
    if al_classifier == 'Ada':
        # base model is a decision tree; logistic regression will not help
        return AdaBoostClassifier(n_estimators=50, learning_rate=1)
    if al_classifier == 'Xgb':
        return XGBClassifier(random_state=1, learning_rate=0.01)
    if al_classifier == 'BagLR':
        # NOTE(review): large_data_solver and large_data_C_parameter are not defined
        # in this function; they are expected to exist at module level.
        LRmodel = LogisticRegression(solver=large_data_solver, C=large_data_C_parameter, max_iter=200)
        # max_samples must be the float 1.0 to use all samples; the int 1 would
        # draw exactly one sample per estimator.
        return BaggingClassifier(LRmodel, n_estimators=5, max_samples=1.0)
    if al_classifier == 'BagNB':
        # float max_samples: draw max_samples * X.shape[0] samples per estimator
        return BaggingClassifier(MultinomialNB(), n_estimators=5, max_samples=0.5)
    if al_classifier == 'Vot':
        # NOTE(review): same module-level names as in the 'BagLR' branch.
        LRmodel = LogisticRegression(solver=large_data_solver, C=large_data_C_parameter, max_iter=200)
        NBmodel = MultinomialNB()
        return VotingClassifier(estimators=[('lr', LRmodel), ('nb', NBmodel)], voting='soft')
    raise ValueError("unknown al_classifier: %r" % (al_classifier,))


def active_learning_multi_processing(topicID, df, al_protocol, al_classifier, document_collection, topic_seed_info,
                                     topic_complete_qrels_address, train_per_centage, use_pooled_budget,
                                     per_topic_budget_from_trec_qrels, feature_type):
    """Run the active learning loop for a single topic.

    Starting from the seed documents of ``topicID``, the function repeatedly
    trains a fresh classifier on the current training set, predicts the
    remaining documents, records the scores, and then moves more documents
    into the training set according to ``al_protocol``.

    Args:
        topicID: key into ``topic_seed_info`` (and into
            ``per_topic_budget_from_trec_qrels`` when the pooled budget is used).
        df: DataFrame whose ``'majority_label'`` column holds the labels; labels
            are ordered by sorted row index and cast to int.
        al_protocol: 'CAL' (highest probability of class 1 first), 'SAL'
            (highest entropy first) or 'SPL' (random order).
        al_classifier: short classifier name, see ``_build_classifier``.
        document_collection: feature container indexed by document index, both
            with a single index and with a list of indices.
        topic_seed_info: mapping from topic id to the list of seed document
            indices. The list is copied, the caller's list is not modified.
        topic_complete_qrels_address: unused by this function.
        train_per_centage: sequence indexed by the iteration number (starting
            at 1); each entry is the fraction of the budget / of all documents
            that the training set may reach in that iteration.
        use_pooled_budget: when 1, fractions are applied to
            ``per_topic_budget_from_trec_qrels[topicID]`` and the loop stops
            once the training set reaches that budget; otherwise fractions are
            applied to the total number of documents.
        per_topic_budget_from_trec_qrels: mapping from topic id to its budget.
        feature_type: 'tfidf', 'bert' or 'robert'.

    Returns:
        dict keyed by ``iteration - 1`` (0 is the seed set). Each value is the
        tuple ``(topicID, f1score, precision, recall, train_index_list copy,
        test_index_list, y_pred, pooled_document_count, prevalence)``.

    Raises:
        ValueError: for an unknown ``al_protocol``, ``al_classifier`` or
            ``feature_type``.
    """
    if al_protocol not in ('CAL', 'SAL', 'SPL'):
        raise ValueError("unknown al_protocol: %r" % (al_protocol,))
    if feature_type not in ('tfidf', 'bert', 'robert'):
        raise ValueError("unknown feature_type: %r" % (feature_type,))

    # Copy the seed list: it grows below and must not leak back into topic_seed_info.
    train_index_list = list(topic_seed_info[topicID])

    original_labels = dict(df['majority_label'].items())
    original_label_list = list(original_labels.values())
    number_of_1 = sum(1 for label in original_label_list if label == 1.0)

    # y must be an np.array, otherwise y[train_index_list] does not work directly on a list.
    # The int cast is needed because an object array makes sklearn fail with
    # "Unknown label type: 'unknown'".
    y = np.array([original_labels[k] for k in sorted(original_labels)]).astype('int')

    total_documents = len(y)
    total_document_set = set(np.arange(0, total_documents, 1))

    # Documents outside the seed set form the initial test pool.
    # test_index_dictionary maps a position in that pool back to the document index.
    initial_X_test = []
    test_index_dictionary = {}
    test_index_counter = 0
    seed_index_set = set(train_index_list)

    for doc_index in range(0, total_documents):
        if doc_index not in seed_index_set:
            initial_X_test.append(document_collection[doc_index])
            test_index_dictionary[test_index_counter] = doc_index
            test_index_counter += 1

    predicatbleSize = len(initial_X_test)
    isPredictable = [1] * predicatbleSize  # initially we will predict all

    # initializing the train_size controller
    train_size_controller = len(train_index_list)
    loopCounter = 1  # loop starts from 1 because 0 is for seed_set
    topic_all_info = {}  # key is loopCounter - 1

    while True:
        # The model is retrained from scratch on every document selected so far;
        # in the first pass that is exactly the seed set.
        model = _build_classifier(al_classifier)
        model.fit(document_collection[train_index_list], y[train_index_list])

        test_index_list = list(total_document_set - set(train_index_list))
        # NOTE(review): this intersects document indices with label values -- confirm
        # that this is the intended definition of the pooled document count.
        pooled_document_count = len(set(train_index_list).intersection(set(original_label_list)))

        remaining = isPredictable.count(1)

        if remaining != 0:
            # One prediction per remaining document, in test_index_list order.
            y_pred = [model.predict(document_collection[test_index_elem])
                      for test_index_elem in test_index_list]
            # Training documents count as correctly labelled; axis=None flattens everything.
            y_actual = np.concatenate((y[train_index_list], y[test_index_list]), axis=None)
            y_pred_all = np.concatenate((y[train_index_list], y_pred), axis=None)
        else:  # everything is in the train set
            y_pred = y
            y_actual = y
            y_pred_all = y
            test_index_list = train_index_list

        f1score = f1_score(y_actual, y_pred_all, average='binary')
        precision = precision_score(y_actual, y_pred_all, average='binary')
        recall = recall_score(y_actual, y_pred_all, average='binary')

        number_of_1_found_so_far = list(y[train_index_list]).count(1)
        # Guard against a topic without any positive label.
        prevalence = (number_of_1_found_so_far * 1.0) / number_of_1 if number_of_1 else 0.0

        # save all info using (loopCounter - 1)
        # the train list must be deep-copied, otherwise every entry would point to
        # the same list as it looks at the final iteration
        topic_all_info[loopCounter - 1] = (topicID, f1score, precision, recall, copy.deepcopy(train_index_list), test_index_list, y_pred, pooled_document_count, prevalence)

        # everything is in the train list, so there is nothing left to predict
        # and no further training is needed
        if remaining == 0:
            break
        # With a pooled budget the loop may only stop after the model trained on the
        # full budget has been evaluated, which is why this check comes after the
        # scoring above. `<=` also stops when the seed set already exceeds the budget.
        if use_pooled_budget == 1 and per_topic_budget_from_trec_qrels[topicID] <= len(train_index_list):
            break

        my_queue = queue.PriorityQueue(remaining)

        # used for SPL only
        randomArray = []

        for counter in range(0, predicatbleSize):
            if isPredictable[counter] != 1:
                continue
            if al_protocol == 'SPL':
                randomArray.append(counter)
                continue
            # predict_proba returns one row per input, so [0] is the row of our
            # single document
            y_prob = model.predict_proba(initial_X_test[counter])[0]
            if al_protocol == 'CAL':
                # negated because the priority queue pops the smallest value first
                val = (-1) * y_prob[1]
            else:  # 'SAL'
                val = (-1) * calculate_entropy(y_prob[0], y_prob[1])
            my_queue.put((val, counter))

        if use_pooled_budget == 1:
            size_limit = math.ceil(train_per_centage[loopCounter] * per_topic_budget_from_trec_qrels[topicID])
        else:
            size_limit = math.ceil(train_per_centage[loopCounter] * total_documents)

        if al_protocol == 'SPL':
            random.shuffle(randomArray)
            for itemIndex in randomArray:
                if train_size_controller >= size_limit:
                    break
                isPredictable[itemIndex] = 0
                train_index_list.append(test_index_dictionary[itemIndex])
                train_size_controller += 1
        else:
            while not my_queue.empty():
                if train_size_controller >= size_limit:
                    break

                # item is a tuple: item[0] is the priority value, item[1] the pool index
                item = my_queue.get()
                isPredictable[item[1]] = 0  # not predictable any more

                train_index_list.append(test_index_dictionary[item[1]])
                train_size_controller += 1

        loopCounter += 1
    return topic_all_info


## def active_learning

In [None]:
def active_learning(topic_list, df, al_protocol, al_classifier, document_collection, topic_seed_info, 
                    topic_complete_qrels_address, train_per_centage, data_path, file_name, use_pooled_budget, 
                    per_topic_budget_from_trec_qrels, feature_type):
    """Run active learning for every topic and pickle each topic's result.

    Each topic in ``topic_list`` is handed to ``active_learning_multi_processing``
    through a single-process worker pool. The dictionary returned for a topic is
    written to ``data_path + file_name + str(topicID) + '.pickle'``; ``data_path``
    is concatenated as-is, so it has to end with a path separator.

    All other arguments are forwarded unchanged to
    ``active_learning_multi_processing``.
    """
    partial_active_learning_multi_processing = partial(
        active_learning_multi_processing, df=df, al_protocol=al_protocol, al_classifier=al_classifier,
        document_collection=document_collection, topic_seed_info=topic_seed_info,
        topic_complete_qrels_address=topic_complete_qrels_address,
        train_per_centage=train_per_centage, use_pooled_budget=use_pooled_budget,
        per_topic_budget_from_trec_qrels=per_topic_budget_from_trec_qrels,
        feature_type=feature_type)

    # The context managers release the worker process and the progress bar even
    # when a topic fails.
    with ProcessPool(processes=1) as workers, tqdm(total=len(topic_list)) as pbar:
        for topic_all_info in workers.imap_unordered(partial_active_learning_multi_processing, topic_list):
            topicID = topic_all_info[0][0]  # first 0 is the loopCounter index, second 0 the first tuple field
            file_complete_path = data_path + file_name + str(topicID) + '.pickle'
            with open(file_complete_path, "wb") as pickle_file:
                pickle.dump(topic_all_info, pickle_file)
            pbar.update()

## def text_preprocessing

In [None]:
def text_preprocessing(s):
    """Normalise a piece of text and return the list of its stemmed tokens.

    The input is converted to a lower-cased string, cleaned with a series of
    regular expressions, passed through ``tp.clean`` (the ``preprocessor``
    package), tokenized with ``nltk.word_tokenize`` and stemmed with the
    module-level Porter stemmer ``ps``.

    Args:
        s: the text to process; any object is accepted and converted with ``str``.

    Returns:
        list of stemmed tokens.
    """
    s = str(s)
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Collapse runs of whitespace and strip both ends
    s = re.sub(r'\s+', ' ', s).strip()
    s = s.replace("https", "")

    # NOTE(review): the text is already lower-cased and '@' / ':' were replaced by
    # the punctuation step above, so the next two statements cannot match anything
    # here -- confirm the intended order of these steps.
    s = re.compile('RT @').sub('@', s, count=1)
    s = s.replace(":", "")
    # tp is the preprocessor module; the original `s.tp.clean(s)` raised
    # AttributeError because str has no attribute `tp`.
    s = tp.clean(s)

    return [ps.stem(token) for token in nltk.word_tokenize(s)]