In [66]:
import numpy as np
import pandas as pd
import sys

from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling, margin_sampling, entropy_sampling, classifier_uncertainty

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from IPython import display
from matplotlib import pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [67]:
df = pd.read_csv(r'allUsers.lcl.csv')
df_trimmed = df.iloc[1:,:11]
dataset = df_trimmed.to_numpy()
# import dataset
np.random.shuffle(dataset)

X = dataset[:,2:]
# Create the Scaler object
scaler = preprocessing.StandardScaler()
# Fit your data on the scaler object
X = scaler.fit_transform(X)

y = dataset[:,0].reshape(-1,1)
# y = scaler.fit_transform(y)

# the dataset is broken as follows: 9% for training, 1% for 10-fold cross validating, 2 person for test set and the
# the rest is treated as the pool for active learning
n_train = round(0.005 * X.shape[0])
n_val = round(0.01 * X.shape[0])
n_test = round(0.01 * X.shape[0])

X_train = X[:n_train,:]
y_train = y[:n_train]

X_test = X[n_train : n_train + n_test, :]
y_test = y[n_train : n_train + n_test]

X_pool = X[n_train + n_test :, :]
y_pool = y[n_train + n_test :]

In [69]:
query_strats = [uncertainty_sampling, margin_sampling, entropy_sampling]

for query_strat in query_strats:
    X_pool_temp = X_pool
    y_pool_temp = y_pool
    print('***')
    # initialising the activelearner module
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        query_strategy=query_strat,
        X_training=X_train, y_training=y_train
    )

    accuracy_score = [learner.score(X_test, y_test)]
    print(accuracy_score)


    n_queries = 200
    for idx in range(n_queries):
        query_idx, query_instance = learner.query(X_pool_temp)
    #     print(query_idx)
        query_instance = query_instance.reshape(1,-1)
        query_label = y_pool_temp[query_idx].reshape(1,-1)
        learner.teach(query_instance, query_label)
        X_pool_temp = np.delete(X_pool_temp, query_idx, 0)
        y_pool_temp = np.delete(y_pool_temp, query_idx, 0)


    accuracy_score_final = [learner.score(X_test, y_test)]
    print(accuracy_score_final)

***
[0.6427656850192062]
[0.6939820742637645]
***
[0.6043533930857875]
[0.6901408450704225]
***
[0.6120358514724712]
[0.6734955185659411]


In [71]:
from modAL.models import Committee
from modAL.disagreement import vote_entropy_sampling, max_disagreement_sampling, consensus_entropy_sampling

# a list of ActiveLearners:
learner1 = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_train, y_training=y_train
    )

learner2 = ActiveLearner(
        estimator=LogisticRegression(),
        X_training=X_train, y_training=y_train
    )

learners = [learner1, learner2]

query_strats = [vote_entropy_sampling, max_disagreement_sampling, consensus_entropy_sampling]

for query_strat in query_strats:
    print('***')
    print(query_strat)
    committee = Committee(
        learner_list=learners,
        query_strategy=query_strat
    )

    accuracy_score = [committee.score(X_test, y_test)]
    print(accuracy_score)

    X_pool_temp = X_pool
    y_pool_temp = y_pool

    n_queries = 200
    for idx in range(n_queries):
        query_idx, query_instance = learner.query(X_pool_temp)
#         print(query_idx)
        query_instance = query_instance.reshape(1,-1)
        query_label = y_pool_temp[query_idx].reshape(1,-1)
        committee.teach(query_instance, query_label)
        X_pool_temp = np.delete(X_pool_temp, query_idx, 0)
        y_pool_temp = np.delete(y_pool_temp, query_idx, 0)

    accuracy_score_final = [committee.score(X_test, y_test)]
    print(accuracy_score_final)

***
<function vote_entropy_sampling at 0x19884300>
[0.6414852752880922]
[0.6875800256081946]
***
<function max_disagreement_sampling at 0x19884390>
[0.6875800256081946]
[0.6991037131882202]
***
<function consensus_entropy_sampling at 0x19884348>
[0.6991037131882202]
[0.6837387964148528]
