#### Adamantios Zaras AM: 06
#### Panagiotis Souranis AM: 17

# Description

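In this part of the project, we compare two active learning query strategies, uncertainty sampling and random sampling, on the Delicious dataset. The multi-label problem is reduced to a binary one by keeping only the most frequent label; half of the test set is used as the unlabeled pool and the other half for evaluation.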

# Global

## Imports

In [3]:
!pip install scikit-multilearn && \
git clone https://github.com/hsoleimani/MLTM.git

Collecting scikit-multilearn
[?25l  Downloading https://files.pythonhosted.org/packages/bb/1f/e6ff649c72a1cdf2c7a1d31eb21705110ce1c5d3e7e26b2cc300e1637272/scikit_multilearn-0.2.0-py3-none-any.whl (89kB)
[K     |████████████████████████████████| 92kB 5.8MB/s 
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0
Cloning into 'MLTM'...
remote: Enumerating objects: 3597, done.[K
remote: Total 3597 (delta 0), reused 0 (delta 0), pack-reused 3597
Receiving objects: 100% (3597/3597), 26.76 MiB | 25.39 MiB/s, done.
Resolving deltas: 100% (349/349), done.


In [4]:
import warnings
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats as sp
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from utils import load_dataset, hyperparameters_search
warnings.filterwarnings("ignore")


Using TensorFlow backend.


## Prepare Dataset

In [5]:
# Read the Delicious data: padded sequences plus a multi-label target matrix.
X_train, y_train, X_test, y_test, word_index = load_dataset(
    ngram_range=1,
    path='MLTM/Data/Delicious',
    maxlen=200,
)
# Human-readable names of the label columns, in column order.
labels = [
    'programming', 'style', 'reference', 'java', 'web', 'internet', 'culture',
    'design', 'education', 'language', 'books', 'writing', 'computer',
    'english', 'politics', 'history', 'philosophy', 'science', 'religion',
    'grammar',
]

# Reduce the multi-label problem to a binary one: keep only the label
# that occurs most often in the training set.
print('\nGetting the most frequent class...')
# Summing over the samples gives one occurrence count per label column.
most_frequent_counts = np.sum(y_train, axis=0)
most_frequent_index = most_frequent_counts.argmax()
y_train, y_test = (targets[:, most_frequent_index]
                   for targets in (y_train, y_test))
summary = "The most frequent class was the word '{}', with {} appearances."
print(summary.format(labels[most_frequent_index], most_frequent_counts.max()))

# Halve the test set: one half acts as the unlabeled pool (its labels are
# kept aside in y_hidden), the other half remains the evaluation set.
print('Splitting test data to test and unlabeled sets.')
X_unlabeled, X_test, y_hidden, y_test = train_test_split(
    X_test, y_test, test_size=.5, random_state=0)
for description, subset in (('test', X_test), ('unlabeled', X_unlabeled)):
    print('{} {} sequences.'.format(subset.shape[0], description))

Loading data...
8251 train sequences
3983 test sequences
X_train shape: (8251, 200)
X_test shape: (3983, 200)

Getting the most frequent class...
The most frequent class was the word 'reference', with 3181 appearences.
Splitting test data to test and unlabeled sets.
1992 test sequences.
1991 unlabeled sequences.


# Hyperparameters search

In [0]:
# Candidate estimators, keyed by the short name used when reporting results.
# Keyword order is preserved, so iteration order is SVM, Tree, Bayes.
classifiers = dict(
    SVM=LinearSVC(random_state=0),
    Tree=DecisionTreeClassifier(random_state=0),
    Bayes=MultinomialNB(),
)

## Random Search

In [0]:
# Create param dists.
# A locally seeded generator makes the candidate C values reproducible
# without touching NumPy's global random state.
c_rng = np.random.RandomState(0)
# Log-uniform candidates for C, spanning 1e-3 .. 1e3.
svm_param_dist = {'C': 10 ** c_rng.uniform(-3, 3, size=7000)}
tree_param_dist = {'max_depth': scipy.stats.randint(1, 30),
                   'max_features': scipy.stats.randint(1, X_train.shape[1]),
                   # Integer division: the bounds of a discrete distribution
                   # must be integers, `/` would pass a float.
                   'min_samples_split': scipy.stats.randint(2, X_train.shape[0] // 2),
                   'criterion': ['gini', 'entropy']
}
# Map every param dist to its classifier by name. Pairing them positionally
# with zip() silently dropped any classifier without a param dist; here it is
# explicit that only SVM and Tree are randomly searched (Bayes is not).
param_dists = {'SVM': svm_param_dist, 'Tree': tree_param_dist}
# Kept for backward compatibility with the previous list-based layout.
params_list = list(param_dists.values())

# Perform random search.
for key, params in param_dists.items():
    hyperparameters_search(classifiers[key], params, X_train, y_train, 'Accuracy',
                           {'Accuracy': make_scorer(accuracy_score)}, key,
                           candidates=100, cv=5, random_search=True, verbose=5)


Εstimator : SVM
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  5.2min


## Grid Search

In [13]:
# Narrow grids to be searched exhaustively, one per classifier.
svm_grid = dict(C=np.arange(290, 310, .5))
tree_grid = dict(
    max_depth=range(1, 2),
    max_features=range(129, 131),
    min_samples_split=range(3868, 3872),
    criterion=['gini'],
)
bayes_grid = dict(alpha=np.arange(0, 10, 0.2))
# Order must follow the insertion order of `classifiers` (SVM, Tree, Bayes),
# because the grids are paired with the classifiers positionally below.
params_list = [svm_grid, tree_grid, bayes_grid]

# Exhaustive search with 10-fold cross validation, scored on accuracy.
for (key, classifier), params in zip(classifiers.items(), params_list):
    hyperparameters_search(
        classifier, params, X_train, y_train, 'Accuracy',
        {'Accuracy': make_scorer(accuracy_score)}, key,
        cv=10, random_search=False, verbose=10,
    )


Εstimator : SVM
Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed: 11

Best parameters found for Estimator : SVM
{'C': 290.0}

Best score found for Accuracy Score metric : 0.505

Εstimator : Tree
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1517s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    5.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0933s.) Setting batch_size=4.


Best parameters found for Estimator : Tree
{'criterion': 'gini', 'max_depth': 1, 'max_features': 129, 'min_samples_split': 3868}

Best score found for Accuracy Score metric : 0.614

Εstimator : Bayes
Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 100 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 260 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 364 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 424 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done 484 tasks      | elapsed:   15.7s


Best parameters found for Estimator : Bayes
{'alpha': 4.800000000000001}

Best score found for Accuracy Score metric : 0.491


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   16.2s finished


## Final classifier

In [0]:
# Create the linear SVM used as the base learner in the "Apply Method" section.
# NOTE(review): the grid search output recorded in this notebook reports
# {'C': 290.0} as the best LinearSVC setting, and the decision tree reached a
# higher cross-validated accuracy (0.614 vs 0.505). C=10 matches neither
# result - confirm which value is intended before relying on this classifier.
clf = LinearSVC(C=10, random_state=0)

# Apply Method

In [0]:
# Seeded generator, so that the random sampling baseline is reproducible.
sampling_rng = np.random.RandomState(0)


def most_uncertain_index(model, X_pool):
    """Return the index of the pool sample closest to the model's hyperplane.

    The absolute value of the decision function is used as the distance
    measure: the smaller it is, the less certain the model is.
    """
    return int(np.argmin(np.abs(model.decision_function(X_pool))))


def random_index(model, X_pool):
    """Return a uniformly drawn, always valid, pool index.

    The model is ignored; the parameter only exists so that both query
    strategies share the same call signature.
    """
    # The upper bound of RandomState.randint is exclusive, so the result
    # can never point one past the end of the pool.
    return int(sampling_rng.randint(0, X_pool.shape[0]))


def run_query_iteration(model, X_labeled, y_labeled, X_pool, y_pool,
                        X_eval, y_eval, select_index):
    """Fit, evaluate and move one queried sample from the pool to the labeled set.

    Parameters
    ----------
    model : estimator exposing fit / predict (and whatever select_index needs).
    X_labeled, y_labeled : current labeled training data.
    X_pool, y_pool : current unlabeled pool and its hidden labels.
    X_eval, y_eval : held-out data on which the accuracy is measured.
    select_index : callable(model, X_pool) -> int, the query strategy.

    Returns
    -------
    tuple
        (accuracy, X_labeled, y_labeled, X_pool, y_pool), where the queried
        sample has been appended to the labeled data and removed from the pool.
        The accuracy is the one measured before the queried sample was added.
    """
    model.fit(X_labeled, y_labeled)
    # accuracy_score expects the ground truth first.
    accuracy = accuracy_score(y_eval, model.predict(X_eval))

    # Query one sample, using the freshly fitted model.
    queried = select_index(model, X_pool)
    X_labeled = np.vstack((X_labeled, X_pool[queried, :]))
    y_labeled = np.hstack((y_labeled, y_pool[queried]))
    X_pool = np.delete(X_pool, queried, 0)
    y_pool = np.delete(y_pool, queried, 0)

    return accuracy, X_labeled, y_labeled, X_pool, y_pool


# Initialize lists to hold the accuracy results of the two methods.
# Exactly one value is stored per iteration, so their length equals n_iterations.
uncertainty_accuracies, random_accuracies = [], []

# Initialize uncertainty sampling data.
X_train_us = X_train.copy()
y_train_us = y_train.copy()
X_unlabeled_us = X_unlabeled.copy()
y_unlabeled_us = y_hidden.copy()

# Initialize random sampling data.
X_train_rs = X_train.copy()
y_train_rs = y_train.copy()
X_unlabeled_rs = X_unlabeled.copy()
y_unlabeled_rs = y_hidden.copy()

# Run uncertainty sampling and random sampling methods, for 10 iterations.
n_iterations = 10
for i in range(n_iterations):
    print('Iteration {}/{}'.format(i + 1, n_iterations))

    print('Fitting on uncertainty sampling training set...')
    (acc_us, X_train_us, y_train_us,
     X_unlabeled_us, y_unlabeled_us) = run_query_iteration(
        clf, X_train_us, y_train_us, X_unlabeled_us, y_unlabeled_us,
        X_test, y_test, most_uncertain_index)
    uncertainty_accuracies.append(acc_us)
    print('Predicted accuracy was {}'.format(acc_us))

    print('Fitting on random sampling training set...')
    (acc_rs, X_train_rs, y_train_rs,
     X_unlabeled_rs, y_unlabeled_rs) = run_query_iteration(
        clf, X_train_rs, y_train_rs, X_unlabeled_rs, y_unlabeled_rs,
        X_test, y_test, random_index)
    random_accuracies.append(acc_rs)
    print('Predicted accuracy was {}'.format(acc_rs))

## Plot Results

Finally, we plot the learning curves of the two methods, in order to compare them.

In [0]:
# Build the figure through matplotlib's object-oriented interface.
fig, ax = plt.subplots()
# Title and axis labels, so the figure can be read on its own.
ax.set_title('Uncertainty Sampling vs Random Sampling Learning Curves')
ax.set_xlabel('Number Of Instance Queries')
ax.set_ylabel('Accuracy')

# Background grid for easier reading of the accuracy values.
ax.grid()
# One learning curve per query strategy, sharing the same x axis.
query_counts = range(n_iterations)
ax.plot(query_counts, uncertainty_accuracies, 'o-', color="r",
        label="Uncertainty Sampling")
ax.plot(query_counts, random_accuracies, 'o-', color="g",
        label="Random Sampling")

# Let matplotlib pick the legend position that hides the least data.
ax.legend(loc="best")

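As we can see, **TODO:** summarize here how the uncertainty sampling learning curve compares with the random sampling one once the plot above has been generated.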