#### Adamantios Zaras AM: 06
#### Panagiotis Souranis AM: 17

# Description

In this part of the project, 

# Preparation

## Imports

In [0]:
!git clone https://github.com/hsoleimani/MLTM.git && \
pip install pyclustering

In [0]:
import warnings
from random import randint, sample

import re
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
import scipy.stats as sp
from scipy.spatial.distance import directed_hausdorff as hausdorff_distance
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.utils.metric import distance_metric, type_metric

from utils import hyperparameters_search
from k_medoids import KMedoidsHaussdorff
from sklearn.preprocessing import MinMaxScaler
warnings.filterwarnings("ignore")

## Prepare Dataset

In [0]:
def create_bags_of_sentences(documents_path: str, labels_path: str) -> pd.DataFrame:
    """
    Creates a bag of sentences.

    :param documents_path: the path to the documents.
    :param labels_path: the path to the labels.
    :return: Pandas Dataframe containing the bag of sentences.
    """
    # Get the most frequent class only.
    labels = pd.read_csv(labels_path, header=None)
    labels = labels[0].map(lambda x: np.array([int(lab) for lab in x.split()]))
    labels = np.array(labels.tolist())
    most_frequent_counts = np.sum(np.transpose(labels), axis=1)
    most_frequent_index = most_frequent_counts.argmax()
    labels = labels[:, most_frequent_index]

    # Open documents file.
    documents_file = open(documents_path, 'r')

    # Initialize counters and bag of sentences dictionary.
    document_counter = 0
    sentence_count = 0
    bag_of_sentences = {}

    for document, label in zip(documents_file, labels):
        # Parse document.
        parsed_document = re.split(r'<\d+>', document)

        for sentence in parsed_document:
            # Remove leading and trailing whitespaces.
            sentence = sentence.strip()

            # If sentence is not empty.
            if sentence:
                # Store words to an array of ints, since they are just ids.
                words = np.asarray(sentence.split(" "), dtype=np.int32)
                # Add a sentence to the bag.
                bag_of_sentences[sentence_count] = (document_counter, words, label)
                sentence_count += 1

        document_counter += 1

    # Close documents file.
    documents_file.close()

    # Create dataframe of the bag.
    df = pd.DataFrame.from_dict(bag_of_sentences, orient='index',
                                columns = ['Bag', 'Sentence', 'Class'])

    return df

# Create bags of sentences for the train and test data.
train_df = create_bags_of_sentences('MLTM/Data/Delicious/train-data.dat', 
                           'MLTM/Data/Delicious/train-label.dat')
test_df = create_bags_of_sentences('MLTM/Data/Delicious/test-data.dat', 
                          'MLTM/Data/Delicious/test-label.dat')

Demonstrate data structure.

In [3]:
train_df.head()

Unnamed: 0,Bag,Sentence,Class
0,0,"[6705, 5997, 8310, 3606, 674, 8058, 5044, 4836]",1
1,0,"[4312, 5154, 8310, 4225]",1
2,1,"[1827, 1037, 8482, 483]",1
3,1,"[3567, 6172, 6172, 2892, 1362, 787, 399, 777, ...",1
4,1,"[318, 769, 4621, 3199, 1480, 6213, 971, 6890]",1


In [0]:
def create_bag_per_document(df: pd.DataFrame):
    """
    Parse a sentences dataframe 
    and get a bag of sentences for each document, with its labels.

    :param df: the dataframe.
    :return: the data and the labels.
    """
    ids, X, y = np.array(df['Bag']), np.array(df['Sentence']), np.array(df['Class'])

    X = pad_sequences(X, maxlen=200)
    un_id = np.unique(ids)

    data = []
    labels = []
    for i in range(un_id.shape[0]):
        bag = X[np.where(ids == i)]
        data.append(bag)
        label = y[np.where(ids == i)]
        labels.append(label)
    data = np.array(data)
    labels = np.array(labels)
    labels = np.array([labels[i][0] for i in range(labels.shape[0])])

    return data, labels

# Get the bags and the labels.
train_bag, y_train = create_bag_per_document(train_df)
test_bag, y_test = create_bag_per_document(test_df)

## Transform Problem

We transform the problem, using the K-medoids approach.

In [0]:
def hausdorff_symmetric(x, y):
    """Calculate symmetric hausdorff distance."""
    return max(hausdorff_distance(x, y)[0], hausdorff_distance(y, x)[0])

# Initialize distances matrix.
print('Calculating Hausdorff distances...')
n_data = train_bag.shape[0]
distance_matrix = np.empty((n_data, n_data))

# Calculate symmetric haussdorff distances.
for outer, x in enumerate(train_bag):
    for inner, y in enumerate(train_bag):
        distance_matrix[outer, inner] = hausdorff_symmetric(x, y)

# Set number of clusters.
k = 3
print('Applying K Medoids for {} clusters...'.format(k))
# Randomly initialize initial medoids.
initial_medoid_indices = sample(range(distance_matrix.shape[0]), k)
# Run K-Medoids and get the final medoids.
k_medoids = kmedoids(distance_matrix, initial_medoid_indices, data_type='distance_matrix')
k_medoids.process()
final_medoids = train_bag[k_medoids.get_medoids()]

def generate_features(data_bag, medoids) -> np.ndarray:
    """
    Generates features from a bag of instances and some medoids.

    :param data_bag: the bag of instances.
    :param medoids: the medoids features.
    :return: a numpy array containing the data with their generated features.
    """
    data_transformed = np.empty((len(data_bag), len(medoids)))

    # Generate features, using the distances from the medoids.
    for i, x in enumerate(data_bag):
        for j, medoid in enumerate(medoids):
            data_transformed[i][j] = hausdorff_symmetric(x, medoid)

    # Normalize distance features to [0, 1].
    scaler = MinMaxScaler()
    scaler.fit_transform(data_transformed)

    return data_transformed

# Transform data.
print('Transforming data...')
X_train_transformed = generate_features(train_bag, final_medoids)
X_test_transformed = generate_features(test_bag, final_medoids)

Calculating Hausdorff distances...


Demonstrate new dataset shape.

In [0]:
print('New X train data shape: {}'.format(X_train_transformed.shape))
print('New X test  data shape: {}'.format(X_test_transformed.shape))

# Hyperparameters search

In [0]:
# Define classifiers.
classifiers = {
        'SVM': LinearSVC(random_state=0),
        'Tree': DecisionTreeClassifier(random_state=0),
        'Bayes': MultinomialNB()
}

## Random Search

In [0]:
# Create param dists.
svm_param_dist = {'C': 10 ** np.random.uniform(-3, 3, size=7000)}
tree_param_dist = {'max_depth': scipy.stats.randint(1, 30),
                   'max_features': scipy.stats.randint(1, X_train_transformed.shape[1]),
                   'min_samples_split': scipy.stats.randint(2, X_train_transformed.shape[0] / 2),
                   'criterion': ['gini', 'entropy']
}
# Add param dists to a list.
params_list = [svm_param_dist, tree_param_dist]

# Perform random search.
for key, classifier ,params in zip(classifiers.keys(), classifiers.values(), params_list):
    hyperparameters_search(classifier, params, X_train_transformed, y_train, 'Accuracy',
                           {'Accuracy': make_scorer(accuracy_score)}, key, 
                           candidates=100, cv=5, random_search=True, verbose=5)


Εstimator : SVM
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 33.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 37.0min finished


Best parameters found for Estimator : SVM
{'C': 118.2089155014123}

Best score found for Accuracy Score metric : 0.560

Εstimator : Tree
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 163 tasks      | elapsed:   37.3s
[Parallel(n_jobs=-1)]: Done 289 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 472 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.0min finished


Best parameters found for Estimator : Tree
{'criterion': 'entropy', 'max_depth': 7, 'max_features': 49, 'min_samples_split': 2646}

Best score found for Accuracy Score metric : 0.616


## Grid Search

In [0]:
# Create parameter grids.
svm_grid = {'C': np.arange(110, 130, .5)}
tree_grid = {
        'max_depth': range(5, 9),
        'max_features': range(45, 55),
        'min_samples_split': range(2645, 2648),
        'criterion': ['gini']
}
bayes_grid = {'alpha': np.arange(0, 10, 0.2)}
# Add param grids to a list.
params_list = [svm_grid, tree_grid, bayes_grid]

# Perform grid search.
for key, classifier ,params in zip(classifiers.keys(), classifiers.values(), params_list):
    hyperparameters_search(classifier, params, X_train_transformed, y_train, 'Accuracy',
                           {'Accuracy': make_scorer(accuracy_score)}, key, 
                           cv=10, random_search=False, verbose=10)


Εstimator : SVM
Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   49.4s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed: 11

Best parameters found for Estimator : SVM
{'C': 110.0}

Best score found for Accuracy Score metric : 0.505

Εstimator : Tree
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1638s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 110 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 214 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 244 tas

Best parameters found for Estimator : Tree
{'criterion': 'gini', 'max_depth': 7, 'max_features': 54, 'min_samples_split': 2645}

Best score found for Accuracy Score metric : 0.614

Εstimator : Bayes
Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0799s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 100 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 260 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 364 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 424 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 484 tas

Best parameters found for Estimator : Bayes
{'alpha': 4.800000000000001}

Best score found for Accuracy Score metric : 0.491


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   15.5s finished


## Final classifier

In [0]:
# Create the best classifier found from the search.
clf = DecisionTreeClassifier(criterion='gini', max_depth=7, max_features=54, 
                             min_samples_split=2645, random_state=0)

# Results

In [0]:
clf.fit(X_train_transformed, y_train)
y_pred = clf.predict(X_test_transformed)

print('Final Results')
print('---------------------')
print('Accuracy       {:.4f}'
      .format(accuracy_score(y_test, y_pred)))
print('Precision      {:.4f}'
      .format(precision_score(y_test, y_pred, average=averaging)))
print('Recall         {:.4f}'
      .format(recall_score(y_test, y_pred, average=averaging)))
print('F1             {:.4f}'
      .format(f1_score(y_test, y_pred, average=averaging)))

# Conclusion