# Laboratoire 4 : Développement d’un système intelligent
#### Département du génie logiciel et des technologies de l’information

| Étudiants             | Alexandre Laroche - LARA12078907<br>Marc-Antoine Charland - CHAM16059609<br>Jonathan Croteau-Dicaire - CROJ10109402    |
|-----------------------|---------------------------------------------------------|
| Cours                 | GTI770 - Systèmes intelligents et apprentissage machine |
| Session               | Été 2019                                            |
| Groupe                | 02                                                      |
| Numéro du laboratoire | TP-04                                                   |
| Professeur            | Prof. Alessandro L. Koarich                             |
| Chargé de laboratoire | Pierre-Luc Delisle                                                     |
| Date                  | 5 août 2019 (23h55)                                                    |

# Random Forest

## Preparation

In [1]:
"""""""""""""""""""""""
" Classes externes
"""""""""""""""""""""""
# According to pep8, import should be structures as :

# Standard library imports
import os
from os import path
import pickle
from pprint import pprint
import time
# A single empty line

# 3rd party library imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# This project's modules imports
import src.constants as constants

In [11]:
"""""""""""""""""""""""
" Constantes
"""""""""""""""""""""""

# Repertoires des fichiers CSV (musique)
derivatives_file_path = constants.PROJECT_ROOT_PATH + 'data/raw/music/tagged_feature_sets/msd-jmirderivatives_dev/msd-jmirderivatives_dev.csv'
marsyas_file_path = 'data/raw/music/tagged_feature_sets/msd-marsyas_dev_new/msd-marsyas_dev_new.csv'
ssd_file_path = path.join(constants.DATA_PATH, 'ssd_base_split.csv')


## Fonctions

In [3]:
"""""""""""""""""""""""
" Functions
"""""""""""""""""""""""

def load_feature_set(file_path):
    # Lecture du fichier CSV
    x = np.array(pd.read_csv(file_path, header=None).values[:,2:-1])
    y = np.array(pd.read_csv(file_path, header=None).values[:,-1])
    
    return x, y


def scale_features(x):
    # Normalisation des features
    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    
    return x


def one_hot_labels(y):
    # Hot encodage des labels
    y_simple = y.reshape(len(y), 1)
    hot_encoder = OneHotEncoder(sparse=False)
    y_hot_encoded = hot_encoder.fit_transform(y_simple)
    
    return y_hot_encoded


def label_encode(y):
    # Encodage des labels
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)
    
    return y_encoded


def compare_lda_score(x, y, random_forest, lda_params):
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    
    accuracy_results = []
    f1_results = []
    
    for param in lda_params:
    
        # Performing LDA
        lda = LDA(n_components = lda_params[param])
        x_train = lda.fit_transform(x_train, y_train)
        x_test = lda.transform(x_test)

        # Entrainement et prediction du modele
        random_forest = rf
        random_forest.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)

        # Evalue la performance
        cm = confusion_matrix(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average = 'weighted', labels = np.unique(y_pred))
        
        # Insere les resultats dans les listes
        accuracy_results.append(accuracy)
        f1_results.append(f1)
        
    # Construction du tableau comparatif
    data = {
        "LDA n_components":lda_params,
        "Accuracy":accuracy_results,
        "F1 score":f1_results
    }
    
    df = pd.DataFrame(data)
    df = df.set_index("LDA n_components")
    return df


## Lectures CSV

In [12]:

"""""""""""""""""""""""
" CSV - Derivatives
"""""""""""""""""""""""

# Get the data from the CSV
derivatives_x, derivatives_y = load_feature_set(derivatives_file_path)

# Normalise les features
derivatives_x = scale_features(derivatives_x)

print("Derivatives dimension : ", derivatives_x.shape[1])
print("Derivatives songs qty. : ", len(derivatives_x))


Derivatives dimension :  96
Derivatives songs qty. :  179555


In [5]:
"""""""""""""""""""""""
" CSV - Marsyas
"""""""""""""""""""""""

# Get the data from the CSV
marsyas_x, marsyas_y = load_feature_set(marsyas_file_path)

# Normalise les features
marsyas_x = scale_features(marsyas_x)

print("Marsyas dimension : ", marsyas_x.shape[1])
print("Marsyas songs qty. : ", len(marsyas_x))


Marsyas dimension :  124
Marsyas songs qty. :  179555


In [4]:
"""""""""""""""""""""""
" CSV - SSD
"""""""""""""""""""""""

# Get the data from the CSV
ssd_x, ssd_y = load_feature_set(ssd_file_path)

# Normalise les features
ssd_x = scale_features(ssd_x)

print("SSD dimension : ", ssd_x.shape[1])
print("SSD songs qty. : ", len(ssd_x))


SSD dimension :  168
SSD songs qty. :  143644


## Verification des quantités

In [6]:
"""""""""""""""""""""""
" Class unique
"""""""""""""""""""""""
# Recupere les classes uniques
unique_class = list(set(ssd_y))
encoder.fit(unique_class)

# Affiche les classes uniques
print("Classes musicales (", len(unique_class), ") :\n\n", np.array(unique_class))


NameError: name 'encoder' is not defined

In [13]:
print(np.unique(derivatives_y, return_counts=True))

(array(['BIG_BAND', 'BLUES_CONTEMPORARY', 'COUNTRY_TRADITIONAL', 'DANCE',
       'ELECTRONICA', 'EXPERIMENTAL', 'FOLK_INTERNATIONAL', 'GOSPEL',
       'GRUNGE_EMO', 'HIP_HOP_RAP', 'JAZZ_CLASSIC', 'METAL_ALTERNATIVE',
       'METAL_DEATH', 'METAL_HEAVY', 'POP_CONTEMPORARY', 'POP_INDIE',
       'POP_LATIN', 'PUNK', 'REGGAE', 'RNB_SOUL', 'ROCK_ALTERNATIVE',
       'ROCK_COLLEGE', 'ROCK_CONTEMPORARY', 'ROCK_HARD',
       'ROCK_NEO_PSYCHEDELIA'], dtype=object), array([ 2047,  4511,  7316,  9885,  7148,  7932,  6465,  4580,  4096,
       10581,  6568,  9195,  6485,  7031,  8959, 11858,  5048,  6306,
        3433,  4107,  8333, 10856, 10829,  8720,  7266]))


In [5]:
"""""""""""""""""""""""
" Class proportion
"""""""""""""""""""""""
# Recupere la colonne des classes
class_proportion = ssd_data[len(ssd_data.columns) - 1]

# Proportion des classes de SSD
pd.DataFrame(class_proportion.value_counts())


NameError: name 'ssd_data' is not defined

## Random Forest - Evaluate best parameters

In [5]:
"""""""""""""""""""""""""""""
" SSD - Train & Test
"""""""""""""""""""""""""""""

x_train, x_test, y_train, y_test = train_test_split(ssd_x, ssd_y, test_size=0.2, random_state=42)


#### Random Forest - Default

In [9]:
"""""""""""""""""""""""""""""
" SSD - Default Random Forest
"""""""""""""""""""""""""""""

# Training and predictions
rf = RandomForestClassifier(random_state = 42)

start_fit_time = time.perf_counter()

rf.fit(x_train, y_train)

fit_time = time.perf_counter() - start_fit_time

y_pred = rf.predict(x_test)

# Evaluating the performance
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average = 'micro', labels = np.unique(y_pred))
f1_macro = f1_score(y_test, y_pred, average = 'macro', labels = np.unique(y_pred))

print("*** SSD - Best default Random Forest score ***\n")
print("Validation accuracy: ", accuracy)
print("F1 score micro: ", f1_micro)
print("F1 score macro: ", f1_macro)

print('\n*** Random Forest default parameters ***\n')
pprint(rf.get_params())




*** SSD - Best default Random Forest score ***

Accuracy:  0.2035866447606583
F1 score:  0.1916519990896498

*** Random Forest default parameters ***

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


#### Random Forest - Find best parameters

In [25]:
"""""""""""""""""""""""""""""
" SSD - Params to evaluate
"""""""""""""""""""""""""""""

model_params = {
    'bootstrap': [True, False],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 4, 8, 10],
    'max_features': ['auto', 'sqrt'],
    'criterion': ["gini", "entropy"],
    'max_depth': [None, 10, 30, 50, 70, 80],
    'n_estimators': [10, 30, 60, 80, 100]
}


In [27]:
"""""""""""""""""""""""""""""
" SSD - Find best params
"""""""""""""""""""""""""""""

# Initialisation du modele
rf_model = RandomForestClassifier(verbose=2)

# Cherche a travers 100 combinaisons avec un fold = 5
clf = RandomizedSearchCV(
    cv = 5,
    n_jobs = -1,
    n_iter = 25,
    estimator = rf_model,
    param_distributions = model_params,
    verbose = 2
)

# Fit le model
clf.fit(x_train, y_train)


Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed: 119.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 80


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.1s remaining:    0.0s


building tree 2 of 80
building tree 3 of 80
building tree 4 of 80
building tree 5 of 80
building tree 6 of 80
building tree 7 of 80
building tree 8 of 80
building tree 9 of 80
building tree 10 of 80
building tree 11 of 80
building tree 12 of 80
building tree 13 of 80
building tree 14 of 80
building tree 15 of 80
building tree 16 of 80
building tree 17 of 80
building tree 18 of 80
building tree 19 of 80
building tree 20 of 80
building tree 21 of 80
building tree 22 of 80
building tree 23 of 80
building tree 24 of 80
building tree 25 of 80
building tree 26 of 80
building tree 27 of 80
building tree 28 of 80
building tree 29 of 80
building tree 30 of 80
building tree 31 of 80
building tree 32 of 80
building tree 33 of 80
building tree 34 of 80
building tree 35 of 80
building tree 36 of 80
building tree 37 of 80
building tree 38 of 80
building tree 39 of 80
building tree 40 of 80
building tree 41 of 80
building tree 42 of 80
building tree 43 of 80
building tree 44 of 80
building tree 45 of

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  4.1min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [29]:
"""""""""""""""""""""""""""""
" SSD - Get the best params
"""""""""""""""""""""""""""""

bootstrap         = clf.best_params_['bootstrap']
min_samples_leaf  = clf.best_params_['min_samples_leaf']
min_samples_split = clf.best_params_['min_samples_split']
max_features      = clf.best_params_['max_features']   
criterion         = clf.best_params_['criterion']
max_depth         = clf.best_params_['max_depth']
n_estimators      = clf.best_params_['n_estimators']

print(clf.best_params_)

{'n_estimators': 80, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 80, 'criterion': 'gini', 'bootstrap': False}


#### Random Forest - Optimized

In [7]:
"""""""""""""""""""""""""""""
" SSD - Optimized Random Forest
"""""""""""""""""""""""""""""
bootstrap = False
min_samples_leaf = 1
min_samples_split = 4
max_features = 'auto'
criterion = 'gini'
max_depth = 80
n_estimators = 80


rf = RandomForestClassifier(bootstrap = bootstrap,
                            min_samples_leaf = min_samples_leaf,
                            min_samples_split = min_samples_split,
                            max_features = max_features,
                            criterion = criterion,
                            max_depth = max_depth,
                            n_estimators = n_estimators)

# Garde une copie du modele original (optimisé) et on le Fit
random_forest = rf

start_fit_time = time.perf_counter()

random_forest.fit(x_train, y_train)

fit_time = time.perf_counter() - start_fit_time

y_pred = rf.predict(x_test)

# Evalue la performance
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

f1_micro = f1_score(y_test, y_pred, average = 'micro', labels = np.unique(y_pred))
f1_macro = f1_score(y_test, y_pred, average = 'macro', labels = np.unique(y_pred))

print("*** SSD - Optimized Random Forest score ***\n")
print("Validation accuracy: ", accuracy)
print("F1 score micro: ", f1_micro)
print("F1 score macro: ", f1_macro)
print('fit time: ', str(fit_time))

print('\n*** Random Forest optimized parameters ***\n')
pprint(rf.get_params())


*** SSD - Optimized Random Forest score ***

Validation accuracy:  0.28462529151728216
F1 score micro:  0.28462529151728216
F1 score macro:  0.2583912545134321
fit time:  199.56459134200122

*** Random Forest optimized parameters ***

{'bootstrap': False,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 80,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 80,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [9]:
# train final RandomForest model on the whole dataset

final_rf = RandomForestClassifier(
    bootstrap = bootstrap,
    min_samples_leaf = min_samples_leaf,
    min_samples_split = min_samples_split,
    max_features = max_features,
    criterion = criterion,
    max_depth = max_depth,
    n_estimators = n_estimators,
    n_jobs = -1,
    verbose = 2
)

final_rf.fit(ssd_x, ssd_y)

# Sérialise le modèle
model_file_path = path.join(constants.MODELS_PATH, 'final_random_forest_2.pickle')
pickle.dump(final_rf, open(model_file_path, 'wb'))


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 80building tree 2 of 80
building tree 3 of 80

building tree 4 of 80
building tree 5 of 80
building tree 6 of 80
building tree 7 of 80
building tree 8 of 80
building tree 9 of 80
building tree 10 of 80
building tree 11 of 80
building tree 12 of 80
building tree 13 of 80
building tree 14 of 80
building tree 15 of 80
building tree 16 of 80
building tree 17 of 80
building tree 18 of 80
building tree 19 of 80
building tree 20 of 80
building tree 21 of 80
building tree 22 of 80
building tree 23 of 80
building tree 24 of 80
building tree 25 of 80
building tree 26 of 80
building tree 27 of 80
building tree 28 of 80
building tree 29 of 80
building tree 30 of 80
building tree 31 of 80
building tree 32 of 80
building tree 33 of 80
building tree 34 of 80
building tree 35 of 80


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   20.7s


building tree 36 of 80
building tree 37 of 80
building tree 38 of 80
building tree 39 of 80
building tree 40 of 80
building tree 41 of 80
building tree 42 of 80
building tree 43 of 80
building tree 44 of 80
building tree 45 of 80
building tree 46 of 80
building tree 47 of 80
building tree 48 of 80
building tree 49 of 80
building tree 50 of 80
building tree 51 of 80
building tree 52 of 80
building tree 53 of 80
building tree 54 of 80
building tree 55 of 80
building tree 56 of 80
building tree 57 of 80
building tree 58 of 80
building tree 59 of 80
building tree 60 of 80
building tree 61 of 80
building tree 62 of 80
building tree 63 of 80
building tree 64 of 80
building tree 65 of 80
building tree 66 of 80
building tree 67 of 80
building tree 68 of 80
building tree 69 of 80
building tree 70 of 80
building tree 71 of 80
building tree 72 of 80
building tree 73 of 80
building tree 74 of 80
building tree 75 of 80
building tree 76 of 80
building tree 77 of 80
building tree 78 of 80
building tr

[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   53.5s finished


## Random Forest - Evaluate best dimension reduction

#### Random Forest - Default PCA & LDA

In [None]:
"""""""""""""""""""""""""""""
" SSD - Default PCA
"""""""""""""""""""""""""""""

# Performing PCA
pca = PCA()
x_train = pca.fit_transform(x_train, y_train)
x_test = pca.transform(x_test)

# Evite d'utiliser la copie original du modele et on l'entraine
random_forest =  RandomForestClassifier(
    bootstrap = bootstrap,
    min_samples_leaf = min_samples_leaf,
    min_samples_split = min_samples_split,
    max_features = max_features,
    criterion = criterion,
    max_depth = max_depth,
    n_estimators = n_estimators,
    n_jobs = -1,
    verbose = 2
)
random_forest.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Evaluating the performance
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average = 'micro', labels = np.unique(y_pred))
f1_macro = f1_score(y_test, y_pred, average = 'macro', labels = np.unique(y_pred))

print("*** Best SSD Score (default PCA) ***\n")
print("Validation accuracy: ", accuracy)
print("F1 score micro: ", f1_micro)
print("F1 score macro: ", f1_macro)


In [None]:
"""""""""""""""""""""""""""""
" SSD - Default LDA
"""""""""""""""""""""""""""""

# Performing LDA
lda = LDA()
x_train = lda.fit_transform(x_train, y_train)
x_test = lda.transform(x_test)

# Evite d'utiliser la copie original du modele et on l'entraine
random_forest = RandomForestClassifier(
    bootstrap = bootstrap,
    min_samples_leaf = min_samples_leaf,
    min_samples_split = min_samples_split,
    max_features = max_features,
    criterion = criterion,
    max_depth = max_depth,
    n_estimators = n_estimators,
    n_jobs = -1,
    verbose = 2
)
random_forest.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Evalue la performance
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average = 'weighted', labels = np.unique(y_pred))

print("*** Best SSD Score (default LDA) ***\n")
print("Accuracy: ", accuracy)
print("F1 score: ", f1)


#### Random Forest - Evaluate best LDA parameters

In [61]:
"""""""""""""""""""""""""""""
" SSD - Best LDA params
"""""""""""""""""""""""""""""

# Evite d'utiliser la copie original du modele
randomforest = rf

# Affiche un tableau des resultats selon les parametres LDA entrés
compare_lda_score(ssd_x, ssd_y, [20, 22, 24, 26, 28, 30])


Unnamed: 0_level_0,Accuracy,F1 score
LDA n_components,Unnamed: 1_level_1,Unnamed: 2_level_1
14,0.241514,0.205522
16,0.243017,0.212112
18,0.241013,0.204155


In [None]:
"""""""""""""""""""""""""""""
"Derivatives - Best LDA params
"""""""""""""""""""""""""""""

# Evite d'utiliser la copie original du modele
randomforest = rf

# Affiche un tableau des resultats selon les parametres LDA entrés
compare_lda_score(derivatives_x, derivatives_y, [12, 14, 16, 18, 20, 22])


In [None]:
"""""""""""""""""""""""""""""
" Marsyas - Best LDA params
"""""""""""""""""""""""""""""

# Evite d'utiliser la copie original du modele
randomforest = rf

# Affiche un tableau des resultats selon les parametres LDA entrés
compare_lda_score(marsyas_x, marsyas_y, [12, 14, 16, 18, 20, 22])
