#### Travail B

In [152]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import accuracy_score
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np 
from sklearn.model_selection import GridSearchCV
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

#### DBScan
> Chargement des données

In [2]:
path_vote = './vote.csv'
df_vote = pd.read_csv(path_vote)
mapping = {'y': 1, 'n':0, '?':None, 'republican':1, 'democrat':0}
df_vote = df_vote.transform(lambda x: x.apply(lambda c: mapping[c]))
incomplete_rows = df_vote.isnull().any(axis=1).sum()
print(f'There are {incomplete_rows} with at least 1 missing value, out of {len(df_vote)} rows')
df_vote.dropna(inplace=True)
columns_vote = df_vote.columns
X_vote, y_vote = df_vote[columns_vote[:-1]], df_vote[columns_vote[-1]]

There are 203 with at least 1 missing value, out of 435 rows


> L'accuracy est calculée uniquement pour les données qui ont un cluster.

In [102]:
def get_accuracy(y_true, y_pred):
    """ 
    Get accuracy for instances that have a cluster
    """
    return max(s:=(sum(p:=[x == y for x,y in zip(y_true, y_pred) if y in [0,1]])/len(p)), 1-s)

def get_global_accuracy(y_true, y_pred):
    """ 
    Get accuracy by considering that an instance without cluster is wrongly classified
    """
    return max(sum(x == y for x,y in zip(y_true, y_pred) if y in [0,1]), sum(x != y for x,y in zip(y_true, y_pred) if y in [0,1])) / len(y_pred)

In [126]:
accuracies = []
global_accuracies = []
not_classified = []
number_points = [2,3,4,6]
n_clusters = []
for min_points in number_points:
    clustering = DBSCAN(eps=1.02, min_samples=min_points).fit(X_vote.values)
    accuracies.append(get_accuracy(y_vote, clustering.labels_))
    global_accuracies.append(get_global_accuracy(y_vote, clustering.labels_))
    not_classified.append(sum(y == -1 for y in clustering.labels_))
    n_clusters.append(len(np.unique(clustering.labels_)))
print(f'not classified: {not_classified}')
print(f'accuracies = {accuracies}')
print(f'global accuracies = {global_accuracies}')
print(f'Nombre de clusters (en comptant le cluster des instances sans cluster): {n_clusters}')

not classified: [61, 71, 79, 83]
accuracies = [0.9813664596273292, 0.9813664596273292, 0.9869281045751634, 0.9865771812080537]
global accuracies = [0.6810344827586207, 0.6810344827586207, 0.6508620689655172, 0.6336206896551724]
Nombre de clusters (en comptant le cluster des instances sans cluster): [8, 3, 3, 3]


In [104]:
def argmax(list):
    return max((x,i) for i,x in enumerate(list))[1]
best_min_points = number_points[argmax(global_accuracies)]

3

> Calcul du meilleur modèle

In [135]:
best_db_scan = DBSCAN(eps=1.02, min_samples=best_min_points).fit(X_vote.values)
best_acc = get_global_accuracy(y_vote, best_db_scan.labels_)
labels_true_pred = [(t,p)for t,p in zip(y_vote, best_db_scan.labels_) if p != -1]
cm = confusion_matrix([x[0] for x in labels_true_pred], [abs(x[1]-1) for x in labels_true_pred])
print(f'accuracy: {best_acc}\nconfusion matrix: \n{cm}')
print(f'Number not classified: {sum(x == -1 for x in best_db_scan.labels_)}')

accuracy: 0.6810344827586207
confusion matrix: 
[[76  3]
 [ 0 82]]
Number not classified: 71


> On a donc 0 Democrat mal placé et 1 Republican mal placé, pour ceux qui ont été classés

> Recherche de la meilleure valeur de epsilon

In [122]:
accuracies = []
global_accuracies = []
not_classified = []
epsilons = np.arange(1.00, 1.02, 0.001)
n_clusters = 0
for eps in epsilons:
    clustering = DBSCAN(eps=eps, min_samples=best_min_points).fit(X_vote.values)
    accuracies.append(get_accuracy(y_vote, clustering.labels_))
    global_accuracies.append(get_global_accuracy(y_vote, clustering.labels_))
    not_classified.append(sum(y == -1 for y in clustering.labels_))
    n_clusters = len(np.unique(clustering.labels_))
print(f'not classified: {not_classified}')
print(f'accuracies = {accuracies}')
print(f'global accuracies = {global_accuracies}')
print(f'n_clusters = {n_clusters}')

not classified: [71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71]
accuracies = [0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292]
global accuracies = [0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207]
n_clusters = 3


> Dans ce cas, changer epsilon n'a pas d'impact sur le résultat. <br>
> Il semble que les paramètres n'ont pas le même impact sur Weka et python. <br>
> Une grid-search ne peut être faite car une fonction de cout n'est pas défini pour le clustering dans scikit-learn (cela pourrait être implémenté à la main dans notre exemple, et c'est plus ou moins ce qu'on fait avec les 2 boucles sur epsilon et min_sample, qui sont cependant séparées). <br>

> J'ai changé les valeurs de epsilons pour avoir des résultats différents.

In [130]:
accuracies = []
global_accuracies = []
not_classified = []
epsilons = np.arange(.1, 2.00, 0.1)
n_clusters = []
for eps in epsilons:
    clustering = DBSCAN(eps=eps, min_samples=best_min_points).fit(X_vote.values)
    accuracies.append(get_accuracy(y_vote, clustering.labels_))
    global_accuracies.append(get_global_accuracy(y_vote, clustering.labels_))
    not_classified.append(sum(y == -1 for y in clustering.labels_))
    n_clusters.append(len(np.unique(clustering.labels_)))
print(f'not classified: {not_classified}')
print(f'accuracies = {accuracies}')
print(f'global accuracies = {global_accuracies}')
print(f'n_clusters = {n_clusters}')
best_epsilon = epsilons[argmax(global_accuracies)]
print(f'best epsilon: {best_epsilon}')

not classified: [148, 148, 148, 148, 148, 148, 148, 148, 148, 71, 71, 71, 71, 71, 21, 21, 21, 1, 1]
accuracies = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.9813664596273292, 0.927536231884058, 0.927536231884058, 0.927536231884058, 0.5324675324675324, 0.5324675324675324]
global accuracies = [0.05172413793103448, 0.05172413793103448, 0.05172413793103448, 0.05172413793103448, 0.05172413793103448, 0.05172413793103448, 0.05172413793103448, 0.05172413793103448, 0.05172413793103448, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.6810344827586207, 0.8275862068965517, 0.8275862068965517, 0.8275862068965517, 0.5301724137931034, 0.5301724137931034]
n_clusters = [20, 20, 20, 20, 20, 20, 20, 20, 20, 3, 3, 3, 3, 3, 4, 4, 4, 2, 2]
best epsilon: 1.7000000000000002


> Calcul du meilleur modèle avec best epsilon et best_min_points

In [136]:
best_db_scan = DBSCAN(eps=best_epsilon, min_samples=best_min_points).fit(X_vote.values)
best_acc = get_global_accuracy(y_vote, best_db_scan.labels_)
labels_true_pred = [(t,p)for t,p in zip(y_vote, best_db_scan.labels_) if p != -1]
cm = confusion_matrix([x[0] for x in labels_true_pred], [abs(x[1]-1) for x in labels_true_pred])
print(f'accuracy: {best_acc}\nconfusion matrix: \n{cm}')
print(f'Number not classified: {sum(x == -1 for x in best_db_scan.labels_)}')

accuracy: 0.8275862068965517
confusion matrix: 
[[95 15]
 [ 4 97]]
Number not classified: 21


> Ce dernier modèle est meilleur que le précédent car moins de données sont non classifiées, et l'accuracy globale est meilleure. <br>
>> Democrat classifié mal classifié: 4 <br>
>> Republican classifié mal classifié: 15

#### Méthode probabiliste EM

> Chargement des données

In [145]:
path_weather = './weather.nominalToBinary.csv'
df_weather = pd.read_csv(path_weather)
df_weather['play'] = df_weather['play'].apply(lambda x: 0 if x == 'no' else 1)
weather_columns = df_weather.columns
X, y = df_weather[weather_columns[:-1]], df_weather[weather_columns[-1]]
n_classes = 2

2

> Calcul du modèle

In [154]:
gm = GaussianMixture(n_components=2, random_state=0).fit(X)
predictions = gm.predict(X)
def print_metrics(y_train, y_pred):
    accuracy = sum(y_pred == y_train.to_numpy())/len(y_pred)
    print(f'accuracy = {accuracy:.2f}')
    print(f'confusion matrix: \n{confusion_matrix(y_train, y_pred)}')
    print(classification_report(y_train, y_pred))
print_metrics(y, predictions)

accuracy = 0.57
confusion matrix: 
[[3 2]
 [4 5]]
              precision    recall  f1-score   support

           0       0.43      0.60      0.50         5
           1       0.71      0.56      0.63         9

    accuracy                           0.57        14
   macro avg       0.57      0.58      0.56        14
weighted avg       0.61      0.57      0.58        14



> Variation de la seed

In [157]:
accuracies = []
seeds = [0,5,10,20,100]
for s in seeds:
    gm = GaussianMixture(n_components=2, random_state=0).fit(X)
    predictions = gm.predict(X)
    accuracies.append(accuracy_score(y, predictions))
accuracies

[0.5714285714285714,
 0.5714285714285714,
 0.5714285714285714,
 0.5714285714285714,
 0.5714285714285714]

> Il n'y a pas de différence dans les résultats

In [158]:
confusion_matrix(y, predictions)

array([[3, 2],
       [4, 5]])

> no mal placés: 2 <br>
> yes mal placés: 4