In [6]:
# -*- coding: utf-8 -*-
#
# Telecom ParisTech : https://www.telecom-paristech.fr
# Projet PAF - 2018 : https://paf.telecom-paristech.fr
#
# Antoine Bellami
# Aurelien Blicq
# Clement Bonet
# Benoit Malezieux
# Louis Penet de Monterno
# Bastien Vagne


import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np

from mpl_toolkits.mplot3d import Axes3D
from sklearn import metrics

class PafKmeans:
    """ class with attributs a KMeans objects 
        the dataframe on which we use kmean
        the number of clusters we want
    """
    
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.model = None
        self.number = 4
        self.silhouette=None
        
    def kmeans(self, number):
        """ execute kmeans for n_clusters=numbe r"""
        self.model = KMeans(n_clusters = number)
        self.model.fit(self.dataframe)
        
    def sseTab(self):
        """ return the distorsion in all clusters, distorsion = mean distance with cluster's center """
        sse = []
        for clusters_number in range(1, 30):
            sse.append(0)
            self.kmeans(clusters_number)
            sse[clusters_number-1] += self.model.inertia_           
        return sse
    
    def findNElbow(self):
        """ finds the ideal k through the elbow method """
        sse = self.sseTab()
        pentes = []
        for i in range(1, len(sse)):
            pentes.append(sse[i] - sse[i-1])
        k = -1
        pourcentage = 100
        while k < len(pentes)-1 and pourcentage > 5:
            k+=1
            pourcentage = abs(pentes[k] / sse[0]) * 100
        self.number = k+1 
        
    def findElbow2(self):
        """ method to find k through the elbow method with slopes"""
        sse = self.sseTab()
        pentes = []
        for i in range(1, len(sse)):
            pentes.append(sse[i] - sse[i-1])
            
        k=1
        ang=180
        while(k<len(pentes)-1 and (abs(180-ang*180/np.pi)>110)):
            ang=abs(np.arctan(pentes[0])-np.arctan(pentes[k]))
            k+=1
            
        self.number=k
            
        
    
    def findN(self):
        """ finds the ideal k through the silhouette metric : https://en.wikipedia.org/wiki/Silhouette_(clustering) """        
        res=np.arange(20,dtype='double')
        for k in np.arange(2,22):
            self.kmeans(k)
            res[k-2]=metrics.silhouette_score(self.dataframe,self.model.labels_)   
        self.number=np.argmax(res[2:])+4 #first values not pertinent
        self.silhouette=res
        
    def newDataFrame(self):
        """ adds the column category in the dataframe with the labels of the clusters """
        self.dataframe['category'] = pd.Series(self.model.labels_, index = self.dataframe.index)
        
    def result(self):
        """ return the coordinates of cluster's centers and the dataframe"""
        #self.findN()
        self.findElbow2()
        self.kmeans(self.number)
        self.newDataFrame()
        return self.model.cluster_centers_, self.dataframe
    


#test = pd.read_csv("../../fruitsModified.csv")
#del test["Unnamed: 0"]
test=pd.read_csv("./bibliothq.csv")
test.index=test["livres"]
del test["livres"]
#print(test)
pafkmeans = PafKmeans(test)
centers, data = pafkmeans.result()


# La courbe en "coude"
"""
centers, data = pafkmeans.result()
sse = pafkmeans.sseTab()
plt.plot(np.arange(1, 30), sse, 'ro')
plt.xlabel("k")
plt.ylabel("distorsion") #distorsion = Somme variance (distance avec centre cluster)
plt.show()


#Visualisation des clusters formés par K-Means
plt.scatter(data.r,data.fibres,c=pafkmeans.model.labels_.astype(np.float),edgecolor='k')
plt.title('Classification K-means ')
plt.xlabel("teintes")
plt.ylabel("fibres")
plt.show()

plt.scatter(data.r,data.longueur,c=pafkmeans.model.labels_.astype(np.float),edgecolor='k')
plt.xlabel("teintes")
plt.ylabel("longueur")
plt.show()

#cluster en 3D
fig=plt.figure(1,figsize=(4,3))
ax=Axes3D(fig,rect=[0,0,0.95,1],elev=48,azim=134)
ax.scatter(data.r, data.v, data.b,c=pafkmeans.model.labels_.astype(np.float), edgecolor='k')

ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])

plt.title("Classification k-mean 3D")

ax.dist = 12
plt.show()
"""

""" Tracé de la métrique silhouette : plus on est proche de 1, plus le nombre de clusters est ok"""
#res=pafkmeans.silhouette
#print("res = ",res)
print("k =",pafkmeans.number)
#plt.plot(np.arange(2, 22,1), res, 'ro')
#plt.xlabel("k")
#plt.ylabel("Score sur 1")
#plt.show()

k = 27


In [8]:
data.sort_values(by = 'category')

livres
lord_peter_et_le_bellona_club - sayers                                          0
le_joueur - dostoievski                                                         0
poemes_saturniens - verlaine                                                    1
varlet-epopee_martienne_1_titans_du_ciel - joncquel                             1
mettrie_systeme_d_epicure - la                                                  1
la_musique_francaise_1 - landormy                                               1
rochefoucauld_maximes - la                                                      1
banville_le_chat - de                                                           1
walden_ou_la_vie_dans_les_bois - thoreau                                        1
bruyere_caracteres - la                                                         1
droit_paresse - lafargue                                                        1
varlet-epopee_martienne_2_agonie_de_la_terre - joncquel                         1
enfer - b