In [118]:
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)
import pandas as pd
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [119]:
# Read the semicolon-delimited city table from the working directory.
data_villes = pd.read_csv('villes.csv', sep=';')

# Column 0 supplies the row labels; columns 1 up to (but excluding) 13 form
# the raw feature matrix that later cells standardise and project.
villes_labels, X_villes = (
    data_villes.iloc[:, 0].values,
    data_villes.iloc[:, 1:13].values,
)

# Uncomment to inspect the extracted arrays:
# print(villes_labels)
# print(X_villes)

In [120]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the city features. X_villes is rebound to the scaled matrix,
# so every later use of X_villes sees standardised values.
scaler = StandardScaler()
X_villes = scaler.fit_transform(X_villes)

# Project the standardised features onto two principal components.
pca = PCA(n_components=2)
principal_villes_components = pca.fit_transform(X_villes)

# Wrap the projection in a DataFrame and put the 'ville' column next to it
# so each projected row can be identified when plotting.
principal_villes_data = pd.DataFrame(
    principal_villes_components,
    columns=['principal component 1', 'principal component 2'],
)
final_villes_data = pd.concat(
    [principal_villes_data, data_villes[['ville']]],
    axis='columns',
)

# Uncomment to inspect the combined table:
# print(final_villes_data)

In [121]:
import matplotlib  # kept: later cells reference matplotlib.colors

# Scatter plot of the cities in the plane of their first two principal
# components, one legend entry per city.
figure = plt.figure(figsize=(8, 8))
ax = figure.add_subplot(1, 1, 1)

ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

# One scatter artist per distinct city, in order of first appearance
# (sort=False). Grouping once avoids rebuilding a full boolean mask for every
# row and avoids drawing a repeated city several times. Passing label= ties
# each legend entry to its own artist instead of relying on positional order.
for label, rows in final_villes_data.groupby('ville', sort=False):
    ax.scatter(
        rows['principal component 1'],
        rows['principal component 2'],
        s=50,
        label=label,
    )

ax.legend()
ax.grid()

In [122]:
# Read the semicolon-delimited crime table from the working directory.
data_crimes = pd.read_csv('crimes.csv', sep=';')

# Column 0 supplies the row labels; columns 1 up to (but excluding) 13 form
# the raw feature matrix (the slice simply stops early if the file has fewer
# columns).
crimes_labels, X_crimes = (
    data_crimes.iloc[:, 0].values,
    data_crimes.iloc[:, 1:13].values,
)

# Uncomment to inspect the extracted arrays:
# print(crimes_labels)
# print(X_crimes)

In [123]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the crime features. X_crimes is rebound to the scaled matrix.
scaler = StandardScaler()
X_crimes = scaler.fit_transform(X_crimes)

# Use a fresh PCA for this dataset. Previously this cell refitted whatever
# `pca` object an earlier cell had left behind, so it could not run on its own
# and it silently overwrote the model that had been fitted on the city data.
pca = PCA(n_components=2)
principal_crimes_components = pca.fit_transform(X_crimes)

# Wrap the projection in a DataFrame and attach the label column. The column
# name 'Etat ' (with its trailing space) is used exactly as written.
principal_crimes_data = pd.DataFrame(
    data=principal_crimes_components,
    columns=['principal component 1', 'principal component 2'],
)
final_crimes_data = pd.concat([principal_crimes_data, data_crimes[['Etat ']]], axis=1)

# Uncomment to inspect the combined table:
# print(final_crimes_data)

In [124]:
import matplotlib  # kept: later cells reference matplotlib.colors

# Scatter plot of the crime-table rows in the plane of their first two
# principal components, one legend entry per 'Etat ' value.
figure = plt.figure(figsize=(8, 8))
ax = figure.add_subplot(1, 1, 1)

ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

# One scatter artist per distinct label, in order of first appearance
# (sort=False). Grouping once avoids rebuilding a full boolean mask for every
# row and avoids drawing a repeated label several times. Passing label= ties
# each legend entry to its own artist instead of relying on positional order.
for label, rows in final_crimes_data.groupby('Etat ', sort=False):
    ax.scatter(
        rows['principal component 1'],
        rows['principal component 2'],
        s=50,
        label=label,
    )

ax.legend()
ax.grid()

In [125]:
from sklearn.cluster import KMeans
# Imported here so the cell does not depend on `import matplotlib` having been
# executed in an unrelated plotting cell.
from matplotlib.colors import ListedColormap

# K-means with 3 clusters on the 2-D PCA projection of the cities.
# random_state pins the centroid initialisation so the cluster ids (and hence
# the point colours) are the same on every run.
kmeans = KMeans(n_clusters=3, random_state=0)
clustering = kmeans.fit_predict(principal_villes_components)

colors = ['red','yellow','blue','pink']
plt.scatter(
    principal_villes_components[:, 0],
    principal_villes_components[:, 1],
    c=clustering,
    cmap=ListedColormap(colors),
)
# Write each city's label next to its point.
for label, x, y in zip(villes_labels, principal_villes_components[:, 0], principal_villes_components[:, 1]):
    plt.annotate(label, xy=(x, y), xytext=(-0.2, 0.2), textcoords='offset points')

# plt.show()

In [126]:
from sklearn.cluster import AgglomerativeClustering
# Imported here so the cell does not depend on `import matplotlib` having been
# executed in an unrelated plotting cell.
from matplotlib.colors import ListedColormap

colors = ['red','yellow','blue','pink']

# Compare two linkage criteria on the 2-D PCA projection of the cities.
# Each result gets its own axes: previously both scatters and both sets of
# labels were drawn onto the same implicit figure, so the second clustering
# painted over the first and the two could not be compared.
fig_linkage, axes_linkage = plt.subplots(1, 2, figsize=(14, 6))

# 'ward' is scikit-learn's default linkage, i.e. what the first
# AgglomerativeClustering(n_clusters=3) call used implicitly.
for ax_linkage, linkage in zip(axes_linkage, ('ward', 'average')):
    agglomerative_clustering = AgglomerativeClustering(n_clusters=3, linkage=linkage)
    clustering = agglomerative_clustering.fit_predict(principal_villes_components)

    ax_linkage.scatter(
        principal_villes_components[:, 0],
        principal_villes_components[:, 1],
        c=clustering,
        cmap=ListedColormap(colors),
    )
    # Write each city's label next to its point.
    for label, x, y in zip(villes_labels, principal_villes_components[:, 0], principal_villes_components[:, 1]):
        ax_linkage.annotate(label, xy=(x, y), xytext=(-0.2, 0.2), textcoords='offset points')
    ax_linkage.set_title('Agglomerative clustering ({} linkage)'.format(linkage))

# After the loop, `agglomerative_clustering` and `clustering` hold the
# average-linkage model and labels, as they did before.

# plt.show()

In [136]:
from sklearn import metrics
# Imported here so the cell does not depend on the earlier K-means cell.
from sklearn.cluster import KMeans

# Pick the number of clusters (2..5) with the highest silhouette score.
# X_villes is the standardised feature matrix at this point (it was rebound
# to the scaler output in an earlier cell), not the 2-D PCA projection.
silhouette_by_k = []
for i in np.arange(2, 6):
    # random_state makes the k-means labels, and therefore the selected k,
    # reproducible from run to run.
    clustering = KMeans(n_clusters=i, random_state=0).fit_predict(X_villes)
    score = metrics.silhouette_score(X_villes, clustering, metric='euclidean')
    silhouette_by_k.append([i, score])

# aux == [best_k, best_silhouette]. max() returns the first maximal entry, so
# ties still resolve to the smallest k, as with the previous strict '>' test.
aux = max(silhouette_by_k, key=lambda pair: pair[1])

# Show the result as the cell output.
aux