In [59]:
# Import libraries générales
import pandas as pd
import numpy as np

from random import sample
from collections import Counter, defaultdict
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score


import seaborn as sns
from umap import UMAP
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from the pickle file stored in the data directory.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
data_path = "../data/"
pickle_file = data_path + "df_2017_avec_auteurs.pkl"
df = pd.read_pickle(pickle_file)

In [3]:
# Corpus selection: keep only the rows whose `rubrique` is one of the values
# found in the first five rows of `df` (so at most five distinct rubriques).
# `isin` does the membership test in a single vectorized call instead of
# rebuilding a Python set from `head(5)` for every row.
dfh = df[df.rubrique.isin(df.rubrique.head(5))]
dfhi = dfh.reset_index(drop = True)  # same rows, with a fresh positional index
corpus = dfh.question  # question column of the selected rows (keeps dfh's index)
len(corpus)

1158

In [4]:
# Integer code of each row's rubrique.
# Codes follow the order in which `unique()` returns the distinct rubriques of dfh.
l_r = [*dfh.rubrique.unique()]
dic_r = {rubrique: code for code, rubrique in enumerate(l_r)}
rubint = dfh.rubrique.apply(lambda rubrique: dic_r[rubrique])

In [5]:
# Integer code of each row's author group.
# Codes follow the order in which `unique()` returns the distinct groups of dfh.
l_g = [*dfh.groupe_auteur.unique()]
dic_g = {groupe: code for code, groupe in enumerate(l_g)}
groupint = dfh.groupe_auteur.apply(lambda groupe: dic_g[groupe])

# 1. Vecteurs

## 1.0. TF, TFIDF

In [7]:
# Imports for the vectorizers
from stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# French stop-word list, passed to both the TF-IDF and the TF vectorizer below
stop_words = get_stop_words('french')

In [8]:
# TF-IDF computation (for raw use, LSA, NMF, ...)
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
# Document-term matrix returned by the vectorizer for the selected corpus
tfidf = tfidf_vectorizer.fit_transform(corpus)
# Dense copy of the matrix (this is what the PCA cell consumes)
tfidf_vectors = tfidf.toarray()
# Vocabulary as an array, so it can be indexed with argsort results later on.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() — confirm the scikit-learn version this notebook targets.
tfidf_feature_names = np.array(tfidf_vectorizer.get_feature_names())

In [9]:
# TF (raw term counts) computation (for LDA)
tf_vectorizer = CountVectorizer(stop_words=stop_words)
# Document-term count matrix returned by the vectorizer for the selected corpus
tf = tf_vectorizer.fit_transform(corpus)
# Dense copy of the matrix (this is what the PCA cell consumes)
tf_vectors = tf.toarray()
# Vocabulary as an array.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() — confirm the scikit-learn version this notebook targets.
tf_feature_names = np.array(tf_vectorizer.get_feature_names())

## 1.1. ACP

In [11]:
# Import for PCA
from sklearn.decomposition import PCA

In [12]:
# PCA on the dense TF-IDF vectors
n_components = 10  # number of principal components kept
pca = PCA(n_components=n_components, random_state=0)
# Coordinates of every document on the fitted components
reduced_features_tfidf = pca.fit_transform(tfidf_vectors)

In [13]:
# PCA on the dense TF (raw count) vectors
random_state = 0
dim_pca_tf = 10  # number of principal components kept
# `pca` is rebound here: from this point on it refers to the TF model,
# not to the TF-IDF model fitted in the previous cell.
pca = PCA(n_components=dim_pca_tf, random_state=random_state)
# Coordinates of every document on the fitted components
reduced_features_tf = pca.fit_transform(tf_vectors)

## 1.2. LSA

In [14]:
from sklearn.decomposition import TruncatedSVD

In [15]:
# LSA: truncated SVD of the sparse TF-IDF matrix, keeping `dim_lsa` components.
# Original author's note: this value is not good at all; look at the singular
# values to pick a better one.
dim_lsa = 10
svd_model = TruncatedSVD(
    n_components=dim_lsa,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
)
# Document coordinates in the reduced space
lsa_doc_vectors = svd_model.fit_transform(tfidf)

In [16]:
# Term weights of each LSA component (one row per component).
# `svd_model` was already fitted on `tfidf` when the document vectors were
# computed, so its `components_` can be read directly; fitting it again on the
# same data with the same random_state only repeats the 100-iteration SVD.
lsa_word_vectors = svd_model.components_

In [17]:
# Ten highest-weighted vocabulary terms of each LSA component
# (one row per component, terms ordered by decreasing weight).
top10words_lsa = tfidf_feature_names[
    np.argsort(-lsa_word_vectors, axis=1)[:, :10]
]

## 1.3. LDA

In [18]:
from sklearn.decomposition import LatentDirichletAllocation

In [19]:
# LDA document vectors.
# The model is fitted on the transposed count matrix, so each column of
# `components_` corresponds to a document; transposing it gives one row per
# document and one column per topic.
n_topics_lda = 60
lda_doc_model = LatentDirichletAllocation(
    n_components=n_topics_lda,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda_doc_model.fit(tf.T)
lda_doc_vectors = lda_doc_model.components_.T

In [20]:
# LDA topic-term weights (one row per topic, one column per vocabulary term).
# LDA is a model of word counts, so it is fitted on the TF matrix, which this
# notebook computes for LDA and already uses for the document model, rather
# than on TF-IDF weights.
# `tf_vectorizer` and `tfidf_vectorizer` are built with the same settings on
# the same corpus, so the columns still line up with `tfidf_feature_names`.
lda_word_model = LatentDirichletAllocation(
    n_components=n_topics_lda,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda_word_model.fit(tf)
lda_word_vectors = lda_word_model.components_

In [21]:
# Ten highest-weighted vocabulary terms of each LDA topic
# (one row per topic, terms ordered by decreasing weight).
top10words_lda = tfidf_feature_names[
    np.argsort(-lda_word_vectors, axis=1)[:, :10]
]

## 1.4. NMF

In [22]:
from sklearn.decomposition import NMF

In [23]:
# NMF document vectors.
# The model is fitted on the transposed TF-IDF matrix, so each column of
# `components_` corresponds to a document; transposing it gives one row per
# document and one column per topic.
# NOTE(review): the `alpha` argument was removed from NMF in scikit-learn 1.2
# (replaced by alpha_W / alpha_H) — confirm the version this notebook targets.
n_topics_nmf = 100
nmf_doc_model = NMF(
    n_components=n_topics_nmf,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd',
)
nmf_doc_model.fit(tfidf.T)
nmf_doc_vectors = nmf_doc_model.components_.T

In [24]:
# NMF topic-term weights (one row per topic, one column per vocabulary term),
# fitted on the TF-IDF document-term matrix.
# NOTE(review): the `alpha` argument was removed from NMF in scikit-learn 1.2
# (replaced by alpha_W / alpha_H) — confirm the version this notebook targets.
nmf_word_model = NMF(
    n_components=n_topics_nmf,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd',
)
nmf_word_model.fit(tfidf)
nmf_word_vectors = nmf_word_model.components_

In [25]:
# Ten highest-weighted vocabulary terms of each NMF topic
# (one row per topic, terms ordered by decreasing weight).
top10words_nmf = tfidf_feature_names[
    np.argsort(-nmf_word_vectors, axis=1)[:, :10]
]

# 2. Clustering

## 2.1 K-means

In [33]:
# from sklearn.cluster import MiniBatchKMeans

In [34]:
# # Exécution du k-means
# def go_kmeans(vectors, k = 5):
#     random_state = 0
#     kmeans = MiniBatchKMeans(n_clusters=k, random_state=random_state)
#     kmeans.fit(vectors)
#     return kmeans

# # Get the clusters of a kmeans
# def get_clusters_kmeans(vectors, k=5, kmeans = None):
#     if kmeans is None :
#         kmeans = go_kmeans(vectors, k)
#     return kmeans.predict(vectors)

## 2.2 Hiérarchique

In [35]:
# from sklearn.cluster import AgglomerativeClustering

In [36]:
# # Get the clusters of a HAC
# def get_clusters_HAC(vectors, n_clusters, linkage, affinity):
#     aggc = AgglomerativeClustering(linkage=linkage, affinity=affinity, n_clusters=n_clusters)
#     aggc = aggc.fit(vectors)
#     return aggc.labels_

## 2.3 X-means

In [37]:
# from pyclustering.cluster.xmeans import xmeans
# from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
# from pyclustering.cluster import cluster_visualizer

In [73]:
# # Get the clusters of a xmeans
# def execute_xmeans(vecs,kmin,kmax):
#     amount_initial_centers = kmin
#     initial_centers = kmeans_plusplus_initializer(vecs, amount_initial_centers).initialize()

#     xmeans_instance = xmeans(vecs, initial_centers, kmax)
#     xmeans_instance.process()

#     groupes = xmeans_instance.get_clusters()
#     clusters = pd.Series(
#         {j : i for i,m in enumerate(groupes) for j in m}
#     ).sort_index().values
    
#     return clusters