## 0. Imports + Load Datas

In [1]:
## Imports

### Math
import numpy as np
import pandas as pd

### sklearn
#### Transformation
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD
#### Models
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering, FeatureAgglomeration
from sklearn.linear_model import LogisticRegression
from sklearn import svm
#### Model Optim
from sklearn.model_selection import GridSearchCV
#### Evaluation
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report
from sklearn.metrics import davies_bouldin_score
#### Pre-processing
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

### Viz
import  matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## nltk
import nltk
nltk.download('stopwords') ## à executer qu'une fois
from nltk.corpus import stopwords as sw
stopwords = sw.words("english")

### Autres
from scipy.stats import mode
from time import time
from collections import defaultdict

#DATAS
df = pd.read_csv("./Datas/Emotion_final.csv")
corpus = df.Text
le = preprocessing.LabelEncoder()
targets = le.fit_transform(df.Emotion)
nb_clusters = df.Emotion.nunique()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anthony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def run_models(models, splits, corpus, targets, test_size=0.2, seed=42):  
    res = defaultdict(list)
    spliter = ShuffleSplit(n_splits=splits, test_size=test_size, random_state=seed)
    for idx_train, idx_test in spliter.split(corpus, targets):
        for model in models:
            # name of the model
            name = "-".join([x[0] for x in model.steps])
            
            # extract datasets
            X_train = corpus[idx_train]
            X_test = corpus[idx_test]
            y_train = targets[idx_train]
            y_test = targets[idx_test]
            
            # Learn and save compute time
            start_time = time()
            model.fit(X_train, y_train)
            fit_time = time() - start_time
            
            # predict and save results
            y = model.predict(X_test)
            
            res[name].append([
                fit_time,
                model[-1].inertia_,
                metrics.v_measure_score(y_test, y),
                metrics.adjusted_rand_score(y_test, y),
                metrics.adjusted_mutual_info_score(y_test, y),
                metrics.normalized_mutual_info_score(y_test, y),
                metrics.silhouette_score(model.transform(X_test), y, metric="euclidean", sample_size=300),
                metrics.davies_bouldin_score(model.transform(X_test), y),
                metrics.calinski_harabasz_score(model.transform(X_test), y)
                
            ])


    return res


In [3]:
def print_results(res):
    # Compute mean and std
    final = {}
    for model in res:
        arr = np.array(res[model])
        final[model] = {
            "time (s)" : arr[:, 0].mean(),#.round(2),
            "inertia": [arr[:,1].mean().round(3), arr[:,1].std().round(3)],
            "v_meas": [arr[:,2].mean().round(3), arr[:,2].std().round(3)],
            "AR": [arr[:,3].mean().round(3), arr[:,3].std().round(3)],
            "AMI": [arr[:,4].mean().round(3), arr[:,4].std().round(3)],
            "NMI": [arr[:,5].mean().round(3), arr[:,5].std().round(3)],
            "silhouette": [arr[:,6].mean().round(3), arr[:,6].std().round(3)],
            "DBS": [arr[:,7].mean().round(3), arr[:,7].std().round(3)],
            "CHS": [arr[:,8].mean().round(3), arr[:,8].std().round(3)],
        }

    df = pd.DataFrame.from_dict(final, orient="index").round(3)
    return df

In [4]:
#pca = PCA(n_components=10).fit(digits.data)

pipe1 = Pipeline([
    ('vect_stop', CountVectorizer(stop_words=stopwords)),
    ('kmeans', KMeans(init="k-means++", n_clusters = nb_clusters, n_init=10)),
])

pipe2 = Pipeline([
    ('vect_stop', CountVectorizer(stop_words=stopwords)),
    ('tfidf', TfidfTransformer()),
    ('kmeans', KMeans(init="k-means++", n_clusters = nb_clusters, n_init=10)),
])

In [5]:
# run base pipes
res = run_models([pipe1, pipe2], splits=5, corpus=corpus, targets=targets)
print_results(res)

Unnamed: 0,time (s),inertia,v_meas,AR,AMI,NMI,silhouette,DBS,CHS
vect_stop-kmeans,2.126,"[148180.384, 285.179]","[0.006, 0.003]","[0.004, 0.006]","[0.004, 0.003]","[0.006, 0.003]","[-0.062, 0.016]","[4.591, 0.785]","[68.046, 7.783]"
vect_stop-tfidf-kmeans,4.192,"[16716.045, 4.083]","[0.006, 0.001]","[0.006, 0.005]","[0.004, 0.001]","[0.006, 0.001]","[0.36, 0.032]","[1.017, 0.207]","[1097.999, 803.051]"


In [6]:
#pca = PCA(n_components=10).fit(digits.data)

pipe1 = Pipeline([
    ('vect_stop', CountVectorizer(stop_words=stopwords)),
    ('kmeans', KMeans(init="random", n_clusters = nb_clusters, n_init=10)),
])

pipe2 = Pipeline([
    ('vect_stop', CountVectorizer(stop_words=stopwords)),
    ('tfidf', TfidfTransformer()),
    ('kmeans', KMeans(init="random", n_clusters = nb_clusters, n_init=10)),
])
res = run_models([pipe1, pipe2], splits=5, corpus=corpus, targets=targets)
print_results(res)

Unnamed: 0,time (s),inertia,v_meas,AR,AMI,NMI,silhouette,DBS,CHS
vect_stop-kmeans,2.036,"[148707.291, 562.321]","[0.006, 0.001]","[0.005, 0.002]","[0.004, 0.001]","[0.006, 0.001]","[-0.035, 0.074]","[4.394, 0.592]","[89.046, 8.744]"
vect_stop-tfidf-kmeans,3.889,"[16722.755, 9.359]","[0.006, 0.001]","[0.004, 0.004]","[0.004, 0.001]","[0.006, 0.001]","[0.421, 0.037]","[0.856, 0.082]","[1033.513, 740.452]"


In [7]:
#pca = PCA(n_components=10).fit(digits.data)

pipe1 = Pipeline([
    ('vect_stop', CountVectorizer(stop_words=stopwords)),
    ('truncSVD', TruncatedSVD(n_components=50, random_state=42)),
    ('kmeans', KMeans(init="k-means++", n_clusters = nb_clusters, n_init=10)),
])

pipe2 = Pipeline([
    ('vect_stop', CountVectorizer(stop_words=stopwords)),
    ('tfidf', TfidfTransformer()),
    ('truncSVD', TruncatedSVD(n_components=50, random_state=42)),
    ('kmeans', KMeans(init="k-means++", n_clusters = nb_clusters, n_init=10)),
])
# run base pipes
res = run_models([pipe1, pipe2], splits=5, corpus=corpus, targets=targets)
print_results(res)

Unnamed: 0,time (s),inertia,v_meas,AR,AMI,NMI,silhouette,DBS,CHS
vect_stop-truncSVD-kmeans,1.767,"[27777.364, 181.171]","[0.009, 0.001]","[0.011, 0.003]","[0.007, 0.001]","[0.009, 0.001]","[0.168, 0.029]","[1.866, 0.135]","[410.592, 14.185]"
vect_stop-tfidf-truncSVD-kmeans,1.978,"[1522.012, 7.769]","[0.006, 0.002]","[0.003, 0.004]","[0.004, 0.002]","[0.006, 0.002]","[-0.009, 0.019]","[3.257, 0.083]","[118.581, 8.138]"


In [8]:
X_train, X_test, y_train, y_test = train_test_split(corpus, targets, test_size=0.2, random_state=42)
vect = CountVectorizer(stop_words=stopwords)
X = vect.fit_transform(X_train)
trunSVD = TruncatedSVD(n_components=nb_clusters).fit(X)

pipe1 = Pipeline([
    ('vect_stop', CountVectorizer(stop_words=stopwords)),
    ('kmeans', KMeans(init=trunSVD.components_, n_clusters = nb_clusters, n_init=10)),
])

pipe2 = Pipeline([
    ('vect_stop', CountVectorizer(stop_words=stopwords)),
    ('tfidf', TfidfTransformer()),
    ('kmeans', KMeans(init=trunSVD.components_, n_clusters = nb_clusters, n_init=10)),
])
# run base pipes
res = run_models([pipe1, pipe2], splits=1, corpus=corpus, targets=targets)
print_results(res)

  self._check_params(X)
  self._check_params(X)


Unnamed: 0,time (s),inertia,v_meas,AR,AMI,NMI,silhouette,DBS,CHS
vect_stop-kmeans,0.493,"[147119.768, 0.0]","[0.009, 0.0]","[0.008, 0.0]","[0.007, 0.0]","[0.009, 0.0]","[-0.114, 0.0]","[4.901, 0.0]","[75.094, 0.0]"
vect_stop-tfidf-kmeans,0.492,"[16716.508, 0.0]","[0.008, 0.0]","[0.006, 0.0]","[0.006, 0.0]","[0.008, 0.0]","[0.309, 0.0]","[1.392, 0.0]","[510.076, 0.0]"


Pas d'amélioration notable en donnant l'init du kmeans à partir de la réduction de features.

In [9]:
pipe1 = Pipeline([
    ('vect_stop', CountVectorizer(stop_words=stopwords)),
    ('truncSVD', TruncatedSVD(n_components=50, random_state=42)),
    ('Spetral_clu', SpectralClustering(n_clusters = nb_clusters, n_init=10)),
])

pipe2 = Pipeline([
    ('vect_stop', CountVectorizer(stop_words=stopwords)),
    ('truncSVD', TruncatedSVD(n_components=50, random_state=42)),
    ('Agglo_clu', AgglomerativeClustering(n_clusters = nb_clusters)),
])

pipe3 = Pipeline([
    ('vect_stop', CountVectorizer(stop_words=stopwords)),
    ('truncSVD', TruncatedSVD(n_components=50, random_state=42)),
    ('Feature_agglo', FeatureAgglomeration(n_clusters = nb_clusters)),
])

## run base pipes
#res = run_models([pipe1, pipe2, pipe3], splits=1, corpus=corpus, targets=targets)
#print_results(res)

ne fonctionne pas avec la fonction run_model car pas de methode predict

In [None]:
for model in [pipe1, pipe2, pipe3]:
    # name of the model
    name = "-".join([x[0] for x in model.steps])

    # Learn and save compute time
    start_time = time()
    model.fit(corpus, targets)
    fit_time = time() - start_time

    res[name].append([
        fit_time,
        model[-1].inertia_,
        metrics.v_measure_score(targets, model[-1].labels_),
        metrics.adjusted_rand_score(targets, model[-1].labels_),
        metrics.adjusted_mutual_info_score(targets, model[-1].labels_),
        metrics.normalized_mutual_info_score(targets, model[-1].labels_),
        metrics.silhouette_score(model.transform(X_test), model[-1].labels_, metric="euclidean", sample_size=300),
        metrics.davies_bouldin_score(model.transform(X_test), model[-1].labels_),
        metrics.calinski_harabasz_score(model.transform(X_test), model[-1].labels_)

    ])

