1) Import the 20 newsgroups text dataset

In [33]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import confusion_matrix, make_scorer, precision_score, recall_score, f1_score
import seaborn as sns
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import os, re
import numpy as np
pd.set_option('display.max_colwidth', 200)


# Load the training split once and reuse it for everything below.
newsgroups_train = fetch_20newsgroups(subset='train')
# Raw article texts (x) and their newsgroup labels (y), taken from the object
# loaded above instead of fetching the dataset a second time.
x, y = newsgroups_train.data, newsgroups_train.target
from pprint import pprint
# Newsgroup names; the original read them from an undefined `newsgroups` variable.
z = list(newsgroups_train.target_names)
z


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
len(y) # Explore: number of articles

11314

In [7]:
print(x[1988]) # Explore: example of one article (raw text, headers included)

From: smithw@col.hp.com (Walter Smith)
Subject: Re: Playoff predictions
Organization: Colorado Springs IT Center
Lines: 41
NNTP-Posting-Host: fajita19.cs.itc.hp.com

OK, I'll join in the fun and give my playoff predictions: 

1st round: 
----------

PITT vs NYI:  PITT in 4.  
WASH vs NJD:  WASH in 6. 

BOS  vs BUF:  BOS  in 5. 
QUE  vs MON:  MON  in 7. 

CHI  vs STL:  CHI in 4. 
DET  vs TOR:  DET in 6. 

VAN  vs WIN:  WIN in 6. 
CAL  vs  LA:  CAL in 5. 

2nd round: 
----------

PITT vs WASH: PITT in 4. 
BOS  vs MON:  BOS  in 6. 

CHI  vs DET:  CHI  in 7. 
WIN  vs CAL:  CAL  in 5. 

3rd round: 
----------

PITT vs BOS:  PITT in 5. 
CHI  vs CAL:  CHI  in 5. 

Finals:
------

PITT vs CHI: PITT in 5. 


Walter




2) Vectorize the text so that it can be clustered with K-means, which requires numerical values

First, TfidfVectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: ignore terms present in more than half of the documents
# (max_df=0.5) or in fewer than two documents (min_df=2), plus English stop words.
# The articles `x` are already loaded, so the dataset is not fetched again here.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')
vectors = vectorizer.fit_transform(x)
vectors.shape

(11314, 56121)

In [9]:
def clust_metrics(estimator, data, y=None):
    """Fit a clustering estimator and tabulate its evaluation metrics.

    The estimator is (re)fitted on ``data`` inside this function, so it can be
    passed unfitted; an already fitted estimator is simply fitted again.

    Parameters
    ----------
    estimator : object
        Clustering model exposing ``fit(data)`` and, once fitted, ``labels_``.
    data : array-like or sparse matrix
        Feature matrix handed to ``estimator.fit`` and to the silhouette score.
    y : array-like, optional
        Ground-truth labels. When given, the ground-truth based metrics are
        reported in addition to the silhouette score.

    Returns
    -------
    pandas.DataFrame
        A single column named ``"Clustering Metrics"`` indexed by metric name,
        with every score rounded to two decimals. The silhouette score is
        always the last row (and the only row when ``y`` is None).

    Notes
    -----
    Calinski-Harabasz and Davies-Bouldin scores are intentionally not computed:
    according to the notebook notes they raised errors on this data.
    """
    import pandas as pd
    from sklearn.metrics import (
        adjusted_mutual_info_score,
        adjusted_rand_score,
        completeness_score,
        fowlkes_mallows_score,
        homogeneity_score,
        mutual_info_score,
        normalized_mutual_info_score,
        rand_score,
        silhouette_score,
        v_measure_score,
    )

    # Each label is kept next to its function so the row names can never drift
    # out of step with the metrics actually computed.
    # Ground-truth metrics are called as metric(y, predicted_labels).
    ground_truth_metrics = {
        'Rand_Index': rand_score,
        'Adjusted_Rand_Index': adjusted_rand_score,
        'Mutual_Information': mutual_info_score,
        'Normalized_Mutual_Information': normalized_mutual_info_score,
        'Adjusted_Mutual_Information': adjusted_mutual_info_score,
        'Homogeneity_score': homogeneity_score,
        'Completeness_score': completeness_score,
        'V_measure_score': v_measure_score,
        'Fowlkes_Mallows_score': fowlkes_mallows_score,
    }
    # Internal metrics are called as metric(data, predicted_labels).
    internal_metrics = {
        'Silhouette_score': silhouette_score,
    }

    estimator.fit(data)
    labels = estimator.labels_

    scores = {}
    if y is not None:
        for label, metric in ground_truth_metrics.items():
            scores[label] = round(metric(y, labels), 2)
    # Internal metrics need no ground truth, so they are reported in both cases.
    for label, metric in internal_metrics.items():
        scores[label] = round(metric(data, labels), 2)

    return pd.DataFrame({"Clustering Metrics": list(scores.values())}, index=list(scores))

3) Train the K-means clustering model and evaluate the clustering

In [11]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# clust_metrics() fits the estimator itself, so the model is passed unfitted:
# calling .fit() here as well would run the expensive K-means training twice.
# One cluster per newsgroup (20); random_state pins the centroid initialisation
# so that the reported scores are reproducible across runs.
model_Tfid = KMeans(n_clusters=20, n_init=10, random_state=42)

Kmeans_score_Tfid = clust_metrics(model_Tfid, vectors, y)

The clustering scores are quite poor. The silhouette score is very low, which means that the clusters overlap. K-means does not seem to be a good method for clustering this text.

See the metric scores below, where they are compared with the next vectorization method.

Let's try another vectorization method: 'CountVectorizer' instead of 'TfidfVectorizer'.

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts with CountVectorizer's default settings, fitted on the same
# articles `x` (already loaded, so the dataset is not fetched again here).
# NOTE: this rebinds `vectorizer`, replacing the TF-IDF vectorizer fitted earlier.
vectorizer = CountVectorizer()
Cvectors = vectorizer.fit_transform(x)
Cvectors.shape

(11314, 130107)

In [13]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# clust_metrics() fits the estimator itself, so the model is passed unfitted:
# calling .fit() here as well would run the expensive K-means training twice.
# random_state pins the centroid initialisation so the scores are reproducible.
model_CountVect = KMeans(n_clusters=20, n_init=10, random_state=42)

Kmeans_score_CountV = clust_metrics(model_CountVect, Cvectors, y)

Build a table containing the scores for each vectorization method.

In [32]:
# Relabel the TF-IDF score column so that each column names its vectorizer.
Kmeans_score_Tfid = Kmeans_score_Tfid.rename(
    columns={'Clustering Metrics': 'Tfid Vectorizer'}
)

# Add the CountVectorizer scores as plain values (positional, same metric order).
pdToList = Kmeans_score_CountV['Clustering Metrics'].tolist()
Kmeans_score_Tfid['CounterVect'] = pdToList

# Side-by-side comparison of both vectorization methods.
Kmeans_score_Tfid



Unnamed: 0,Tfid Vectorizer,CounterVect
Rand_Index,0.83,0.5
Adjusted_Rand_Index,0.14,0.01
Mutual_Information,1.22,0.08
Normalized_Mutual_Information,0.46,0.04
Adjusted_Mutual_Information,0.45,0.03
Homogeneity_score,0.41,0.03
Completeness_score,0.51,0.08
V_measure_score,0.46,0.04
Fowlkes_Mallows_score,0.24,0.17
Silhouette_score,0.01,0.23


The evaluation metrics are still very poor. However, the TF-IDF vectorization scores higher than CountVectorizer on every ground-truth metric; only the silhouette score is higher with CountVectorizer.
Note that two metrics, 'Calinski_Harabasz_score' and 'Davies_Bouldin_score', raised errors during computation and were removed from the evaluation function.

Next, I am going to clean the text by removing repeated information such as headers and footers.