In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import confusion_matrix, make_scorer, precision_score, recall_score, f1_score

import seaborn as sns
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# Load each split of the 20 Newsgroups corpus once. Headers, footers and
# quoted replies are stripped via the `remove` argument.
newsgroups_train = fetch_20newsgroups(remove=("headers", "footers", "quotes"), subset='train')
newsgroups_test = fetch_20newsgroups(remove=("headers", "footers", "quotes"), subset='test')

# Reuse the bunches loaded above instead of fetching the corpus a second time:
# `return_X_y=True` returns exactly the bunch's (data, target) pair.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Category names come from the training bunch. The previous code read them
# from `newsgroups_train_short`, which is not defined in this notebook and
# raised NameError on a fresh kernel.
z = list(newsgroups_train.target_names)
z  # show newsgroup categories

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# TF-IDF vectoriser: English stop words removed, terms kept only if they occur
# in at least 2 documents and in at most 50% of the documents.
TfidV = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.5)

# Learn the vocabulary on the training texts only, then project the test texts
# onto that same vocabulary.
TfidV_train = TfidV.fit_transform(X_train)
TfidV_test = TfidV.transform(X_test)
print(TfidV_train.shape, TfidV_test.shape, sep="\n")

# Dense copy of the training matrix, transposed (one row per term, one column
# per document). NOTE(review): `toarray()` densifies the whole matrix, which is
# memory-hungry at this size - confirm this frame is actually needed downstream.
df_TfidV_train = pd.DataFrame(TfidV_train.toarray().T)

(11314, 39115)
(7532, 39115)


In [6]:
# One cluster per newsgroup category.
k = len(z)

# K-means initialisation is random: without a fixed seed, cluster ids, cluster
# sizes and the top words per cluster change on every Restart & Run All.
# `random_state` pins them; `n_clusters` is passed by keyword for clarity.
model_Tfid = KMeans(n_clusters=k, max_iter=100, random_state=42)

model_Tfid.fit(TfidV_train)



In [8]:
def cluster_top_words(M, V, n, t):
    """Tabulate the highest-weighted terms of the first `n` cluster centroids.

    Parameters
    ----------
    M : fitted clustering model exposing ``cluster_centers_``.
    V : fitted vectorizer exposing ``get_feature_names_out()``.
    n : number of clusters to report (rows 0..n-1 of ``cluster_centers_``).
    t : number of top terms to keep per cluster.

    Returns
    -------
    pandas.DataFrame with one column per cluster, labelled "Cluster 1:",
    "Cluster 2:", ..., each holding that cluster's terms ordered from the
    largest centroid weight downwards.
    """
    # Feature indices of every centroid, reordered by descending weight.
    ranked_features = M.cluster_centers_.argsort()[:, ::-1]
    vocabulary = V.get_feature_names_out()

    top_terms = {
        "Cluster %d:" % (cluster + 1): [
            vocabulary[feature] for feature in ranked_features[cluster, :t]
        ]
        for cluster in range(n)
    }
    return pd.DataFrame.from_dict(top_terms)



In [9]:
# Ten highest-weighted terms for each of the k K-means clusters.
top_10_Tfid = cluster_top_words(M=model_Tfid, V=TfidV, n=k, t=10)
top_10_Tfid

Unnamed: 0,Cluster 1:,Cluster 2:,Cluster 3:,Cluster 4:,Cluster 5:,Cluster 6:,Cluster 7:,Cluster 8:,Cluster 9:,Cluster 10:,Cluster 11:,Cluster 12:,Cluster 13:,Cluster 14:,Cluster 15:,Cluster 16:,Cluster 17:,Cluster 18:,Cluster 19:,Cluster 20:
0,space,key,window,car,think,armenian,god,gun,does,drive,people,like,game,thanks,file,windows,israel,dsl,bike,team
1,00,encryption,use,cars,don,armenians,jesus,guns,know,scsi,don,just,games,advance,files,card,israeli,n3jxp,ride,year
2,sale,chip,problem,ax,say,turkish,bible,people,anybody,ide,just,edu,hockey,mail,windows,dos,jews,chastity,bikes,players
3,offer,clipper,program,engine,objective,turks,christ,crime,like,drives,know,com,play,hi,ftp,drivers,arab,cadre,riding,season
4,shipping,keys,using,like,morality,armenia,christians,law,just,controller,like,good,season,know,directory,video,arabs,shameful,helmet,teams
5,condition,escrow,need,dealer,science,turkey,faith,militia,board,disk,did,new,espn,info,zip,driver,israelis,geb,dod,pitching
6,new,government,screen,speed,moral,genocide,believe,right,info,hard,government,use,detroit,email,program,mouse,adam,intellect,motorcycle,good
7,price,secure,like,just,evidence,people,people,firearms,say,floppy,think,don,rangers,looking,format,version,jewish,skepticism,dog,win
8,asking,algorithm,apple,good,just,greek,christian,amendment,mean,bus,time,time,playoffs,does,dos,memory,lebanese,pitt,just,braves
9,10,nsa,color,new,people,azerbaijan,church,weapons,work,hd,right,know,league,address,pub,know,lebanon,surrender,like,league


In [16]:
# Number of training documents per true category.
labels_train = newsgroups_train.target
unique_labels_train, category_sizes_train = np.unique(labels_train, return_counts=True)

# Number of training documents per K-means cluster.
cluster_ids_Tfid, cluster_sizes_Tfid = np.unique(model_Tfid.labels_, return_counts=True)

# Each column is sorted independently, so a row does NOT pair a category with
# "its" cluster; the table only compares the two size distributions.
sorted_sizes = {
    "True_category_size": sorted(category_sizes_train),
    "model_Tfid_short_Cluster size": sorted(cluster_sizes_Tfid),
}
Cluster_size = pd.DataFrame(sorted_sizes)

# Append a row with the column sums.
Cluster_size.loc['Total'] = Cluster_size.sum()
Cluster_size

Unnamed: 0,True_category_size,model_Tfid_short_Cluster size
0,377,72
1,465,113
2,480,179
3,546,212
4,564,238
5,578,255
6,584,258
7,585,273
8,590,294
9,591,301


In [24]:
# Keep a handle on the cluster id assigned to each training document.
test = model_Tfid.labels_

# True category ids first, then cluster ids, one array per line.
print(labels_train, model_Tfid.labels_, sep="\n")

[7 4 4 ... 3 1 8]
[ 3 11 13 ... 11  0 11]


In [25]:
# Distinct true category ids present in the training labels.
np.unique(labels_train)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [26]:
# Distinct cluster ids assigned by the fitted K-means model.
np.unique(model_Tfid.labels_)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19], dtype=int32)