### Importation et exploration du dataset

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
data = fetch_20newsgroups(subset='all')

### Compliqué d'effectuer une analyse descriptive avec le dataset 

In [4]:
print(f"Dimension de data : {len(data.data)}")

Dimension du dataset : 18846


In [5]:
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### Separation du dataset 

In [12]:
from sklearn.model_selection import train_test_split

X_train, x_test, y_train, y_test = train_test_split(data.data, data.target, 
                                                    train_size=1000, test_size=250, 
                                                    shuffle=True, stratify=data.target)

## Vectorisation avec Tf-idf
Objectif : Transformer les mots, ou phrases en vecteurs de nombres qui peuvent être utilisés comme entrées pour mon modèle d'apprentissage automatique

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

### implementation du support vector machin

In [14]:
from sklearn import svm

classif_model_tfidf = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto',probability=True)    
classif_model_tfidf.fit(X_train_tfidf,y_train)

SVC(gamma='auto', kernel='linear', probability=True)

In [15]:
y_pred_tfidf = classif_model_tfidf.predict(x_test_tfidf)
y_pred_tfidf

array([13,  1,  6,  6, 13, 12, 17, 12, 14,  5, 13, 17, 10, 17,  4,  1, 18,
       10, 12,  3, 10,  2,  1,  1,  6,  1,  6, 14,  6,  8,  5, 11,  4, 13,
        8,  2,  6, 12,  4,  9,  6, 18,  8,  5,  4, 14, 15,  3, 11, 13, 14,
        4, 12,  6, 17, 16,  5, 19, 15,  0, 13, 15,  8, 13, 12, 16,  7,  9,
        5,  0, 12, 16,  8,  8, 16, 18, 16, 12,  3, 13, 12, 15, 17, 10,  5,
       10,  9, 16,  9,  3,  5,  0, 19,  1, 14,  7,  2, 13,  3,  1, 13,  2,
       11, 16,  6, 12, 17, 12, 13,  4,  8, 14,  1,  9,  3,  0, 12,  3,  7,
       19, 11, 12,  1, 10,  3,  0, 12,  8, 17, 14, 16,  9, 19,  6,  9,  8,
        4,  3,  4,  1, 18, 15,  9,  9, 16, 15,  0, 12,  6, 11, 17,  3,  2,
        6, 15, 11,  2,  4,  3,  1,  0, 15, 17, 14,  5,  5,  0,  7,  6,  9,
        1, 17, 12,  3, 16, 11,  3, 12, 18, 17, 19,  3, 14, 15, 16, 15, 14,
        1,  2, 10, 13,  2,  2,  7, 15,  6,  6,  2,  7, 15,  8,  8, 12,  6,
        6, 11, 14, 13,  7, 15, 15,  8, 17, 14,  2,  8, 13, 15, 17,  1,  1,
        7,  7,  2,  7,  3

In [10]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_tfidf))

              precision    recall  f1-score   support

           0       0.69      0.82      0.75        11
           1       0.80      0.62      0.70        13
           2       0.89      0.62      0.73        13
           3       0.54      0.54      0.54        13
           4       0.77      0.77      0.77        13
           5       0.56      0.69      0.62        13
           6       0.67      0.46      0.55        13
           7       0.69      0.69      0.69        13
           8       0.91      0.77      0.83        13
           9       1.00      0.92      0.96        13
          10       1.00      1.00      1.00        13
          11       0.85      0.85      0.85        13
          12       0.42      0.77      0.54        13
          13       0.71      0.77      0.74        13
          14       0.79      0.85      0.81        13
          15       0.63      0.92      0.75        13
          16       0.50      0.58      0.54        12
          17       0.83    

In [16]:
# Creation d'une fonction de nettoyage des caractéres indésirables 
def clean_text(text):
    clean_text = text.replace('\n','')
    clean_text = clean_text.replace('\t','') 
    return clean_text

X_train_pre = [clean_text(news) for news in X_train]
x_test_pre = [clean_text(news) for news in x_test]

tfidf_vectorizer = TfidfVectorizer()
X_train_pre_tfidf = tfidf_vectorizer.fit_transform(X_train_pre)
x_test_pre_tfidf = tfidf_vectorizer.transform(x_test_pre)

classif_model_pre_tfidf = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto',probability=True)    
classif_model_pre_tfidf.fit(X_train_pre_tfidf,y_train)
y_pred_pre_tfidf = classif_model_pre_tfidf.predict(x_test_pre_tfidf)

###  Effectuons une analyse de la décomposition latente de Dirichlet (LDA) sur les données

In [17]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=20)
lda.fit(X_train_tfidf)
y_pred_lda = lda.transform(x_test_tfidf)

### les mots représentatifs de chacun des clusters.

In [18]:
for index,topic in enumerate(lda.components_):
    print(f'10 mots majoritairement représentant le cluster {index} : ')
    print([tfidf_vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    print('\n')

10 mots majoritairement représentant le cluster 0 : 


AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names_out'

In [19]:
import numpy as np
from sklearn.metrics.cluster import adjusted_rand_score

y_pred_clusters = []
for pred in y_pred_lda:
  y_pred_clusters.append(np.argmax(pred))

adjusted_rand_score(y_test, y_pred_clusters)

-5.7937137765773566e-05