# Vectorizer 

## Import

In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# librairies générales
import pandas as pd
import re
from tabulate import tabulate
import time
import numpy as np
import pickle
import string
import base64
import sys

# librairie affichage
import matplotlib.pyplot as plt
import seaborn as sns

# librairies scikit learn
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


# librairies des classifiers utilisés
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# librairies NLTK
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
from nltk import word_tokenize 

# Fetch the NLTK resources this notebook relies on, in a fixed order.
for _resource in ('wordnet', 'stopwords', 'punkt', 'omw-1.4'):
    nltk.download(_resource)

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

from MyNLPUtilities import *

[nltk_data] Downloading package wordnet to /home/oem/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/oem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/oem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/oem/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Load the labelled news corpus; the commented line switches to the
# "news400.csv" file instead.
news = pd.read_csv("news8904.csv", sep=",")
#news = pd.read_csv("news400.csv", sep=",")

# Features: every column except the label, as a NumPy array.
X = news.drop(columns=['ratingName']).to_numpy()

# Target: the 'ratingName' column cast to int (True/False -> 1/0), as a NumPy array.
Y = news['ratingName'].astype(int).to_numpy()

## Nettoyeur de texte

In [3]:
import re
import string

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
from nltk import word_tokenize

# Same NLTK resources as requested in the import cell (minus 'omw-1.4'),
# fetched again so this cell can run on its own.
for _resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(_resource)

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def MyCleanText(X, 
               lowercase=True, # lowercase every token
               removestopwords=False, # drop English stop words
               removedigit=False, # drop purely numeric tokens
               getstemmer=False, # reduce tokens to their Porter stem
               getlemmatisation=True # lemmatise tokens with WordNet
              ):
    """Clean a single document and return it as a space-separated string.

    Steps, in order: convert ``X`` with ``str()``; replace non-word,
    non-whitespace characters with spaces; remove isolated single ASCII
    letters; collapse whitespace; tokenise with ``word_tokenize``; optionally
    lowercase; strip ``string.punctuation`` characters from tokens; keep only
    alphanumeric tokens; optionally drop numeric tokens and stop words;
    optionally lemmatise and/or stem.

    Parameters
    ----------
    X : object
        The document. It is converted with ``str(X)``, so non-string values
        are cleaned through their string representation.
    lowercase : bool
        Lowercase every token.
    removestopwords : bool
        Drop tokens found in the module-level ``stop_words`` set.
    removedigit : bool
        Drop tokens for which ``str.isdigit()`` is true.
    getstemmer : bool
        Apply ``PorterStemmer`` (after lemmatisation when both are enabled).
    getlemmatisation : bool
        Apply ``WordNetLemmatizer``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    sentence = str(X)

    # Replace punctuation / symbols (anything that is neither a word
    # character nor whitespace) with a space.
    sentence = re.sub(r'[^\w\s]', ' ', sentence)
    # Remove single ASCII letters surrounded by whitespace.  Lookarounds are
    # used instead of consuming the neighbouring whitespace: re.sub matches
    # never overlap, so a pattern that swallows the trailing space skips every
    # other letter in a run such as "x a b c y".
    sentence = re.sub(r'(?<=\s)[a-zA-Z](?=\s)', ' ', sentence)
    # Collapse runs of whitespace into one space.
    sentence = re.sub(r'\s+', ' ', sentence)

    # Split into tokens.
    tokens = word_tokenize(sentence)
    if lowercase:
        tokens = [token.lower() for token in tokens]

    # Strip string.punctuation characters from inside tokens.  The first regex
    # keeps '_' because it counts as a word character; it is removed here.
    table = str.maketrans('', '', string.punctuation)
    words = [token.translate(table) for token in tokens]

    # Keep alphanumeric tokens only (this also drops tokens left empty above).
    words = [word for word in words if word.isalnum()]

    # Drop purely numeric tokens.
    if removedigit:
        words = [word for word in words if not word.isdigit()]

    # Drop stop words.  The comparison is done on the lowercased token so that
    # capitalised stop words are also removed when lowercase=False (the NLTK
    # list is lowercase).
    if removestopwords:
        words = [word for word in words if word.lower() not in stop_words]

    # Lemmatisation.
    if getlemmatisation:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    # Stemming.
    if getstemmer:
        ps = PorterStemmer()
        words = [ps.stem(word) for word in words]

    return ' '.join(words)

[nltk_data] Downloading package wordnet to /home/oem/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/oem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/oem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Normaliseur de texte

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextNormalizer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that cleans each document with MyCleanText.

    The constructor only stores the cleaning flags; ``fit`` learns nothing,
    and ``transform`` forwards the flags to ``MyCleanText`` for every item.
    """

    def __init__(self, 
                 removestopwords=False, # drop stop words
                 lowercase=True, # lowercase tokens
                 removedigit=False, # drop purely numeric tokens
                 getstemmer=False, # stem tokens
                 getlemmatisation=True # lemmatise tokens
                ):
        
        self.lowercase=lowercase
        self.getstemmer=getstemmer
        self.removestopwords=removestopwords
        self.getlemmatisation=getlemmatisation
        self.removedigit=removedigit

    def transform(self, X, **transform_params):
        """Return a list with the cleaned string for every item of ``X``.

        ``X`` is only iterated and never modified, so no defensive copy is
        made: this avoids duplicating the whole corpus and lets inputs
        without a ``.copy()`` method (tuples, generators) work too.
        Each item goes through ``MyCleanText``, which applies ``str()`` to it.
        """
        return [MyCleanText(text,lowercase=self.lowercase,
                            getstemmer=self.getstemmer,
                            removestopwords=self.removestopwords,
                            getlemmatisation=self.getlemmatisation,
                            removedigit=self.removedigit) for text in X]

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the transformer can be chained."""
        return self
    
    def fit_transform(self, X, y=None, **fit_params):
        """Equivalent to ``fit(X)`` followed by ``transform(X)``."""
        return self.fit(X).transform(X)

    def get_params(self, deep=True):
        """Return the constructor flags as a dict.

        Keep this dict in sync with ``__init__`` when adding a flag.
        """
        return {
            'lowercase':self.lowercase,
            'getstemmer':self.getstemmer,
            'removestopwords':self.removestopwords,
            'getlemmatisation':self.getlemmatisation,
            'removedigit':self.removedigit
        }    
    
    def set_params (self, **parameters):
        """Set each given name as an attribute on the instance and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self,parameter,value)
        return self  

## Vectorisation

In [5]:
from sklearn.decomposition import PCA
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer 

In [18]:
# TF-IDF vectoriser with every option written out explicitly so that each
# one can be tuned in place.
vectorizer = TfidfVectorizer(
    # --- how raw documents are read ---
    input='content',
    encoding='utf-8',
    decode_error='strict',
    # --- pre-processing and tokenisation ---
    strip_accents=None,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    analyzer='word',
    ngram_range=(1, 1),
    # --- vocabulary selection ---
    max_df=1.0,
    min_df=1,
    max_features=None,
    # --- term weighting ---
    binary=False,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

In [8]:
# Bag-of-words alternative with every option written out explicitly.
# NOTE: this rebinds `vectorizer`; when the notebook is run top to bottom it
# replaces the TfidfVectorizer built in the previous cell.
vectorizer = CountVectorizer(   input='content', 
                                encoding='utf-8', 
                                decode_error='strict', 
                                strip_accents=None, 
                                lowercase=True, 
                                preprocessor=None, 
                                tokenizer=None, 
                                stop_words=None, 
                                # Must be a raw string: in a normal literal '\b' is the
                                # backspace character, so the regex would require literal
                                # backspaces around each token and match nothing in
                                # ordinary text (empty vocabulary).
                                token_pattern=r'(?u)\b\w\w+\b', 
                                ngram_range=(1, 1), 
                                analyzer='word', 
                                max_df=1.0, 
                                min_df=1, 
                                max_features=None, 
                                vocabulary=None, 
                                binary=False)

#vectorizer = CountVectorizer(stop_words='english', max_features=1500, analyzer='char')

In [19]:
texte = news['text']

# NOTE(review): the message names TfidfVectorizer, but the cell uses whatever
# object `vectorizer` is currently bound to.
print ("Application de TfidfVectorizer :")
#vectorizer = TfidfVectorizer()
# Learn the vocabulary and vectorise the corpus in one pass.
X = vectorizer.fit_transform(texte)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense document-term frame for display and PCA.  The matrix returned by
# fit_transform() is reused instead of vectorising the corpus a second time.
df = pd.DataFrame(
    data=X.toarray(),
    columns=feature_names
)

#display(df)
X = df

# Project the documents onto their first two principal components.
pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Colour each point by its label; assumes Y is aligned row-for-row with
# `news` -- TODO confirm when cells are run out of order.
fig = px.scatter(components, x=0, y=1, color=Y) 
fig.show()

Application de TfidfVectorizer :


## Tester plusieurs modèles

In [20]:
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import xgboost 

In [21]:
from sklearn.svm import SVC
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
import xgboost 

#news = pd.read_csv("news8904.csv", sep=",")
news = pd.read_csv("news400.csv", sep=",")

# Features: every column except the label, as a NumPy array.
# NOTE(review): X holds whole rows, and TextNormalizer passes each row to
# MyCleanText, which applies str() to it -- confirm that feeding full rows
# rather than only the 'text' column is intended.
X = news.drop(columns=['ratingName']).to_numpy()

# Target: the 'ratingName' column cast to int (True/False -> 1/0), as a NumPy array.
Y = news['ratingName'].astype(int).to_numpy()

# Hold-out split: 70% for training, the remaining 30% for testing.
trainsize = 0.7
testsize = 0.3
seed = 30

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=trainsize,
    test_size=testsize,
    random_state=seed,
)

In [22]:
# Classifier classes to compare.  They are stored uninstantiated; the pipeline
# cell calls each one with no arguments.  Order determines the report order.
models = [
    DummyClassifier,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    SVC,
    RandomForestClassifier,
    xgboost.XGBClassifier,
    LogisticRegression,
]

In [23]:
from sklearn.base import clone

pipes = []

# The normaliser keeps no fitted state (its fit() just returns self), so one
# instance can safely be shared by every pipeline.
normalizer = TextNormalizer()

for model in models:
    pipes.append(Pipeline([
        ("cleaner", normalizer),
        # Each pipeline gets its own unfitted copy of the vectoriser.  Sharing
        # the single global instance means every pipe.fit() overwrites the
        # vocabulary that the previously fitted pipelines rely on.
        ("count_vectorizer", clone(vectorizer)),
        ("classifier", model())]))

In [24]:
# Fit each pipeline on the training split, predict on the test split and
# print the classifier's repr followed by the string from getAccuracy.
for pipe in pipes:
    pipe.fit(X_train, Y_train)
    y_pred = pipe.predict(X_test)
    print(" ".join((str(pipe["classifier"]), getAccuracy(Y_test, y_pred))))
    #MyshowAccuracy(Y_test,y_pred)
    #MyshowAllScores(Y_test,y_pred)

DummyClassifier() Accuracy : 0.433
DecisionTreeClassifier() Accuracy : 0.525
KNeighborsClassifier() Accuracy : 0.592
SVC() Accuracy : 0.600
RandomForestClassifier() Accuracy : 0.625
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...) Accuracy : 0.625
LogisticRegression() Accuracy : 0.592


## Résultats

pipes.append(Pipeline([
      ("cleaner", normalizer), 
      ("count_vectorizer", CountVectorizer(stop_words='english', max_features=10)), 
      ("classifier", model())]))

DummyClassifier() Accuracy : 0.433

pipes.append(Pipeline([
      ("cleaner", normalizer), 
      ("count_vectorizer", CountVectorizer(stop_words='english', max_features=1500, ngram_range=(1, 5))), 
      ("classifier", model())]))
DecisionTreeClassifier() Accuracy : 0.550
KNeighborsClassifier() Accuracy : 0.558
SVC() Accuracy : 0.558
RandomForestClassifier() Accuracy : 0.542
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...) Accuracy : 0.575
LogisticRegression() Accuracy : 0.575

pipes.append(Pipeline([
      ("cleaner", normalizer), 
      ("count_vectorizer", CountVectorizer(stop_words='english', max_features=3000)), 
      ("classifier", model())]))

DummyClassifier() Accuracy : 0.433
DecisionTreeClassifier() Accuracy : 0.575
KNeighborsClassifier() Accuracy : 0.533
SVC() Accuracy : 0.567
RandomForestClassifier() Accuracy : 0.633
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...) Accuracy : 0.583
LogisticRegression() Accuracy : 0.583

pipes.append(Pipeline([
      ("cleaner", normalizer), 
      ("count_vectorizer", CountVectorizer(stop_words='english', max_features=1500, ngram_range=(1, 4))), 
      ("classifier", model())]))

DummyClassifier() Accuracy : 0.433
DecisionTreeClassifier() Accuracy : 0.600
KNeighborsClassifier() Accuracy : 0.583
SVC() Accuracy : 0.575
RandomForestClassifier() Accuracy : 0.625
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...) Accuracy : 0.617
LogisticRegression() Accuracy : 0.600

pipes.append(Pipeline([
      ("cleaner", normalizer), 
      ("count_vectorizer", CountVectorizer(stop_words='english', max_features=1500, ngram_range=(1, 5))), 
      ("classifier", model())]))

DummyClassifier() Accuracy : 0.433
DecisionTreeClassifier() Accuracy : 0.592
KNeighborsClassifier() Accuracy : 0.575
SVC() Accuracy : 0.583
RandomForestClassifier() Accuracy : 0.633
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...) Accuracy : 0.617
LogisticRegression() Accuracy : 0.600

pipes.append(Pipeline([
      ("cleaner", normalizer), 
      ("count_vectorizer", CountVectorizer(stop_words='english', max_features=1500, ngram_range=(2, 6))), 
      ("classifier", model())]))

DummyClassifier() Accuracy : 0.433
DecisionTreeClassifier() Accuracy : 0.642
KNeighborsClassifier() Accuracy : 0.575
SVC() Accuracy : 0.608
RandomForestClassifier() Accuracy : 0.642
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...) Accuracy : 0.600
LogisticRegression() Accuracy : 0.592