# Jeu de données 1 : Reuters

## titre + corps 

In [6]:
# dans ce notebook, on fait la même chose que précédemment, en ajoutant le titre au corps de l'article.

In [None]:
# 0 indique que ce n'est pas un spam
# 1 indique que c'est un spam

## Lecture du jeu de données et comptes

In [1]:
class ReutersSGMLParser():
    """A helper class for parsing Reuters-21578 XGML file formats"""
    def __init__(self):
        self.bad_char_pattern = re.compile(r"&#\d*;")
        self.document_pattern = re.compile(r"<REUTERS.*?<\/REUTERS>", re.S)
        self.date_pattern = re.compile(r'[0-9]+-[A-Z]{3}-[0-9]{4} *[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]+')

    def empty_row(self):
        """Get an empty rows which can be transformed into a dataframe"""
        rows = {
            'old_id'     : [],
            'new_id'     : [],
            'has_topics' : [],
            'date'       : [],
            'topics'     : [],
            'places'     : [],
            'people'     : [],
            'orgs'       : [],
            'exchanges'  : [],
            'companies'  : [],
            'title'      : [],
            'dateline'   : [],
            'body'       : [],
            'author'     : [],
            'cgi_split'  : [],
            'lewis_split': []
        }
        return rows

    def get_text(self, elem, tagname, d_tag = False):
        """Get the text of a tag or empty string"""
        txt = getattr(elem, tagname, '')
        if txt == '':
            return ''
        if d_tag:
            txt = txt.D
        txt = txt.text.strip()
        return txt

    def get_date(self, elem, tagname):
        """Get the datetime of a tag or empty string"""
        date_str = getattr(elem, tagname, '')
        if date_str == '':
            return ''
        date_str = date_str.text.strip()
        try:
            date_str = self.date_pattern.findall(date_str)[0]
        except IndexError as ie:
            print('Cannot find date patter in: %s' % date_str)
            return ''
        date = datetime.strptime(date_str, '%d-%b-%Y %H:%M:%S.%f')
        return date

    def parse_header(self, rows, doc):
        """parse the header.
        e.g. <REUTERS TOPICS="YES" LEWISSPLIT="TRAIN" CGISPLIT="TRAINING-SET" OLDID="5544" NEWID="1">"""
        items = dict(doc.items())
        rows[   'old_id'  ].append(items.get('OLDID', ''))
        rows[   'new_id'  ].append(items.get('NEWID', ''))
        rows[ 'has_topics'].append(bool(items.get('TOPICS', '')))
        rows[ 'cgi_split' ].append(items.get('CGISPLIT', ''))
        rows['lewis_split'].append(items.get('LEWISSPLIT', ''))

    def parse_string(self, str):
        # remove bad characters
        xml_data = self.bad_char_pattern.sub('', str)
        # find documents
        documents = self.document_pattern.findall(xml_data)
        # parse document's elements
        rows = self.empty_row()
        for doc in documents:
            xml_doc = objectify.fromstring(doc)
            # parse attributes of the header
            self.parse_header(rows, xml_doc)
            # read DATE
            rows[  'date'  ].append(self.get_date(xml_doc, 'DATE'))
            # read TOPICS
            rows[  'topics'  ].append(self.get_text(xml_doc,'TOPICS', True))
            # read PLACES
            rows[  'places'  ].append(self.get_text(xml_doc, 'PLACES', True))
            # read PEOPLE
            rows[ 'people'  ].append(self.get_text(xml_doc, 'PEOPLE', True))
            # read ORGS
            rows[ 'orgs'  ].append(self.get_text(xml_doc, 'ORGS', True))
            # read EXCHANGES
            rows[ 'exchanges'  ].append(self.get_text(xml_doc, 'EXCHANGES', True))
            # read COMPANIES
            rows[ 'companies'  ].append(self.get_text(xml_doc, 'COMPANIES', True))
            # read the TEXT tag
            text = xml_doc.TEXT
            rows[ 'title'  ].append(self.get_text(text, 'TITLE'))
            rows['dateline'].append(self.get_text(text, 'DATELINE'))
            rows[  'body'  ].append(self.get_text(text, 'BODY'))
            rows[  'author'  ].append(self.get_text(text, 'AUTHOR'))
        return rows

    def parse(self, path):
        """parse a file from the Reuters dataset
        """
        # open xml file
        xml_data = ''
        try:
            xml_data = open(path, 'r', encoding="utf-8").read()
        except UnicodeDecodeError as ude:
            print('Failed to read %s as utf-8' % path)
            lines = []
            for line in open(path, 'rb').readlines():
                line = line.decode('utf-8','ignore') #.encode("utf-8")
                lines.append(line)
            xml_data = '\n'.join(lines)
        return self.parse_string(xml_data)

In [3]:
import pandas as pd
import re
from lxml import etree
from lxml import objectify
from datetime import datetime

Liste = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

parser = ReutersSGMLParser()
data = parser.empty_row()
for j in Liste :
    for path in  ['Reuters/reuters21578/reut2-0%i.sgm'%j]:
    # parse current document
        rows = parser.parse(path)
    # append rows into dataset
        for key in data.keys():
            data[key] = data[key] + rows[key]

df = pd.DataFrame(data, columns=data.keys())
#df = df.astype(dtype= {"date":"datetime64[]"})
df.head()

#il y a 20578 articles

lieu = []
for i in df.places:
    lieu.append(i)
#print(lieu)
#print(type(lieu))

Failed to read Reuters/reuters21578/reut2-017.sgm as utf-8
Cannot find date patter in: 31-MAR-1987 605:12:19.12


In [10]:
texte = []
for i in df.title:
    for j in df.body:
        tout = i+j
    texte.append(tout)
print(len(texte))
print(type(texte))

21578
<class 'list'>


In [11]:
for j in texte :
    print(j)
    break

BAHIA COCOA REVIEWThe American Stock Exchange said it has
introduced options with expirations of up to three years on the
Institutional Index.
    With the ticker symbol <XII>, the index is a guage of the
core equity holdings of the nation's largest institutions, the
exchange explained.
    The new listings represent the first long-term options to
be traded by the Amex, it added.
    It said the long-term Institutional Index options began
trading Monday with expirations of December 1988 <XIV> and
December 1989 <XIX>.
   
    The Amex said a third long-term option with an expiration
of December 1990 will begin trading following the December 1987
expiration.
    It said strike prices on the long-term options have been
set at 50 point intervals with initial strikes of 250, 300 and
350. To avoid conflicting strike price codes, the 350 stike
prices will carry the ticker symbols <XVV> for the option
expiring in December 1988 and <XVX> for the option expiring in
December 1989.
 Reuter


In [12]:
import nltk
from nltk.tokenize import word_tokenize

In [14]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/alexj/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
import nltk
from nltk.tokenize import word_tokenize
# tokenizer comprenant les mots avec une apostrophe (l', qu') et les ponctuations séparément
# une telle tokenization nous permet d'avoir une bonne idée du nombre de token
nb_instances= 0
for i in texte:
    l=len(word_tokenize(i))
    nb_instances+=l
print("Il y a", nb_instances, " tokens en incluant la ponctuation.") # nombre de tokens avec ponctuation

import re
nb_instances2= 0
for i in texte:
    j = re.sub(",|;|\.", " ", i)
    l2=len(word_tokenize(j))
    nb_instances2 += l2
print("Il y a", nb_instances2, " tokens sans inclure la ponctuation.") # nombre de tokens sans ponctuation

from nltk.tokenize import sent_tokenize
nb_instances3= 0
for i in texte:
    longueur_text=len(sent_tokenize(i))
    nb_instances3 += longueur_text
print("Il y a", nb_instances3, "phrases dans l'ensemble du corpus.")

Il y a 4016941  tokens en incluant la ponctuation.
Il y a 3761395  tokens sans inclure la ponctuation.
Il y a 173110 phrases dans l'ensemble du corpus.


## Vectorisation du jeu de données

In [16]:
## Récupérer les instances (X) et les classes (y) et vectoriser
y = lieu

from sklearn.feature_extraction.text import CountVectorizer
V = CountVectorizer(ngram_range = (1,2) )
X = V.fit_transform(texte)

## séparer train test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [17]:
test = 0.3*21578
train = 21578 - test

In [18]:
print("0,3 pour la taille du test soit environ", test, "articles. Donc environ", train, "articles pour le train.")

0,3 pour la taille du test soit environ 6473.4 articles. Donc environ 15104.6 articles pour le train.


## Classifications et Evaluations

### Le perceptron, réseau de neurones simple, classifieur linéaire

In [19]:
from sklearn.linear_model import Perceptron

In [20]:
#classifier

ppn = Perceptron(eta0=0.1, random_state=0)
ppn.fit(X_train, y_train)
y_pred = ppn.predict(X_test)

# On fait la somme de tous les cas où la valeur dans y_test est bien trouvée dans y_pred
print('Bons résultats %d' % (y_test == y_pred).sum())
print('Erreurs: %d' % (y_test != y_pred).sum())

Bons résultats 4285
Erreurs: 2189


**Les données de classification : elles permettent d'évaluer la qualité de la classification. Ici, on calcule ces données avec les résultats donnés par le Perceptron.**

In [21]:
from sklearn.metrics import classification_report

In [22]:

#Classification report permet de msurer l'exactitude d'une clissification selon plusieurs paramètres

report = classification_report(y_test, y_pred)
print(report)

                     precision    recall  f1-score   support

                          0.74      0.64      0.69       811
            algeria       0.00      0.00      0.00         2
             angola       0.00      0.00      0.00         0
          argentina       1.00      0.33      0.50        15
              aruba       0.00      0.00      0.00         1
          australia       0.00      0.00      0.00        60
            austria       0.00      0.00      0.00         5
            bahrain       1.00      0.25      0.40         8
         bangladesh       0.83      0.83      0.83         6
            belgium       0.50      0.04      0.07        56
            bermuda       0.00      0.00      0.00         2
            bolivia       0.00      0.00      0.00         5
           botswana       0.00      0.00      0.00         1
             brazil       0.62      0.71      0.66        52
              burma       0.00      0.00      0.00         1
           cameroon    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# la précision est la proportion des items pertinents parmi l'ensemble des items proposés 

In [13]:
# le rappel est la proportion des items pertinents proposés parmi l'ensemble des items pertinents. 

In [14]:
# Vrai négatif : absent, absent
# Vrai positif : présent, présent
# Faux négatif : présent(dans la référence), absent(dans l'hypothèse)
# Faux positif : absent, présent

In [15]:
# support : nombre d'instances concernées

# micro f-mesure : moyenne des F-mesure pondérée (une classe compte en fonction de sa taille)

# macro f-mesure : moyenne des F-mesure de chaque classe (indépendamment de leur taille)

In [23]:
from sklearn.metrics import confusion_matrix

In [24]:
#La matrice de confusion

matrice_confusion = confusion_matrix(y_test, y_pred)
print(matrice_confusion)

[[521   0  89 ...   0   0   0]
 [  0   0   2 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   6 ...   0   0   0]
 [  1   0   1 ...   0   3   0]
 [  0   0   0 ...   0   0   5]]


### Deuxième évaluation : Un arbre de décision

In [25]:
from sklearn import tree

In [26]:
DT = tree.DecisionTreeClassifier()
DT = DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)

In [27]:
# encore une matrice de confusion
matrice_confusion = confusion_matrix(y_test, y_pred)
print(matrice_confusion)

print('Bons résultats %d' % (y_test == y_pred).sum())
print('Erreurs: %d' % (y_test != y_pred).sum())

[[514   0   0 ...   2   0   0]
 [  0   0   0 ...   0   0   0]
 [  3   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  1   0   0 ...   0   2   0]
 [  0   0   0 ...   0   0   1]]
Bons résultats 4836
Erreurs: 1638


In [28]:
from sklearn.metrics import classification_report

In [29]:
#Classification report permet de msurer l'exactitude d'une clissification selon plusieurs paramètres

report = classification_report(y_test, y_pred)
print(report)

                     precision    recall  f1-score   support

                          0.69      0.63      0.66       811
            algeria       0.00      0.00      0.00         2
          argentina       0.88      0.47      0.61        15
              aruba       0.00      0.00      0.00         1
          australia       0.69      0.57      0.62        60
            austria       0.00      0.00      0.00         5
            bahrain       0.56      0.62      0.59         8
         bangladesh       0.40      0.33      0.36         6
            belgium       0.52      0.59      0.55        56
            bermuda       0.00      0.00      0.00         2
            bolivia       0.14      0.20      0.17         5
           botswana       0.00      0.00      0.00         1
             brazil       0.72      0.65      0.69        52
              burma       0.00      0.00      0.00         1
           cameroon       0.00      0.00      0.00         1
             canada    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [57]:
# résultats moins bons que précédemment.

## Approfondissement

**Regardons l'impact du paramètres random_state**

In [30]:
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

In [31]:
print("Avec la valeur par défaut de random state")
for i in range(3):
  DT = tree.DecisionTreeClassifier()
  DT = DT.fit(X_train, y_train)
  y_pred = DT.predict(X_test)
  matrice_confusion = confusion_matrix(y_test, y_pred)
  print(matrice_confusion)
  stats = precision_recall_fscore_support(y_test, y_pred)
  print(stats)
  print('Bons résultats %d' % (y_test == y_pred).sum())
  print('Erreurs: %d' % (y_test != y_pred).sum())
  print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
  print("--"*10)

Avec la valeur par défaut de random state
[[502   1   0 ...   0   0   0]
 [  0   0   0 ...   0   2   0]
 [  3   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  1   0   0 ...   0   2   0]
 [  0   0   0 ...   0   0   1]]
(array([0.6962552 , 0.        , 0.875     , 0.        , 0.61111111,
       0.        , 0.375     , 0.42857143, 0.51724138, 0.        ,
       0.14285714, 0.        , 0.66666667, 0.        , 0.        ,
       0.57327586, 0.        , 0.        , 0.75675676, 0.66666667,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.76470588, 0.        , 0.        , 0.66666667,
       0.62365591, 0.        , 0.11111111, 0.        , 0.        ,
       0.5       , 0.5       , 0.8       , 0.9375    , 0.5       ,
       0.        , 0.        , 0.82608696, 0.        , 0.69369369,
       0.        , 0.        , 0.5       , 0.        , 0.4       ,
       0.        , 0.        , 0.72727273, 0.        , 0.25      ,
       0.        , 0.6111111

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[507   1   0 ...   0   0   0]
 [  0   0   0 ...   0   2   0]
 [  3   0   7 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  1   0   0 ...   0   2   0]
 [  0   0   0 ...   0   0   1]]
(array([0.70416667, 0.        , 0.77777778, 0.        , 0.64      ,
       0.        , 0.42857143, 0.66666667, 0.53448276, 0.        ,
       0.16666667, 0.        , 0.72      , 0.        , 0.        ,
       0.5990991 , 0.        , 0.        , 0.77777778, 0.75      ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.72222222, 0.        , 0.        , 0.66666667,
       0.64210526, 0.        , 0.125     , 0.        , 0.        ,
       0.33333333, 1.        , 0.72727273, 0.82352941, 0.42857143,
       0.        , 0.        , 0.        , 0.76      , 0.        ,
       0.70319635, 0.        , 0.        , 0.66666667, 0.        ,
       0.5       , 0.        , 0.        , 0.58333333, 0.        ,
       0.5       , 0.        , 0.        , 0.66666667, 0.47826087,
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[510   0   0 ...   0   0   0]
 [  0   0   0 ...   0   2   0]
 [  3   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  1   0   0 ...   0   2   0]
 [  0   0   0 ...   0   0   1]]
(array([0.68181818, 0.        , 0.875     , 0.        , 0.69565217,
       0.        , 0.2       , 0.4       , 0.54237288, 0.        ,
       0.1       , 0.        , 0.75510204, 0.        , 0.        ,
       0.60775862, 0.        , 0.        , 0.72222222, 0.6       ,
       0.        , 1.        , 0.        , 0.5       , 0.        ,
       0.        , 0.72222222, 0.        , 0.        , 0.57142857,
       0.64835165, 0.        , 0.5       , 0.        , 0.        ,
       0.44444444, 0.5       , 0.72727273, 0.9375    , 0.42857143,
       0.        , 0.        , 0.        , 0.82608696, 0.        ,
       0.68695652, 0.        , 0.        , 0.5       , 0.        ,
       0.5       , 0.        , 0.        , 0.61538462, 0.        ,
       0.25      , 0.        , 0.6875    , 0.6       , 0.        ,
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
print("En fixant random state")
for i in range(3):
  DT = tree.DecisionTreeClassifier(random_state=0)
  DT = DT.fit(X_train, y_train)
  y_pred = DT.predict(X_test)
  matrice_confusion = confusion_matrix(y_test, y_pred)
  print(matrice_confusion)
  stats = precision_recall_fscore_support(y_test, y_pred)
  print(stats)
  print('Bons résultats %d' % (y_test == y_pred).sum())
  print('Erreurs: %d' % (y_test != y_pred).sum())
  print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))


En fixant random state
[[508   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  3   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  1   0   0 ...   0   2   0]
 [  0   0   0 ...   0   0   1]]
(array([0.70653686, 0.        , 0.875     , 0.        , 0.60344828,
       0.        , 0.38461538, 0.4       , 0.52631579, 0.        ,
       0.16666667, 0.        , 0.825     , 0.        , 0.        ,
       0.60619469, 0.        , 0.        , 0.70731707, 0.44444444,
       0.        , 1.        , 0.        , 0.5       , 0.        ,
       0.        , 0.72222222, 0.        , 0.        , 0.8       ,
       0.61702128, 0.        , 0.33333333, 0.        , 0.        ,
       0.45833333, 1.        , 0.66666667, 0.88235294, 0.5       ,
       0.        , 0.        , 0.        , 0.86956522, 0.        ,
       0.69642857, 0.        , 0.5       , 0.66666667, 0.        ,
       0.25      , 0.        , 0.        , 0.8       , 0.        ,
       0.33333333, 0.        , 0.73333333, 0.42

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[508   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  3   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  1   0   0 ...   0   2   0]
 [  0   0   0 ...   0   0   1]]
(array([0.70653686, 0.        , 0.875     , 0.        , 0.60344828,
       0.        , 0.38461538, 0.4       , 0.52631579, 0.        ,
       0.16666667, 0.        , 0.825     , 0.        , 0.        ,
       0.60619469, 0.        , 0.        , 0.70731707, 0.44444444,
       0.        , 1.        , 0.        , 0.5       , 0.        ,
       0.        , 0.72222222, 0.        , 0.        , 0.8       ,
       0.61702128, 0.        , 0.33333333, 0.        , 0.        ,
       0.45833333, 1.        , 0.66666667, 0.88235294, 0.5       ,
       0.        , 0.        , 0.        , 0.86956522, 0.        ,
       0.69642857, 0.        , 0.5       , 0.66666667, 0.        ,
       0.25      , 0.        , 0.        , 0.8       , 0.        ,
       0.33333333, 0.        , 0.73333333, 0.42857143, 0.        ,
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[508   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  3   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  1   0   0 ...   0   2   0]
 [  0   0   0 ...   0   0   1]]
(array([0.70653686, 0.        , 0.875     , 0.        , 0.60344828,
       0.        , 0.38461538, 0.4       , 0.52631579, 0.        ,
       0.16666667, 0.        , 0.825     , 0.        , 0.        ,
       0.60619469, 0.        , 0.        , 0.70731707, 0.44444444,
       0.        , 1.        , 0.        , 0.5       , 0.        ,
       0.        , 0.72222222, 0.        , 0.        , 0.8       ,
       0.61702128, 0.        , 0.33333333, 0.        , 0.        ,
       0.45833333, 1.        , 0.66666667, 0.88235294, 0.5       ,
       0.        , 0.        , 0.        , 0.86956522, 0.        ,
       0.69642857, 0.        , 0.5       , 0.66666667, 0.        ,
       0.25      , 0.        , 0.        , 0.8       , 0.        ,
       0.33333333, 0.        , 0.73333333, 0.42857143, 0.        ,
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Différents paramètres que l'on peut faire varier : max_depth, min_samples_split, min_samples_leaf et max_features**

In [33]:
print("On teste max_depth")
for i in range(1, 20):
  DT = tree.DecisionTreeClassifier(max_depth=i)
  DT = DT.fit(X_train, y_train)
  y_pred = DT.predict(X_test)
  matrice_confusion = confusion_matrix(y_test, y_pred)
  print("Avec max_depth=", i)
  print(matrice_confusion)
  print('Bons résultats %d' % (y_test == y_pred).sum())
  print('Erreurs: %d' % (y_test != y_pred).sum())
  print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

On teste max_depth
Avec max_depth= 1
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Bons résultats 3690
Erreurs: 2784
Accuracy: 0.57
Avec max_depth= 2
[[202   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 3750
Erreurs: 2724
Accuracy: 0.58
Avec max_depth= 3
[[204   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 3825
Erreurs: 2649
Accuracy: 0.59
Avec max_depth= 4
[[270   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 3911
Erreurs: 2563
Accuracy: 0.60
Avec max_depth= 5

KeyboardInterrupt: 

Meilleur résultat obtenu :
Avec max_depth= 96
[[1265   49]
 [  30  375]]
(array([0.97683398, 0.88443396]), array([0.96270928, 0.92592593]), array([0.9697202 , 0.90470446]), array([1314,  405]))
Bons résultats 1640
Erreurs: 79

In [34]:
print("On teste min_samples_split:")
for i in range(1, 20):
  DT = tree.DecisionTreeClassifier(max_depth=i)
  DT = DT.fit(X_train, y_train)
  y_pred = DT.predict(X_test)
  matrice_confusion = confusion_matrix(y_test, y_pred)
  print("Avec max_depth=", i)
  print(matrice_confusion)
  print('Bons résultats %d' % (y_test == y_pred).sum())
  print('Erreurs: %d' % (y_test != y_pred).sum())
  print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

On teste min_samples_split:
Avec max_depth= 1
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Bons résultats 3690
Erreurs: 2784
Accuracy: 0.57
Avec max_depth= 2
[[202   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 3750
Erreurs: 2724
Accuracy: 0.58
Avec max_depth= 3
[[204   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 3825
Erreurs: 2649
Accuracy: 0.59
Avec max_depth= 4
[[270   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 3910
Erreurs: 2564
Accuracy: 0.60
Avec max

In [35]:
print("On teste min_samples_leaf:")
for i in range(1, 20):
  DT = tree.DecisionTreeClassifier(max_depth=i)
  DT = DT.fit(X_train, y_train)
  y_pred = DT.predict(X_test)
  matrice_confusion = confusion_matrix(y_test, y_pred)
  print("Avec max_depth=", i)
  print(matrice_confusion)
  print('Bons résultats %d' % (y_test == y_pred).sum())
  print('Erreurs: %d' % (y_test != y_pred).sum())
  print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

On teste min_samples_leaf:
Avec max_depth= 1
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Bons résultats 3690
Erreurs: 2784
Accuracy: 0.57
Avec max_depth= 2
[[202   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 3750
Erreurs: 2724
Accuracy: 0.58
Avec max_depth= 3
[[204   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 3825
Erreurs: 2649
Accuracy: 0.59
Avec max_depth= 4
[[270   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 3911
Erreurs: 2563
Accuracy: 0.60
Avec max_

In [36]:
print("On teste max_features:")
for i in range(1, 20):
  DT = tree.DecisionTreeClassifier(max_depth=i)
  DT = DT.fit(X_train, y_train)
  y_pred = DT.predict(X_test)
  matrice_confusion = confusion_matrix(y_test, y_pred)
  print("Avec max_depth=", i)
  print(matrice_confusion)
  print('Bons résultats %d' % (y_test == y_pred).sum())
  print('Erreurs: %d' % (y_test != y_pred).sum())
  print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

On teste max_features:
Avec max_depth= 1
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Bons résultats 3690
Erreurs: 2784
Accuracy: 0.57
Avec max_depth= 2
[[202   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 3750
Erreurs: 2724
Accuracy: 0.58
Avec max_depth= 3
[[204   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 3825
Erreurs: 2649
Accuracy: 0.59
Avec max_depth= 4
[[270   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 3910
Erreurs: 2564
Accuracy: 0.60
Avec max_dept

In [None]:
# pour ces trois derniers paramètres on ne passe jamais sous la barre des 85 erreurs.

### Autres caracétristiques

In [37]:
import statistics
from nltk.tokenize import sent_tokenize

nb_phrases= 0
for i in texte:
    longueur_text=len(sent_tokenize(i))
    nb_phrases += longueur_text
    
X_stylo = []#notre nouvelle matrice de description
for text in texte:
    mots=word_tokenize(text)
    phrases = sent_tokenize(text)
    NB_mots = len(mots)
    NB_caracteres = len(text)
    for x in mots :
        moyenne_taille_mots = statistics.mean([len(x)])
    moyenne_taille_phrases = NB_mots/nb_phrases
    caracteristiques = [nb_phrases, NB_mots, NB_caracteres, moyenne_taille_mots, moyenne_taille_phrases]
    X_stylo.append(caracteristiques)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_stylo, y, test_size=0.3, random_state=0)
DT = tree.DecisionTreeClassifier()
DT = DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)
matrice_confusion = confusion_matrix(y_test, y_pred)
print(matrice_confusion)
stats = precision_recall_fscore_support(y_test, y_pred)
print(stats)
report = classification_report(y_test, y_pred)
print(report)
print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

[[567   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]]
(array([0.77038043, 0.        , 0.        , 0.        , 0.16666667,
       0.        , 0.        , 0.        , 0.33333333, 0.        ,
       0.        , 0.        , 0.22222222, 0.        , 0.        ,
       0.18181818, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.14427861, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
   

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# taux d'exactitude très mauvais puisqu'on fait moins que le hasard.

**Et maintenant on combine le BOW et le stylométrique**

In [39]:
## on regarde la "forme" de X
print(X.shape[0])#NB lignes   -> instances
print(X.shape[1])#Nb colonnes -> caractéristiques

##on crée une sparse matrix avec notre X_stylo
from scipy.sparse import csr_matrix
sparse_stylo = csr_matrix(X_stylo)
print(sparse_stylo.shape[0])#NB lignes   -> instances
print(sparse_stylo.shape[1])#Nb colonnes -> caractéristiques

## on a le même nombre de ligne, on fait donc une conctaténation horizontale :
from scipy.sparse import hstack
X_fusion = hstack((X, sparse_stylo))

21578
93417
21578
5


**Résultats:**

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_fusion, y, test_size=0.3, random_state=0)
DT = tree.DecisionTreeClassifier()
DT = DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)
matrice_confusion = confusion_matrix(y_test, y_pred)
print(matrice_confusion)
report = classification_report(y_test, y_pred)
print(report)
print('Bons résultats %d' % (y_test == y_pred).sum())
print('Erreurs: %d' % (y_test != y_pred).sum())
print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

[[643   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  3   0   8 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  1   0   0 ...   0   1   0]
 [  0   0   0 ...   0   0   3]]
                     precision    recall  f1-score   support

                          0.79      0.79      0.79       811
            algeria       0.00      0.00      0.00         2
          argentina       0.89      0.53      0.67        15
              aruba       0.00      0.00      0.00         1
          australia       0.42      0.43      0.43        60
            austria       0.00      0.00      0.00         5
            bahrain       0.00      0.00      0.00         8
         bangladesh       0.33      0.17      0.22         6
            belgium       0.43      0.38      0.40        56
            bermuda       0.00      0.00      0.00         2
            bolivia       0.17      0.20      0.18         5
           botswana       0.00      0.00      0.00         1
             b

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#résultats moyens

## Autres classifieurs

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings 
warnings.filterwarnings("ignore")

LR = LogisticRegression()
RDF = RandomForestClassifier()
SVC = SVC()
KN = KNeighborsClassifier()

In [42]:
liste = [LR, RDF, SVC, KN]

for i in liste:
    i = i.fit(X_train, y_train)
    y_pred = i.predict(X_test)
    matrice_confusion = confusion_matrix(y_test, y_pred)
    print(i)
    print(matrice_confusion)
    report = classification_report(y_test, y_pred)
    print(report)
    print('Bons résultats %d' % (y_test == y_pred).sum())
    print('Erreurs: %d' % (y_test != y_pred).sum())
    print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
    print("-"*10)

LogisticRegression()
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
                     precision    recall  f1-score   support

                          0.00      0.00      0.00       811
            algeria       0.00      0.00      0.00         2
          argentina       0.00      0.00      0.00        15
              aruba       0.00      0.00      0.00         1
          australia       0.00      0.00      0.00        60
            austria       0.00      0.00      0.00         5
            bahrain       0.00      0.00      0.00         8
         bangladesh       0.00      0.00      0.00         6
            belgium       0.00      0.00      0.00        56
            bermuda       0.00      0.00      0.00         2
            bolivia       0.00      0.00      0.00         5
           botswana       0.00      0.00      0.00         1
             brazil       0.00      0.00      0.00        52
    

SVC()
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
                     precision    recall  f1-score   support

                          0.00      0.00      0.00       811
            algeria       0.00      0.00      0.00         2
          argentina       0.00      0.00      0.00        15
              aruba       0.00      0.00      0.00         1
          australia       0.00      0.00      0.00        60
            austria       0.00      0.00      0.00         5
            bahrain       0.00      0.00      0.00         8
         bangladesh       0.00      0.00      0.00         6
            belgium       0.00      0.00      0.00        56
            bermuda       0.00      0.00      0.00         2
            bolivia       0.00      0.00      0.00         5
           botswana       0.00      0.00      0.00         1
             brazil       0.00      0.00      0.00        52
              burma

### En faisant varier les n-grammes

In [43]:
for min_N in range(1, 2):
  for max_N in range(min_N, 5):
    V = CountVectorizer(ngram_range = (min_N, max_N), analyzer = "char")
    X = V.fit_transform(texte)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
    print(f"Ngram_range : ({min_N}, {max_N})")
    for i in liste:
        clf = i.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print('%s classifier : %.4f'%(i, score))

Ngram_range : (1, 1)
LogisticRegression() classifier : 0.5795
RandomForestClassifier() classifier : 0.6965
SVC() classifier : 0.5570
KNeighborsClassifier() classifier : 0.6353
Ngram_range : (1, 2)
LogisticRegression() classifier : 0.5876
RandomForestClassifier() classifier : 0.7413
SVC() classifier : 0.5570
KNeighborsClassifier() classifier : 0.6915
Ngram_range : (1, 3)
LogisticRegression() classifier : 0.5928
RandomForestClassifier() classifier : 0.7705
SVC() classifier : 0.5570
KNeighborsClassifier() classifier : 0.7053
Ngram_range : (1, 4)
LogisticRegression() classifier : 0.5930
RandomForestClassifier() classifier : 0.7734
SVC() classifier : 0.5570
KNeighborsClassifier() classifier : 0.7005


### Forêt aléatoire : zoom sur quelques paramètres

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
    # On fait la somme de tous les cas où la valeur dans y_test est bien trouvée dans y_pred
print('Bons résultats %d' % (y_test == y_pred).sum())
print('Erreurs: %d' % (y_test != y_pred).sum())
print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

Bons résultats 5028
Erreurs: 1446
Accuracy: 0.78


In [45]:
# ci-dessus : paramètres par défaut

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

for i in range(1,20):
    clf = RandomForestClassifier(max_depth=i, random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # On fait la somme de tous les cas où la valeur dans y_test est bien trouvée dans y_pred
    print("Avec max_depth =", i)
    print('Bons résultats %d' % (y_test == y_pred).sum())
    print('Erreurs: %d' % (y_test != y_pred).sum())
    print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

Avec max_depth = 1
Bons résultats 3606
Erreurs: 2868
Accuracy: 0.56
Avec max_depth = 2
Bons résultats 3606
Erreurs: 2868
Accuracy: 0.56
Avec max_depth = 3
Bons résultats 3606
Erreurs: 2868
Accuracy: 0.56
Avec max_depth = 4
Bons résultats 3606
Erreurs: 2868
Accuracy: 0.56
Avec max_depth = 5
Bons résultats 3606
Erreurs: 2868
Accuracy: 0.56
Avec max_depth = 6
Bons résultats 3632
Erreurs: 2842
Accuracy: 0.56
Avec max_depth = 7
Bons résultats 3659
Erreurs: 2815
Accuracy: 0.57
Avec max_depth = 8
Bons résultats 3679
Erreurs: 2795
Accuracy: 0.57
Avec max_depth = 9
Bons résultats 3703
Erreurs: 2771
Accuracy: 0.57
Avec max_depth = 10
Bons résultats 3723
Erreurs: 2751
Accuracy: 0.58
Avec max_depth = 11
Bons résultats 3767
Erreurs: 2707
Accuracy: 0.58
Avec max_depth = 12
Bons résultats 3796
Erreurs: 2678
Accuracy: 0.59
Avec max_depth = 13
Bons résultats 3813
Erreurs: 2661
Accuracy: 0.59
Avec max_depth = 14
Bons résultats 4003
Erreurs: 2471
Accuracy: 0.62
Avec max_depth = 15
Bons résultats 4025
Err

### En supprimant la ponctuation et les stopwords

In [None]:
# rappels pour la cellule ci-après (ces éléments de codes ont été exécutés plus avant)
# LR = LogisticRegression()
# RDF = RandomForestClassifier()
# SVC = SVC()
# KN = KNeighborsClassifier()
# Liste = [LR, RDF, SVC, KN]

In [47]:
from nltk.corpus import stopwords

In [48]:
import re

In [49]:
def remove_ponctuation(chaine):
    ponctuations = [",", "'", '"', "-", "\."]
    for stopword in ponctuations:
        chaine = re.sub(f" {stopword} ", " ", chaine)
    return chaine

In [50]:
def remove_stopwords(chaine):
    final_stopwords_list = stopwords.words('english')
    s = chaine
    for stopword in final_stopwords_list:
        chaine = re.sub(f" {stopword} ", " ", chaine)
    return chaine

In [None]:
for pretraitement in ["stopwords", "ponctuation"]:
    if pretraitement =="stopwords":
        liste_pretraite = [remove_stopwords(j) for j in texte]
        X = V.fit_transform(liste_pretraite)
    elif pretraitement =="ponctuation":
        liste_titres_pretraite = [remove_ponctuation(j) for j in texte]
        X = V.fit_transform(liste_pretraite)    
    else:
        X = V.fit_transform(texte)
    for split_size in [0.3]: #[0.1, 0.2, 0.3, 0.9]:
        print(f"Split_size : {split_size}, Pretraitement: {pretraitement}")
    #découpage train VS test
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split_size, random_state=0)
    ##classification
        for i in liste:
            i = i.fit(X_train, y_train)
            y_pred = i.predict(X_test)
            matrice_confusion = confusion_matrix(y_test, y_pred)
            print(i)
            print(matrice_confusion)
            stats = precision_recall_fscore_support(y_test, y_pred)
            print(stats)
            report = classification_report(y_test, y_pred)
            print(report)
            print('Bons résultats %d' % (y_test == y_pred).sum())
            print('Erreurs: %d' % (y_test != y_pred).sum())
            print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
            print("-"*10)

Split_size : 0.3, Pretraitement: stopwords
LogisticRegression()
[[415   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  3   0   0 ...   0   0   0]
 ...
 [  3   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
(array([0.34354305, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.34920635, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.5       , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.5270936 , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
      

RandomForestClassifier()
[[532   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   7 ...   0   0   0]
 ...
 [  0   0   0 ...   5   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
(array([0.86363636, 0.        , 1.        , 0.        , 0.7804878 ,
       0.        , 1.        , 0.75      , 0.9       , 0.        ,
       1.        , 0.        , 0.8125    , 0.        , 0.        ,
       0.74637681, 0.        , 0.        , 0.96428571, 1.        ,
       0.        , 1.        , 0.        , 1.        , 0.        ,
       0.        , 0.8       , 1.        , 0.        , 1.        ,
       0.77777778, 0.        , 1.        , 0.        , 0.        ,
       1.        , 1.        , 1.        , 0.86956522, 0.75      ,
       0.        , 1.        , 0.        , 0.76683938, 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.78571429, 0.        , 1.        , 0.        ,
       0.93939394, 1.        , 0.        , 0.

SVC()
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
  

KNeighborsClassifier()
[[528   0   1 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  1   0  11 ...   0   0   0]
 ...
 [  0   0   0 ...   3   0   0]
 [  2   0   0 ...   0   1   0]
 [  5   0   0 ...   0   0   0]]
(array([0.65346535, 0.        , 0.34375   , 0.        , 0.39393939,
       0.        , 0.125     , 0.09090909, 0.23880597, 0.        ,
       0.125     , 0.        , 0.41935484, 0.        , 0.        ,
       0.36498516, 0.5       , 0.        , 0.36111111, 0.83333333,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.5       , 0.        , 0.        , 0.25      ,
       0.43956044, 0.        , 0.        , 0.        , 0.        ,
       0.33333333, 0.        , 0.4       , 0.58333333, 0.85714286,
       0.        , 0.28571429, 0.25      , 0.        , 0.54787234,
       0.        , 0.        , 0.        , 0.        , 0.25      ,
       1.        , 0.        , 0.44444444, 0.        , 0.        ,
       0.        , 0.56      , 0.81818182, 0.  

Split_size : 0.3, Pretraitement: ponctuation
LogisticRegression()
[[415   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  3   0   0 ...   0   0   0]
 ...
 [  3   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
(array([0.34354305, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.34920635, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.5       , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.5270936 , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
    

RandomForestClassifier()
[[554   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   6 ...   0   0   0]
 ...
 [  0   0   0 ...   7   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
(array([0.86292835, 0.        , 1.        , 0.        , 0.7804878 ,
       0.        , 1.        , 0.8       , 0.76190476, 0.        ,
       1.        , 0.        , 0.82222222, 0.        , 0.        ,
       0.74264706, 0.        , 0.        , 0.90322581, 1.        ,
       0.        , 1.        , 0.        , 0.66666667, 0.        ,
       0.        , 0.6875    , 1.        , 0.        , 1.        ,
       0.74603175, 0.        , 1.        , 0.        , 0.        ,
       1.        , 1.        , 1.        , 0.9047619 , 1.        ,
       0.        , 1.        , 0.        , 0.76842105, 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.83333333, 0.        , 1.        , 0.        ,
       0.93333333, 0.95238095, 0.        , 0.