# Jeu de données 1 : Reuters

In [None]:
# 0 indique que ce n'est pas un spam
# 1 indique que c'est un spam

## Lecture du jeu de données et comptes

In [1]:
class ReutersSGMLParser():
    """A helper class for parsing Reuters-21578 XGML file formats"""
    def __init__(self):
        self.bad_char_pattern = re.compile(r"&#\d*;")
        self.document_pattern = re.compile(r"<REUTERS.*?<\/REUTERS>", re.S)
        self.date_pattern = re.compile(r'[0-9]+-[A-Z]{3}-[0-9]{4} *[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]+')

    def empty_row(self):
        """Get an empty rows which can be transformed into a dataframe"""
        rows = {
            'old_id'     : [],
            'new_id'     : [],
            'has_topics' : [],
            'date'       : [],
            'topics'     : [],
            'places'     : [],
            'people'     : [],
            'orgs'       : [],
            'exchanges'  : [],
            'companies'  : [],
            'title'      : [],
            'dateline'   : [],
            'body'       : [],
            'author'     : [],
            'cgi_split'  : [],
            'lewis_split': []
        }
        return rows

    def get_text(self, elem, tagname, d_tag = False):
        """Get the text of a tag or empty string"""
        txt = getattr(elem, tagname, '')
        if txt == '':
            return ''
        if d_tag:
            txt = txt.D
        txt = txt.text.strip()
        return txt

    def get_date(self, elem, tagname):
        """Get the datetime of a tag or empty string"""
        date_str = getattr(elem, tagname, '')
        if date_str == '':
            return ''
        date_str = date_str.text.strip()
        try:
            date_str = self.date_pattern.findall(date_str)[0]
        except IndexError as ie:
            print('Cannot find date patter in: %s' % date_str)
            return ''
        date = datetime.strptime(date_str, '%d-%b-%Y %H:%M:%S.%f')
        return date

    def parse_header(self, rows, doc):
        """parse the header.
        e.g. <REUTERS TOPICS="YES" LEWISSPLIT="TRAIN" CGISPLIT="TRAINING-SET" OLDID="5544" NEWID="1">"""
        items = dict(doc.items())
        rows[   'old_id'  ].append(items.get('OLDID', ''))
        rows[   'new_id'  ].append(items.get('NEWID', ''))
        rows[ 'has_topics'].append(bool(items.get('TOPICS', '')))
        rows[ 'cgi_split' ].append(items.get('CGISPLIT', ''))
        rows['lewis_split'].append(items.get('LEWISSPLIT', ''))

    def parse_string(self, str):
        # remove bad characters
        xml_data = self.bad_char_pattern.sub('', str)
        # find documents
        documents = self.document_pattern.findall(xml_data)
        # parse document's elements
        rows = self.empty_row()
        for doc in documents:
            xml_doc = objectify.fromstring(doc)
            # parse attributes of the header
            self.parse_header(rows, xml_doc)
            # read DATE
            rows[  'date'  ].append(self.get_date(xml_doc, 'DATE'))
            # read TOPICS
            rows[  'topics'  ].append(self.get_text(xml_doc,'TOPICS', True))
            # read PLACES
            rows[  'places'  ].append(self.get_text(xml_doc, 'PLACES', True))
            # read PEOPLE
            rows[ 'people'  ].append(self.get_text(xml_doc, 'PEOPLE', True))
            # read ORGS
            rows[ 'orgs'  ].append(self.get_text(xml_doc, 'ORGS', True))
            # read EXCHANGES
            rows[ 'exchanges'  ].append(self.get_text(xml_doc, 'EXCHANGES', True))
            # read COMPANIES
            rows[ 'companies'  ].append(self.get_text(xml_doc, 'COMPANIES', True))
            # read the TEXT tag
            text = xml_doc.TEXT
            rows[ 'title'  ].append(self.get_text(text, 'TITLE'))
            rows['dateline'].append(self.get_text(text, 'DATELINE'))
            rows[  'body'  ].append(self.get_text(text, 'BODY'))
            rows[  'author'  ].append(self.get_text(text, 'AUTHOR'))
        return rows

    def parse(self, path):
        """parse a file from the Reuters dataset
        """
        # open xml file
        xml_data = ''
        try:
            xml_data = open(path, 'r', encoding="utf-8").read()
        except UnicodeDecodeError as ude:
            print('Failed to read %s as utf-8' % path)
            lines = []
            for line in open(path, 'rb').readlines():
                line = line.decode('utf-8','ignore') #.encode("utf-8")
                lines.append(line)
            xml_data = '\n'.join(lines)
        return self.parse_string(xml_data)

In [2]:
import pandas as pd
import re
from lxml import etree
from lxml import objectify
from datetime import datetime

Liste = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

parser = ReutersSGMLParser()
data = parser.empty_row()
for j in Liste :
    for path in  ['Reuters/reuters21578/reut2-0%i.sgm'%j]:
    # parse current document
        rows = parser.parse(path)
    # append rows into dataset
        for key in data.keys():
            data[key] = data[key] + rows[key]

df = pd.DataFrame(data, columns=data.keys())
#df = df.astype(dtype= {"date":"datetime64[]"})
df.head()

#il y a 20578 articles

lieu = []
for i in df.places:
    lieu.append(i)
#print(lieu)
#print(type(lieu))

texte = []
for j in df.body:
    texte.append(j)
print(len(texte))
print(type(texte))

Failed to read Reuters/reuters21578/reut2-017.sgm as utf-8
Cannot find date patter in: 31-MAR-1987 605:12:19.12
21578
<class 'list'>


In [3]:
import nltk
from nltk.tokenize import word_tokenize

In [4]:
import nltk
from nltk.tokenize import word_tokenize
# tokenizer comprenant les mots avec une apostrophe (l', qu') et les ponctuations séparément
# une telle tokenization nous permet d'avoir une bonne idée du nombre de token
nb_instances= 0
for i in texte:
    l=len(word_tokenize(i))
    nb_instances+=l
print("Il y a", nb_instances, " tokens en incluant la ponctuation.") # nombre de tokens avec ponctuation

import re
nb_instances2= 0
for i in texte:
    j = re.sub(",|;|\.", " ", i)
    l2=len(word_tokenize(j))
    nb_instances2 += l2
print("Il y a", nb_instances2, " tokens sans inclure la ponctuation.") # nombre de tokens sans ponctuation

from nltk.tokenize import sent_tokenize
nb_instances3= 0
for i in texte:
    longueur_text=len(sent_tokenize(i))
    nb_instances3 += longueur_text
print("Il y a", nb_instances3, "phrases dans l'ensemble du corpus.")

Il y a 2852163  tokens en incluant la ponctuation.
Il y a 2725273  tokens sans inclure la ponctuation.
Il y a 123448 phrases dans l'ensemble du corpus.


## Vectorisation du jeu de données

In [4]:
## Récupérer les instances (X) et les classes (y) et vectoriser
y = lieu

from sklearn.feature_extraction.text import CountVectorizer
V = CountVectorizer(ngram_range = (1,2) )
X = V.fit_transform(texte)

## séparer train test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [59]:
test = 0.3*21578
train = 21578 - test

In [60]:
print("0,3 pour la taille du test soit environ", test, "articles. Donc environ", train, "articles pour le train.")

0,3 pour la taille du test soit environ 6473.4 articles. Donc environ 15104.6 articles pour le train.


## Classifications et Evaluations

### Le perceptron, réseau de neurones simple, classifieur linéaire

In [5]:
from sklearn.linear_model import Perceptron

In [8]:
#classifier

ppn = Perceptron(eta0=0.1, random_state=0)
ppn.fit(X_train, y_train)
y_pred = ppn.predict(X_test)

# On fait la somme de tous les cas où la valeur dans y_test est bien trouvée dans y_pred
print('Bons résultats %d' % (y_test == y_pred).sum())
print('Erreurs: %d' % (y_test != y_pred).sum())

Bons résultats 5350
Erreurs: 1124


**Les données de classification : elles permettent d'évaluer la qualité de la classification. Ici, on calcule ces données avec les résultats donnés par le Perceptron.**

In [6]:
from sklearn.metrics import classification_report

In [10]:

#Classification report permet de msurer l'exactitude d'une clissification selon plusieurs paramètres

report = classification_report(y_test, y_pred)
print(report)

                     precision    recall  f1-score   support

                          0.83      0.89      0.86       811
            algeria       0.00      0.00      0.00         2
             angola       0.00      0.00      0.00         0
          argentina       0.92      0.73      0.81        15
              aruba       0.00      0.00      0.00         1
          australia       0.83      0.75      0.79        60
            austria       0.60      0.60      0.60         5
            bahrain       0.57      0.50      0.53         8
         bangladesh       0.00      0.00      0.00         6
           barbados       0.00      0.00      0.00         0
            belgium       0.69      0.48      0.57        56
            bermuda       0.00      0.00      0.00         2
            bolivia       1.00      0.60      0.75         5
           botswana       0.00      0.00      0.00         1
             brazil       0.84      0.62      0.71        52
              burma    

In [12]:
# la précision est la proportion des items pertinents parmi l'ensemble des items proposés 

In [13]:
# le rappel est la proportion des items pertinents proposés parmi l'ensemble des items pertinents. 

In [14]:
# Vrai négatif : absent, absent
# Vrai positif : présent, présent
# Faux négatif : présent(dans la référence), absent(dans l'hypothèse)
# Faux positif : absent, présent

In [15]:
# support : nombre d'instances concernées

# micro f-mesure : moyenne des F-mesure pondérée (une classe compte en fonction de sa taille)

# macro f-mesure : moyenne des F-mesure de chaque classe (indépendamment de leur taille)

In [7]:
from sklearn.metrics import confusion_matrix

In [9]:
#La matrice de confusion

matrice_confusion = confusion_matrix(y_test, y_pred)
print(matrice_confusion)

[[722   0   0 ...   1   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   4   0]
 [  0   0   0 ...   0   0   2]]


### Deuxième évaluation : Un arbre de décision

In [8]:
from sklearn import tree

In [10]:
DT = tree.DecisionTreeClassifier()
DT = DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)

In [14]:
# encore une matrice de confusion
matrice_confusion = confusion_matrix(y_test, y_pred)
print(matrice_confusion)

print('Bons résultats %d' % (y_test == y_pred).sum())
print('Erreurs: %d' % (y_test != y_pred).sum())

[[697   0   1 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   6 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  0   0   0 ...   0   2   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4921
Erreurs: 1553


In [9]:
from sklearn.metrics import classification_report

In [66]:
#Classification report permet de msurer l'exactitude d'une clissification selon plusieurs paramètres

report = classification_report(y_test, y_pred)
print(report)

                     precision    recall  f1-score   support

                          0.77      0.86      0.81       811
            algeria       0.00      0.00      0.00         2
          argentina       0.50      0.40      0.44        15
              aruba       0.00      0.00      0.00         1
          australia       0.53      0.55      0.54        60
            austria       0.00      0.00      0.00         5
            bahrain       0.00      0.00      0.00         8
         bangladesh       0.57      0.67      0.62         6
            belgium       0.43      0.34      0.38        56
            bermuda       0.00      0.00      0.00         2
            bolivia       1.00      0.20      0.33         5
           botswana       0.00      0.00      0.00         1
             brazil       0.50      0.60      0.54        52
              burma       0.00      0.00      0.00         1
           cameroon       0.00      0.00      0.00         1
             canada    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [57]:
# résultats moins bons que précédemment.

## Approfondissement

**Regardons l'impact du paramètres random_state**

In [10]:
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

In [68]:
print("Avec la valeur par défaut de random state")
for i in range(3):
  DT = tree.DecisionTreeClassifier()
  DT = DT.fit(X_train, y_train)
  y_pred = DT.predict(X_test)
  matrice_confusion = confusion_matrix(y_test, y_pred)
  print(matrice_confusion)
  stats = precision_recall_fscore_support(y_test, y_pred)
  print(stats)
  print('Bons résultats %d' % (y_test == y_pred).sum())
  print('Erreurs: %d' % (y_test != y_pred).sum())
  print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
  print("--"*10)

Avec la valeur par défaut de random state
[[698   0   1 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   7   0   0]
 [  0   0   0 ...   0   1   0]
 [  0   0   0 ...   0   0   0]]
(array([0.78163494, 0.        , 0.41176471, 0.        , 0.484375  ,
       0.        , 0.5       , 0.4       , 0.375     , 0.        ,
       0.        , 0.        , 0.65853659, 0.        , 0.        ,
       0.48507463, 0.        , 0.        , 0.61538462, 0.33333333,
       0.        , 1.        , 0.        , 0.75      , 0.        ,
       0.        , 0.75      , 1.        , 0.        , 0.33333333,
       0.54807692, 0.        , 0.        , 0.        , 0.        ,
       0.2962963 , 1.        , 0.18181818, 0.76923077, 0.28571429,
       0.        , 0.        , 0.66666667, 0.25      , 0.        ,
       0.64754098, 0.        , 0.        , 0.375     , 0.        ,
       0.25      , 0.        , 0.        , 0.6       , 0.        ,
       0.2       , 0.       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[698   0   1 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  1   0   6 ...   0   0   0]
 ...
 [  1   0   0 ...   7   0   0]
 [  0   0   0 ...   0   1   0]
 [  1   0   0 ...   0   0   0]]
(array([0.78163494, 0.        , 0.5       , 0.        , 0.44927536,
       0.        , 0.125     , 0.75      , 0.39130435, 0.        ,
       0.5       , 0.        , 0.55555556, 0.        , 0.        ,
       0.        , 0.49593496, 0.        , 0.        , 0.73529412,
       0.3       , 0.        , 0.66666667, 0.        , 0.33333333,
       0.        , 0.        , 0.58823529, 0.5       , 0.        ,
       0.        , 0.        , 0.36363636, 0.58333333, 0.        ,
       0.        , 0.        , 0.        , 0.27272727, 0.5       ,
       0.2       , 0.63157895, 0.375     , 0.        , 0.        ,
       0.        , 0.66666667, 0.33333333, 0.        , 0.67226891,
       0.        , 0.        , 0.33333333, 0.        , 0.33333333,
       0.        , 0.        , 0.45      , 0.        , 0.14285714,
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[697   0   1 ...   0   0   0]
 [  0   0   2 ...   0   0   0]
 [  1   0   6 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  0   0   0 ...   0   1   0]
 [  0   0   0 ...   0   0   0]]
(array([0.77187154, 0.        , 0.27272727, 0.        , 0.51612903,
       0.        , 0.        , 0.54545455, 0.40909091, 0.        ,
       1.        , 0.        , 0.6       , 0.        , 0.        ,
       0.        , 0.51405622, 0.        , 0.        , 0.65714286,
       0.5       , 0.        , 0.5       , 0.        , 0.57142857,
       0.        , 0.        , 0.69230769, 0.14285714, 0.        ,
       0.4       , 0.57142857, 0.        , 0.        , 0.        ,
       0.        , 0.36      , 1.        , 0.28571429, 0.6       ,
       0.3       , 0.        , 0.        , 0.        , 0.65217391,
       0.5       , 0.66260163, 0.        , 0.        , 0.75      ,
       0.        , 0.33333333, 0.        , 0.        , 0.5625    ,
       0.        , 0.125     , 0.        , 0.58490566, 0.60606061,
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [69]:
print("En fixant random state")
for i in range(3):
  DT = tree.DecisionTreeClassifier(random_state=0)
  DT = DT.fit(X_train, y_train)
  y_pred = DT.predict(X_test)
  matrice_confusion = confusion_matrix(y_test, y_pred)
  print(matrice_confusion)
  stats = precision_recall_fscore_support(y_test, y_pred)
  print(stats)
  print('Bons résultats %d' % (y_test == y_pred).sum())
  print('Erreurs: %d' % (y_test != y_pred).sum())
  print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))


En fixant random state
[[694   0   2 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   7   0   0]
 [  0   0   0 ...   0   1   0]
 [  1   0   0 ...   0   0   0]]
(array([0.77802691, 0.        , 0.36842105, 0.        , 0.49206349,
       0.        , 0.09090909, 0.33333333, 0.38888889, 0.        ,
       0.2       , 0.        , 0.54716981, 0.        , 0.        ,
       0.47583643, 0.        , 0.        , 0.65714286, 0.4       ,
       0.        , 0.5       , 0.        , 1.        , 0.        ,
       0.        , 0.55      , 0.33333333, 0.        , 0.5       ,
       0.57291667, 0.        , 0.        , 0.        , 0.        ,
       0.31428571, 0.14285714, 0.18181818, 0.78571429, 0.33333333,
       0.        , 0.        , 0.        , 0.7       , 0.25      ,
       0.        , 0.63709677, 0.        , 0.        , 0.33333333,
       0.        , 0.        , 0.5       , 0.        , 0.        ,
       0.6       , 0.        , 0.2       , 0.  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[694   0   2 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   7   0   0]
 [  0   0   0 ...   0   1   0]
 [  1   0   0 ...   0   0   0]]
(array([0.77802691, 0.        , 0.36842105, 0.        , 0.49206349,
       0.        , 0.09090909, 0.33333333, 0.38888889, 0.        ,
       0.2       , 0.        , 0.54716981, 0.        , 0.        ,
       0.47583643, 0.        , 0.        , 0.65714286, 0.4       ,
       0.        , 0.5       , 0.        , 1.        , 0.        ,
       0.        , 0.55      , 0.33333333, 0.        , 0.5       ,
       0.57291667, 0.        , 0.        , 0.        , 0.        ,
       0.31428571, 0.14285714, 0.18181818, 0.78571429, 0.33333333,
       0.        , 0.        , 0.        , 0.7       , 0.25      ,
       0.        , 0.63709677, 0.        , 0.        , 0.33333333,
       0.        , 0.        , 0.5       , 0.        , 0.        ,
       0.6       , 0.        , 0.2       , 0.        , 0.58490566,
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[694   0   2 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   7   0   0]
 [  0   0   0 ...   0   1   0]
 [  1   0   0 ...   0   0   0]]
(array([0.77802691, 0.        , 0.36842105, 0.        , 0.49206349,
       0.        , 0.09090909, 0.33333333, 0.38888889, 0.        ,
       0.2       , 0.        , 0.54716981, 0.        , 0.        ,
       0.47583643, 0.        , 0.        , 0.65714286, 0.4       ,
       0.        , 0.5       , 0.        , 1.        , 0.        ,
       0.        , 0.55      , 0.33333333, 0.        , 0.5       ,
       0.57291667, 0.        , 0.        , 0.        , 0.        ,
       0.31428571, 0.14285714, 0.18181818, 0.78571429, 0.33333333,
       0.        , 0.        , 0.        , 0.7       , 0.25      ,
       0.        , 0.63709677, 0.        , 0.        , 0.33333333,
       0.        , 0.        , 0.5       , 0.        , 0.        ,
       0.6       , 0.        , 0.2       , 0.        , 0.58490566,
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Différents paramètres que l'on peut faire varier : max_depth, min_samples_split, min_samples_leaf et max_features**

In [70]:
print("On teste max_depth")
for i in range(1, 100):
  DT = tree.DecisionTreeClassifier(max_depth=i)
  DT = DT.fit(X_train, y_train)
  y_pred = DT.predict(X_test)
  matrice_confusion = confusion_matrix(y_test, y_pred)
  print("Avec max_depth=", i)
  print(matrice_confusion)
  print('Bons résultats %d' % (y_test == y_pred).sum())
  print('Erreurs: %d' % (y_test != y_pred).sum())
  print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

On teste max_depth
Avec max_depth= 1
[[630   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4155
Erreurs: 2319
Accuracy: 0.64
Avec max_depth= 2
[[630   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4281
Erreurs: 2193
Accuracy: 0.66
Avec max_depth= 3
[[631   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4284
Erreurs: 2190
Accuracy: 0.66
Avec max_depth= 4
[[634   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0 

Avec max_depth= 33
[[680   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   4   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4875
Erreurs: 1599
Accuracy: 0.75
Avec max_depth= 34
[[682   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   4   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4874
Erreurs: 1600
Accuracy: 0.75
Avec max_depth= 35
[[683   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   4   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4891
Erreurs: 1583
Accuracy: 0.76
Avec max_depth= 36
[[682   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   4   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résu

Avec max_depth= 65
[[698   0   1 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   6 ...   0   0   0]
 ...
 [  1   0   0 ...   7   0   0]
 [  0   0   2 ...   0   4   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4963
Erreurs: 1511
Accuracy: 0.77
Avec max_depth= 66
[[695   0   1 ...   0   0   2]
 [  0   0   0 ...   0   0   0]
 [  1   0   6 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  0   0   0 ...   0   5   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4987
Erreurs: 1487
Accuracy: 0.77
Avec max_depth= 67
[[696   0   1 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   8 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  0   0   2 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4975
Erreurs: 1499
Accuracy: 0.77
Avec max_depth= 68
[[698   0   1 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  0   0   2 ...   0   2   0]
 [  0   0   0 ...   0   0   0]]
Bons résu

Avec max_depth= 97
[[697   0   1 ...   0   0   0]
 [  0   0   2 ...   0   0   0]
 [  0   0   7 ...   0   0   0]
 ...
 [  1   0   1 ...   5   0   0]
 [  0   0   0 ...   0   1   0]
 [  0   0   1 ...   0   0   0]]
Bons résultats 4960
Erreurs: 1514
Accuracy: 0.77
Avec max_depth= 98
[[697   0   1 ...   0   0   0]
 [  0   0   2 ...   0   0   0]
 [  0   0   7 ...   0   0   0]
 ...
 [  1   0   1 ...   5   0   0]
 [  0   0   0 ...   0   4   0]
 [  0   0   1 ...   0   0   0]]
Bons résultats 4950
Erreurs: 1524
Accuracy: 0.76
Avec max_depth= 99
[[698   0   1 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   6 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  0   0   0 ...   0   1   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4958
Erreurs: 1516
Accuracy: 0.77


Meilleur résultat obtenu :
Avec max_depth= 96
[[1265   49]
 [  30  375]]
(array([0.97683398, 0.88443396]), array([0.96270928, 0.92592593]), array([0.9697202 , 0.90470446]), array([1314,  405]))
Bons résultats 1640
Erreurs: 79

In [15]:
print("On teste min_samples_split:")
for i in range(1, 20):
  DT = tree.DecisionTreeClassifier(max_depth=i)
  DT = DT.fit(X_train, y_train)
  y_pred = DT.predict(X_test)
  matrice_confusion = confusion_matrix(y_test, y_pred)
  print("Avec max_depth=", i)
  print(matrice_confusion)
  print('Bons résultats %d' % (y_test == y_pred).sum())
  print('Erreurs: %d' % (y_test != y_pred).sum())
  print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

On teste min_samples_split:
Avec max_depth= 1
[[630   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4155
Erreurs: 2319
Accuracy: 0.64
Avec max_depth= 2
[[630   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4281
Erreurs: 2193
Accuracy: 0.66
Avec max_depth= 3
[[631   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4284
Erreurs: 2190
Accuracy: 0.66
Avec max_depth= 4
[[634   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...

In [8]:
print("On teste min_samples_leaf:")
for i in range(1, 20):
  DT = tree.DecisionTreeClassifier(max_depth=i)
  DT = DT.fit(X_train, y_train)
  y_pred = DT.predict(X_test)
  matrice_confusion = confusion_matrix(y_test, y_pred)
  print("Avec max_depth=", i)
  print(matrice_confusion)
  print('Bons résultats %d' % (y_test == y_pred).sum())
  print('Erreurs: %d' % (y_test != y_pred).sum())
  print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

On teste min_samples_leaf:
Avec max_depth= 1
[[630   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4155
Erreurs: 2319
Accuracy: 0.64
Avec max_depth= 2
[[630   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4281
Erreurs: 2193
Accuracy: 0.66
Avec max_depth= 3
[[631   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4284
Erreurs: 2190
Accuracy: 0.66
Avec max_depth= 4
[[634   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ... 

In [9]:
print("On teste max_features:")
for i in range(1, 20):
  DT = tree.DecisionTreeClassifier(max_depth=i)
  DT = DT.fit(X_train, y_train)
  y_pred = DT.predict(X_test)
  matrice_confusion = confusion_matrix(y_test, y_pred)
  print("Avec max_depth=", i)
  print(matrice_confusion)
  print('Bons résultats %d' % (y_test == y_pred).sum())
  print('Erreurs: %d' % (y_test != y_pred).sum())
  print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

On teste max_features:
Avec max_depth= 1
[[630   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4155
Erreurs: 2319
Accuracy: 0.64
Avec max_depth= 2
[[630   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4281
Erreurs: 2193
Accuracy: 0.66
Avec max_depth= 3
[[631   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Bons résultats 4284
Erreurs: 2190
Accuracy: 0.66
Avec max_depth= 4
[[634   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0 

In [10]:
# pour ces trois derniers paramètres on ne passe jamais sous la barre des 85 erreurs.

### Autres caracétristiques

In [None]:
import statistics
from nltk.tokenize import sent_tokenize

nb_phrases= 0
for i in texte:
    longueur_text=len(sent_tokenize(i))
    nb_phrases += longueur_text
    
X_stylo = []#notre nouvelle matrice de description
for text in texte:
    mots=word_tokenize(text)
    phrases = sent_tokenize(text)
    NB_mots = len(mots)
    NB_caracteres = len(text)
    for x in mots :
        moyenne_taille_mots = statistics.mean([len(x)])
    moyenne_taille_phrases = NB_mots/nb_phrases
    caracteristiques = [nb_phrases, NB_mots, NB_caracteres, moyenne_taille_mots, moyenne_taille_phrases]
    X_stylo.append(caracteristiques)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_stylo, y, test_size=0.3, random_state=0)
DT = tree.DecisionTreeClassifier()
DT = DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)
matrice_confusion = confusion_matrix(y_test, y_pred)
print(matrice_confusion)
stats = precision_recall_fscore_support(y_test, y_pred)
print(stats)
report = classification_report(y_test, y_pred)
print(report)
print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

[[647   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   2 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
(array([0.66358974, 0.        , 0.15384615, 0.        , 0.09210526,
       0.        , 0.        , 0.        , 0.        , 0.04166667,
       0.        , 0.        , 0.        , 0.04166667, 0.        ,
       0.        , 0.        , 0.08090615, 0.        , 0.        ,
       0.02941176, 0.        , 0.        , 1.        , 0.        ,
       0.2       , 0.        , 0.        , 0.04166667, 0.5       ,
       0.        , 0.        , 0.08333333, 0.00943396, 0.        ,
       0.        , 0.        , 0.        , 0.06818182, 0.33333333,
       0.        , 0.2       , 0.07692308, 0.        , 0.        ,
       0.        , 0.03448276, 0.125     , 0.        , 0.2034632 ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# taux d'exactitude très mauvais puisqu'on fait moins que le hasard.

**Et maintenant on combine le BOW et le stylométrique**

In [32]:
## on regarde la "forme" de X
print(X.shape[0])#NB lignes   -> instances
print(X.shape[1])#Nb colonnes -> caractéristiques

##on crée une sparse matrix avec notre X_stylo
from scipy.sparse import csr_matrix
sparse_stylo = csr_matrix(X_stylo)
print(sparse_stylo.shape[0])#NB lignes   -> instances
print(sparse_stylo.shape[1])#Nb colonnes -> caractéristiques

## on a le même nombre de ligne, on fait donc une conctaténation horizontale :
from scipy.sparse import hstack
X_fusion = hstack((X, sparse_stylo))

21578
671682
21578
5


**Résultats:**

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_fusion, y, test_size=0.3, random_state=0)
DT = tree.DecisionTreeClassifier()
DT = DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)
matrice_confusion = confusion_matrix(y_test, y_pred)
print(matrice_confusion)
report = classification_report(y_test, y_pred)
print(report)
print('Bons résultats %d' % (y_test == y_pred).sum())
print('Erreurs: %d' % (y_test != y_pred).sum())
print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

[[697   0   2 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   4 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   0]
 [  0   0   0 ...   0   1   0]
 [  0   0   0 ...   0   0   0]]
                     precision    recall  f1-score   support

                          0.78      0.86      0.82       811
            algeria       0.00      0.00      0.00         2
          argentina       0.19      0.27      0.22        15
              aruba       0.00      0.00      0.00         1
          australia       0.54      0.47      0.50        60
            austria       0.00      0.00      0.00         5
            bahrain       0.14      0.12      0.13         8
         bangladesh       0.56      0.83      0.67         6
            belgium       0.52      0.39      0.45        56
            bermuda       0.00      0.00      0.00         2
            bolivia       0.00      0.00      0.00         5
           botswana       0.00      0.00      0.00         1
             b

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#résultats moyens

## Autres classifieurs

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings 
warnings.filterwarnings("ignore")

LR = LogisticRegression()
RDF = RandomForestClassifier()
SVC = SVC()
KN = KNeighborsClassifier()

In [None]:
liste = [LR, RDF]

for i in liste:
    i = i.fit(X_train, y_train)
    y_pred = i.predict(X_test)
    matrice_confusion = confusion_matrix(y_test, y_pred)
    print(i)
    print(matrice_confusion)
    report = classification_report(y_test, y_pred)
    print(report)
    print('Bons résultats %d' % (y_test == y_pred).sum())
    print('Erreurs: %d' % (y_test != y_pred).sum())
    print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
    print("-"*10)

LogisticRegression()
[[723   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  1   0   8 ...   0   0   0]
 ...
 [  1   0   0 ...   5   0   1]
 [  0   0   0 ...   0   2   0]
 [  0   0   0 ...   0   0   0]]
                     precision    recall  f1-score   support

                          0.83      0.89      0.86       811
            algeria       0.00      0.00      0.00         2
          argentina       0.89      0.53      0.67        15
              aruba       0.00      0.00      0.00         1
          australia       0.80      0.75      0.78        60
            austria       1.00      0.20      0.33         5
            bahrain       0.50      0.38      0.43         8
         bangladesh       0.50      0.17      0.25         6
            belgium       0.52      0.62      0.57        56
            bermuda       0.00      0.00      0.00         2
            bolivia       1.00      0.40      0.57         5
           botswana       0.00      0.00      0.00    

In [17]:
liste2 = [SVC, KN] # on divise parce que c'est très long à charger et le notebook plante
for i in liste2:
    i = i.fit(X_train, y_train)
    y_pred = i.predict(X_test)
    matrice_confusion = confusion_matrix(y_test, y_pred)
    print(i)
    print(matrice_confusion)
    report = classification_report(y_test, y_pred)
    print(report)
    print('Bons résultats %d' % (y_test == y_pred).sum())
    print('Erreurs: %d' % (y_test != y_pred).sum())
    print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
    print("-"*10)

SVC()
[[716   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  3   0   1 ...   0   0   0]
 ...
 [  1   0   0 ...   1   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
                     precision    recall  f1-score   support

                          0.84      0.88      0.86       811
            algeria       0.00      0.00      0.00         2
          argentina       1.00      0.07      0.12        15
              aruba       0.00      0.00      0.00         1
          australia       0.83      0.08      0.15        60
            austria       0.00      0.00      0.00         5
            bahrain       0.00      0.00      0.00         8
         bangladesh       0.00      0.00      0.00         6
            belgium       0.57      0.23      0.33        56
            bermuda       0.00      0.00      0.00         2
            bolivia       0.00      0.00      0.00         5
           botswana       0.00      0.00      0.00         1
        

In [None]:
# meilleurs résultats avec le LogisticRegression : ce sont les meilleurs obtenus jusqu'à maintenant

### En faisant varier les n-grammes

In [None]:
# ici aussi on divise en deux les modules

In [21]:
for min_N in range(1, 2):
  for max_N in range(min_N, 4):
    V = CountVectorizer(ngram_range = (min_N, max_N), analyzer = "char")
    X = V.fit_transform(texte)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
    print(f"Ngram_range : ({min_N}, {max_N})")
    for i in liste:
        clf = i.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print('%s classifier : %.4f'%(i, score))

Ngram_range : (1, 1)
LogisticRegression() classifier : 0.5366
RandomForestClassifier() classifier : 0.6850
Ngram_range : (1, 2)
LogisticRegression() classifier : 0.6168
RandomForestClassifier() classifier : 0.6969
Ngram_range : (1, 3)
LogisticRegression() classifier : 0.7521
RandomForestClassifier() classifier : 0.7028
Ngram_range : (1, 4)
LogisticRegression() classifier : 0.7728
RandomForestClassifier() classifier : 0.7071


In [None]:
for min_N in range(1, 2):
  for max_N in range(min_N, 5):
    V = CountVectorizer(ngram_range = (min_N, max_N), analyzer = "char")
    X = V.fit_transform(texte)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
    print(f"Ngram_range : ({min_N}, {max_N})")
    for i in liste2:
        clf = i.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print('%s classifier : %.4f'%(i, score))

### Forêt aléatoire : zoom sur quelques paramètres

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
    # On fait la somme de tous les cas où la valeur dans y_test est bien trouvée dans y_pred
print('Bons résultats %d' % (y_test == y_pred).sum())
print('Erreurs: %d' % (y_test != y_pred).sum())
print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

Bons résultats 4591
Erreurs: 1883
Accuracy: 0.71


In [None]:
# ci-dessus : paramètres par défaut

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

for i in range(1,20):
    clf = RandomForestClassifier(max_depth=i, random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # On fait la somme de tous les cas où la valeur dans y_test est bien trouvée dans y_pred
    print("Avec max_depth =", i)
    print('Bons résultats %d' % (y_test == y_pred).sum())
    print('Erreurs: %d' % (y_test != y_pred).sum())
    print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

Avec max_depth = 1
Bons résultats 3606
Erreurs: 2868
Accuracy: 0.56
Avec max_depth = 2
Bons résultats 4158
Erreurs: 2316
Accuracy: 0.64
Avec max_depth = 3
Bons résultats 4159
Erreurs: 2315
Accuracy: 0.64
Avec max_depth = 4
Bons résultats 4162
Erreurs: 2312
Accuracy: 0.64
Avec max_depth = 5
Bons résultats 4167
Erreurs: 2307
Accuracy: 0.64
Avec max_depth = 6
Bons résultats 4199
Erreurs: 2275
Accuracy: 0.65
Avec max_depth = 7
Bons résultats 4204
Erreurs: 2270
Accuracy: 0.65
Avec max_depth = 8
Bons résultats 4233
Erreurs: 2241
Accuracy: 0.65
Avec max_depth = 9
Bons résultats 4233
Erreurs: 2241
Accuracy: 0.65
Avec max_depth = 10
Bons résultats 4260
Erreurs: 2214
Accuracy: 0.66
Avec max_depth = 11
Bons résultats 4273
Erreurs: 2201
Accuracy: 0.66
Avec max_depth = 12
Bons résultats 4280
Erreurs: 2194
Accuracy: 0.66
Avec max_depth = 13
Bons résultats 4321
Erreurs: 2153
Accuracy: 0.67
Avec max_depth = 14
Bons résultats 4320
Erreurs: 2154
Accuracy: 0.67
Avec max_depth = 15
Bons résultats 4355
Err

KeyboardInterrupt: 

In [None]:
# amélioration des résultats jusqu'à max_depth = 95, puis augmentation du nombre d'erreurs
# mais moins bons résultats qu'avec les paramètres par défaut

### En supprimant la ponctuation et les stopwords

In [None]:
# rappels pour la cellule ci-après (ces éléments de codes ont été exécutés plus avant)
# LR = LogisticRegression()
# RDF = RandomForestClassifier()
# SVC = SVC()
# KN = KNeighborsClassifier()
# Liste = [LR, RDF, SVC, KN]

In [30]:
from nltk.corpus import stopwords

In [25]:
import re

In [26]:
def remove_ponctuation(chaine):
    ponctuations = [",", "'", '"', "-", "\."]
    for stopword in ponctuations:
        chaine = re.sub(f" {stopword} ", " ", chaine)
    return chaine

In [27]:
def remove_stopwords(chaine):
    final_stopwords_list = stopwords.words('english')
    s = chaine
    for stopword in final_stopwords_list:
        chaine = re.sub(f" {stopword} ", " ", chaine)
    return chaine

In [None]:
#je remarque en changeant d'ordinateur que Jupyter tourne plus rapidement alors je refonde en une liste les modules

In [33]:
liste3 = [LR, RDF, SVC, KN]

In [36]:
for pretraitement in ["stopwords", "ponctuation"]:
    if pretraitement =="stopwords":
        liste_pretraite = [remove_stopwords(j) for j in texte]
        X = V.fit_transform(liste_pretraite)
    elif pretraitement =="ponctuation":
        liste_titres_pretraite = [remove_ponctuation(j) for j in texte]
        X = V.fit_transform(liste_pretraite)    
    else:
        X = V.fit_transform(texte)
    for split_size in [0.3]: #[0.1, 0.2, 0.3, 0.9]:
        print(f"Split_size : {split_size}, Pretraitement: {pretraitement}")
    #découpage train VS test
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split_size, random_state=0)
    ##classification
        for i in liste3:
            i = i.fit(X_train, y_train)
            y_pred = i.predict(X_test)
            matrice_confusion = confusion_matrix(y_test, y_pred)
            print(i)
            print(matrice_confusion)
            stats = precision_recall_fscore_support(y_test, y_pred)
            print(stats)
            report = classification_report(y_test, y_pred)
            print(report)
            print('Bons résultats %d' % (y_test == y_pred).sum())
            print('Erreurs: %d' % (y_test != y_pred).sum())
            print ('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
            print("-"*10)

Split_size : 0.3, Pretraitement: stopwords
LogisticRegression()
[[723   0   0 ...   0   0   0]
 [  2   0   0 ...   0   0   0]
 [  1   0   9 ...   0   0   0]
 ...
 [  1   0   0 ...   3   0   0]
 [  0   0   0 ...   0   3   0]
 [  0   0   0 ...   0   0   1]]
(array([0.78844057, 0.        , 0.81818182, 0.        , 0.70689655,
       0.        , 0.5       , 0.        , 0.46296296, 0.        ,
       0.4       , 0.        , 0.57142857, 0.        , 0.        ,
       0.57777778, 0.        , 0.        , 0.63829787, 0.57142857,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.6875    , 1.        , 0.        , 1.        ,
       0.59090909, 0.        , 1.        , 0.        , 0.        ,
       0.7826087 , 0.5       , 0.27272727, 0.93333333, 0.54545455,
       0.        , 0.        , 0.51515152, 0.33333333, 0.73684211,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.66666667, 0.        , 0.33333333,
      

RandomForestClassifier()
[[661   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   2 ...   0   0   0]
 ...
 [  1   0   0 ...   1   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
(array([0.84203822, 0.        , 1.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.8       , 0.        ,
       0.        , 0.        , 0.90909091, 0.        , 0.        ,
       0.80645161, 0.        , 0.        , 1.        , 1.        ,
       0.        , 1.        , 0.        , 1.        , 0.        ,
       0.        , 0.66666667, 1.        , 0.        , 1.        ,
       1.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.        , 1.        , 1.        , 0.90196078, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 1.        , 0.        , 0.

SVC()
[[669   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   2 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
(array([0.83834586, 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.75      , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.86363636, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.83673469, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.       

KNeighborsClassifier()
[[707   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  2   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  2   0   0 ...   0   1   0]
 [  0   0   0 ...   0   0   0]]
(array([0.74814815, 0.        , 0.5       , 0.        , 0.29310345,
       0.        , 0.        , 0.        , 0.26582278, 0.        ,
       0.4       , 0.        , 0.2962963 , 0.        , 0.        ,
       0.        , 0.38461538, 0.        , 0.        , 0.33333333,
       0.11111111, 1.        , 0.        , 0.25      , 0.375     ,
       0.        , 0.        , 0.21875   , 0.5       , 0.        ,
       0.        , 0.        , 0.38554217, 0.        , 0.        ,
       0.        , 0.        , 0.41666667, 0.        , 0.        ,
       0.33333333, 0.5       , 0.        , 0.        , 0.47368421,
       0.        , 0.5       , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.33333333,
       0.        , 0.        , 0.        , 0.75

Split_size : 0.3, Pretraitement: ponctuation
LogisticRegression()
[[723   0   0 ...   0   0   0]
 [  2   0   0 ...   0   0   0]
 [  1   0   9 ...   0   0   0]
 ...
 [  1   0   0 ...   3   0   0]
 [  0   0   0 ...   0   3   0]
 [  0   0   0 ...   0   0   1]]
(array([0.78844057, 0.        , 0.81818182, 0.        , 0.70689655,
       0.        , 0.5       , 0.        , 0.46296296, 0.        ,
       0.4       , 0.        , 0.57142857, 0.        , 0.        ,
       0.57777778, 0.        , 0.        , 0.63829787, 0.57142857,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.6875    , 1.        , 0.        , 1.        ,
       0.59090909, 0.        , 1.        , 0.        , 0.        ,
       0.7826087 , 0.5       , 0.27272727, 0.93333333, 0.54545455,
       0.        , 0.        , 0.51515152, 0.33333333, 0.73684211,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.66666667, 0.        , 0.33333333,
    

RandomForestClassifier()
[[669   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   2 ...   0   0   0]
 ...
 [  1   0   0 ...   1   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
(array([0.84363178, 0.        , 1.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.83333333, 0.        ,
       0.        , 0.        , 0.90909091, 0.        , 0.        ,
       0.83333333, 0.        , 0.        , 1.        , 1.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 1.        , 0.        , 1.        ,
       1.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.        , 1.        , 1.        , 0.86915888, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       1.        , 1.        , 0.        , 0.

SVC()
[[669   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   2 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
(array([0.83834586, 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.75      , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.86363636, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.83673469, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.       

KNeighborsClassifier()
[[707   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  2   0   7 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  2   0   0 ...   0   1   0]
 [  0   0   0 ...   0   0   0]]
(array([0.74814815, 0.        , 0.5       , 0.        , 0.29310345,
       0.        , 0.        , 0.        , 0.26582278, 0.        ,
       0.4       , 0.        , 0.2962963 , 0.        , 0.        ,
       0.        , 0.38461538, 0.        , 0.        , 0.33333333,
       0.11111111, 1.        , 0.        , 0.25      , 0.375     ,
       0.        , 0.        , 0.21875   , 0.5       , 0.        ,
       0.        , 0.        , 0.38554217, 0.        , 0.        ,
       0.        , 0.        , 0.41666667, 0.        , 0.        ,
       0.33333333, 0.5       , 0.        , 0.        , 0.47368421,
       0.        , 0.5       , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.33333333,
       0.        , 0.        , 0.        , 0.75