In [20]:
# Des installations peuvent être nécessaires (décommenter les lignes nltk.download ci-dessous)
import nltk
#nltk.download()
#nltk.download('stopwords')

In [21]:
# Importation des librairies
import numpy as np 
import pandas as pd
import os
import random

import nltk
from nltk.corpus import stopwords
import string
import nltk
#nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [22]:
# Dataset taken from: 'https://www.kaggle.com/code/kishanmodasiya/spam-ham-detection/data'
# The path is relative, so the CSV is looked up in the current working directory.

Table = "spam_ham_dataset.csv"

In [23]:
# Load the CSV file named by `Table` into a DataFrame.

data = pd.read_csv(filepath_or_buffer=Table)

In [24]:
# Dataset overview: first rows, column dtypes, shape and column names.
#
# The heading used to announce 4 rows while `data.head()` prints 5 by default;
# the heading and the row count now come from the same constant.
N_PREVIEW_ROWS = 5

overview = (
    (f"{N_PREVIEW_ROWS} premières lignes", data.head(N_PREVIEW_ROWS)),
    ("Type :", data.dtypes),
    ("Taille :", data.shape),
    ("Titre des colonnes :", data.columns),
)
for heading, value in overview:
    print("######")
    print(heading)
    print(value)

# The data will have to be split into a training set and a test set.
# The informative column is 'text'.
# 'label' says whether a message is spam or not.

######
4 premières lignes
   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2        3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3        4685  spam  Subject: photoshop , windows , office . cheap ...   
4        2030   ham  Subject: re : indian springs\r\nthis deal is t...   

   label_num  
0          0  
1          0  
2          0  
3          1  
4          0  
######
Type :
Unnamed: 0     int64
label         object
text          object
label_num      int64
dtype: object
######
Taille :
(5171, 4)
######
Titre des colonnes :
Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')


In [25]:
#  Nous allons séparer 'text' en liste de mots pour une analyse pertinente
# et enlever la ponctuation

_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def traitement_text(text, stop_words=None):
    """Remove punctuation from ``text`` and return its words without stop words.

    Parameters
    ----------
    text : str
        Raw message. Every character found in ``string.punctuation`` is
        dropped before the text is split on whitespace.
    stop_words : container of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used; it is loaded once and cached, instead of being re-read
        and scanned as a list for every single word.

    Returns
    -------
    list of str
        The remaining words, in order and with their original case. Only the
        stop-word comparison is case-insensitive.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    # Filter character by character, then rebuild the text.
    sans_ponctuation = ''.join(
        caractere for caractere in text if caractere not in string.punctuation
    )
    return [
        mot for mot in sans_ponctuation.split()
        if mot.lower() not in stop_words
    ]

# Sanity check: run the tokenizer on the rows returned by head() and display the result.
data['text'].head().apply(traitement_text)

0    [Subject, enron, methanol, meter, 988291, foll...
1    [Subject, hpl, nom, january, 9, 2001, see, att...
2    [Subject, neon, retreat, ho, ho, ho, around, w...
3    [Subject, photoshop, windows, office, cheap, m...
4    [Subject, indian, springs, deal, book, teco, p...
Name: text, dtype: object

In [26]:
# Turn every message into a vector of token counts so that a model can use it.
# Tokenisation is delegated to traitement_text; this cell can take a while to run.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, and the fitted object is not kept - confirm this is intended.

text_vecteur = (
    CountVectorizer(analyzer=traitement_text)
    .fit_transform(data['text'])
)

text_vecteur

<5171x50381 sparse matrix of type '<class 'numpy.int64'>'
	with 374350 stored elements in Compressed Sparse Row format>

In [27]:
# Split into training and test sets (test_size=0.2).

# Seed Python's global RNG; the split itself is made repeatable by the
# random_state argument passed below.
random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(
    text_vecteur,
    data['label_num'],
    test_size=0.2,
    random_state=0,
)

In [28]:
# Baseline random forest: classification report, confusion matrix and accuracy.
#
# scikit-learn does not draw from Python's `random` module, so the former
# `random.seed(42)` call did not make the forest reproducible. The seed is now
# passed through `random_state`, which gives the same model on every run.
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)
pred = rfc.predict(X_test)
print(classification_report(y_test, pred))
print("####")
print("Matrice de confusion\n", confusion_matrix(y_test, pred))
print("####")
print("Score d'accuracy\n", accuracy_score(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       732
           1       0.95      0.98      0.96       303

    accuracy                           0.98      1035
   macro avg       0.97      0.98      0.97      1035
weighted avg       0.98      0.98      0.98      1035

####
Matrice de confusion
 [[716  16]
 [  6 297]]
####
Score d'accuracy
 0.978743961352657


In [29]:
# Meilleurs hyperparamètres trouvés
# max_depth=None (valeurs testées : de 1 à 45)
# n_estimators=82 (valeurs testées : de 30 à 150)
# criterion='gini' (valeurs testées : gini et entropy)

# Score d'accuracy obtenu : 0.9806763285024155

In [30]:
# Helper loop for hyper-parameter experiments.
# To run it, set TEST = True and change the parameter being swept.

TEST = False

if TEST:
    for i in range(80, 90):
        print("I=",i)
        # random_state replaces the former random.seed(42): scikit-learn ignores
        # Python's `random` module, so without it two runs with the same `i`
        # could give different scores and the sweep could not be compared.
        rfc = RandomForestClassifier(
            n_estimators=i,
            criterion='gini',
            max_depth=None,
            random_state=42,
        ).fit(X_train, y_train)
        pred = rfc.predict(X_test)
        print("Score d'accuracy\n", accuracy_score(y_test, pred))

In [31]:
# Random forest with the hyper-parameters retained after the search
# (n_estimators=82, criterion='gini', max_depth=None).
#
# The score once noted for this configuration was 0.9806763285024155, but the
# forest was not actually seeded (random.seed does not affect scikit-learn),
# so reruns gave other values. random_state makes the result repeatable.
# NOTE(review): re-run the search with the fixed seed to confirm that
# n_estimators=82 is still the best setting.
rfc = RandomForestClassifier(
    n_estimators=82,
    criterion='gini',
    max_depth=None,
    random_state=42,
).fit(X_train, y_train)
pred = rfc.predict(X_test)

print(classification_report(y_test, pred))
print("####")
print("Matrice de confusion\n", confusion_matrix(y_test, pred))
print("####")
print("Score d'accuracy\n", accuracy_score(y_test, pred))


              precision    recall  f1-score   support

           0       0.99      0.97      0.98       732
           1       0.94      0.97      0.96       303

    accuracy                           0.97      1035
   macro avg       0.96      0.97      0.97      1035
weighted avg       0.97      0.97      0.97      1035

####
Matrice de confusion
 [[713  19]
 [  8 295]]
####
Score d'accuracy
 0.9739130434782609
