In [149]:
import urllib.request
import sys
import tarfile
import os
import numpy
import sklearn
import email
import email.policy
import numpy as np
import re
from html import unescape
from sklearn.model_selection import train_test_split
import collections
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
import string

# Loading data
 

In [150]:
def download_and_extract_dataset(file_names, urls, download_directory, dataset_type):
    for (file_name, url) in zip(file_names, urls):
        file_path = os.path.join(download_directory, file_name)
        if not os.path.isfile(file_path):
            urllib.request.urlretrieve(url, file_path)
        tar_file = tarfile.open(file_path)
        
        # Remove the path by resetting it
        members = []
        for member in tar_file.getmembers():
            if member.isreg():
                member.name = os.path.basename(member.name) 
                members.append(member)
        tar_file.extractall(path=os.path.join(download_directory, dataset_type), members=members)
        tar_file.close()

In [151]:
root = "https://spamassassin.apache.org/old/publiccorpus/"

ham1_url = root + "20021010_easy_ham.tar.bz2"

ham3_url = root + "20030228_easy_ham_2.tar.bz2"

ham5_url = root + "20030228_hard_ham.tar.bz2"

ham_url = [ham1_url, ham3_url, ham5_url]

ham_filename = ["ham1.tar.bz2", "ham3.tar.bz2", "ham5.tar.bz2"]

spam1_url = root + "20021010_spam.tar.bz2"

spam4_url = root + "20050311_spam_2.tar.bz2"

spam_url = [spam1_url, spam4_url]

spam_filename = ["spam1.tar.bz2", "spam4.tar.bz2"]

path = "./data/"

if not os.path.isdir(path):
 os.makedirs(path)

download_and_extract_dataset(spam_filename, spam_url, path, "spam")

download_and_extract_dataset(ham_filename, ham_url, path, "ham")

In [152]:
def load_emails(directory, filename):
    
    with open(os.path.join(directory, filename), "rb") as f:
      
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [153]:

ham_filenames = [name for name in sorted(os.listdir("./data/ham")) if name != 'cmds']
spam_filenames = [name for name in sorted(os.listdir("./data/spam")) if name != 'cmds']

ham_emails = [load_emails("./data/ham", filename=name) for name in ham_filenames]
spam_emails = [load_emails("./data/spam", filename=name) for name in spam_filenames]

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))


In [154]:
len(spam_emails)

1897

In [155]:
ham= np.array(ham_emails,dtype=object)
spam= np.array(spam_emails,dtype=object)


In [156]:
X.shape, y.shape

((6098,), (6098,))

In [157]:
y

array([0, 0, 0, ..., 1, 1, 1])

# Data Preprocessing
 

Cette fonction prétraite le corps d'un email

In [158]:

def preprocessing(email_contents):

  # Convert all letters to lowercase
  email_contents=email_contents.lower()
  # Remove HTML tags
  email_contents = re.sub('<[^<>]+>', ' ', email_contents)
  # Normalize URLs
  email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)
   # Normalize email addresses
  email_contents = re.sub('\S+@\S+', 'emailaddr', email_contents)
   # Normalize numbers
  email_contents = re.sub('\d+', 'nombre', email_contents)
   # Normalize dollar signs
  email_contents = re.sub('\$', 'dollar', email_contents)
   # Stem words
  stemmer = PorterStemmer()
  words = re.findall('\w+', email_contents)
  stemmed_words = [stemmer.stem(word) for word in words]
  email_contents = ' '.join(stemmed_words)
 # Remove non-words and punctuation, replace white spaces with a single space
   # Replace non-word characters with a space
  email_contents = re.sub(r'\W+', ' ', email_contents)
    #Remove punctuation
  email_contents = email_contents.translate(str.maketrans('', '', string.punctuation)) 
    # Replace newlines and tabs with a space
  email_contents= re.sub(r'\n|\t', ' ', email_contents)
    
    # Normalize whitespace
  email_contents = re.sub(r'\s+', ' ', email_contents).strip()
  return email_contents


In [159]:
#Convert html to text
def html_to_text(html):

    email_content= re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    email_content = re.sub(r'<a\s.*?>', ' HYPERLINK ', email_content, flags=re.M | re.S | re.I)
    email_content = re.sub(r'<.*?>', '', email_content, flags=re.M | re.S)
    email_content = re.sub(r'(\s*\n)+', '\n', email_content, flags=re.M | re.S)
    
    return unescape(email_content) 

#Convert email to texte (lisibe)
def email_to_text(email):
    
    html = None
    for entity in email.walk():

        #Some emails have multiple parts, each part is handled separately
        entity_type = entity.get_content_type()
        if not entity_type in ("text/plain", "text/html"):
            continue
        
        try:
            content = entity.get_content()
            #Sometimes this is impossible for encoding reasons
        except: 
            content = str(entity.get_payload())
        if entity_type == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_text(html)

In [161]:
##la fonction qui fait le preprocessig de tous les emails
def preprocess(X): 
  emails_process =[]

  for email_content in X:
    email_content=email_to_text(email_content) or " "
    email_content=preprocessing(email_content)
    emails_process.append(email_content)

  return emails_process

In [187]:
_X=preprocess(X)

Exemple pour voir le preprocessing 

In [192]:
print("\nAvant le preprocessing :\n",email_to_text(X[0]))


Avant le preprocessing :
     Date:        Tue, 20 Aug 2002 17:27:47 -0500
    From:        Chris Garrigues <cwg-exmh@DeepEddy.Com>
    Message-ID:  <1029882468.3116.TMDA@deepeddy.vircio.com>


  | I'm hoping that all people with no additional sequences will notice are
  | purely cosmetic changes.

Well, first, when exmh (the latest one with your changes) starts, I get...

can't read "flist(totalcount,unseen)": no such element in array
    while executing
"if {$flist(totalcount,$mhProfile(unseen-sequence)) > 0} {
	FlagInner spool iconspool labelup
    } else {
	FlagInner down icondown labeldown
    }"
    (procedure "Flag_MsgSeen" line 3)
    invoked from within
"Flag_MsgSeen"
    (procedure "MsgSeen" line 8)
    invoked from within
"MsgSeen $msgid"
    (procedure "MsgShow" line 12)
    invoked from within
"MsgShow $msgid"
    (procedure "MsgChange" line 17)
    invoked from within
"MsgChange 4862 show"
    invoked from within
"time [list MsgChange $msgid $show"
    (procedure "Msg_Ch

In [189]:
print("\nApres le preprocessing :\n",_X[0])


Apres le preprocessing :
 date tue nombr aug nombr nombr nombr nombr nombr from chri garrigu messag id i m hope that all peopl with no addit sequenc will notic are pure cosmet chang well first when exmh the latest one with your chang start i get can t read flist totalcount unseen no such element in array while execut if dollarflist totalcount dollarmhprofil unseen sequenc nombr flaginn spool iconspool labelup els flaginn down icondown labeldown procedur flagmsgseen line nombr invok from within flagmsgseen procedur msgseen line nombr invok from within msgseen dollarmsgid procedur msgshow line nombr invok from within msgshow dollarmsgid procedur msgchang line nombr invok from within msgchang nombr show invok from within time list msgchang dollarmsgid dollarshow procedur msgchang line nombr invok from within msgchang dollarmsg id dollarshow procedur msgshow line nombr invok from within msgshow cur eval bodi line nombr invok from within eval dollarmsgshowproc procedur folderchang line nom

#Vocabulary building
The vocabulary list is intended for spam emails only.

In [164]:
#Creation list_vocabulaire
def list_vocabulaire2(X,k):
  list=X.split()
  vocabulaire=[]
  v = collections.Counter(list)
  keys = v.keys() # récupérer les clés de v
  for key in keys:
   if v[key]>k :
    vocabulaire.append(key) #ajouter les mots qui se repetent plus q k fois dans la lste vocabulaire
  return vocabulaire

In [178]:
spam=_X[len(ham_emails):] #retourner les emails spam

In [179]:
vocab = spam
vocab = ''.join(vocab)
v = list_vocabulaire2(vocab,5) # pour le k nous avonss testé ses valeurs et le k=5 donne le meilleur résultat

In [180]:
len(v)

4631

In [168]:
'''f = open("Vocabulaire.txt", "w")
for word in v:
    
    f.write(word )
    f.write('\n')'''

'f = open("Vocabulaire.txt", "w")\nfor word in v:\n    \n    f.write(word )\n    f.write(\'\n\')'

# Extract features

## Représentation binaire des caractéristiques :

In [181]:
# Création d'un objet CountVectorizer avec le vocabulaire spécifié
count_vect = CountVectorizer(binary=True, vocabulary=v)

# Transformation des e-mails en une matrice binaire d'occurrences de mots
x_bin = count_vect.transform(_X).toarray()

# Affichage de la matrice binaire d'occurrences de mots
print(x_bin)

[[0 1 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 ...
 [0 1 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 1 1 ... 1 1 1]]


spliting


In [182]:
x_bin_train, x_bin_test, y_bin_train,y_bin_test  = train_test_split(x_bin, y, random_state=0, test_size = 0.3)

In [183]:
x_bin_train.shape

(4268, 4631)

## Représentation des caractéristiques par comptage

In [174]:
# Création d'un objet CountVectorizer avec le vocabulaire spécifié
count_vect = CountVectorizer(binary=False, vocabulary=v)

# Transformation des e-mails en une matrice  d'occurrences de mots
x_comp = count_vect.transform(_X).toarray()

# Affichage de la matrice  d'occurrences de mots
print(x_comp)

[[  0  47   1 ...   0   0   0]
 [  0  40  12 ...   0   0   0]
 [  0  18   2 ...   0   0   0]
 ...
 [  0   2   7 ...   0   0   0]
 [  0   2   6 ...   0   0   0]
 [  0 107  16 ...   7   7   9]]


In [175]:
'''def is_matrix_null(matrix):
    for row in matrix:
        for element in row:
            if element != 0:
                return False
    return True
    
is_matrix_null(x_comp)'''

'def is_matrix_null(matrix):\n    for row in matrix:\n        for element in row:\n            if element != 0:\n                return False\n    return True\n    \nis_matrix_null(x_comp)'

In [176]:
x_comp.shape

(6098, 4631)

In [177]:
x_comp_train, x_comp_test, y_comp_train,y_comp_test  = train_test_split(x_comp, y, random_state=0, test_size = 0.3)

# Classification



## modele 1: SVM

### Représentation binaire des caractéristiques 

### Représentation  des caractéristiques  par comptage

## modele 2: RNN

### Représentation binaire des caractéristiques 

### Représentation  des caractéristiques  par comptage