# TP 3: Classification mails ham vs spam

In [1]:
#importation des librairies nécessaires
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

## 1. Chargement et visualisation des données

In [2]:
mails = pd.read_csv(r'spam_ham_dataset.xls')

In [3]:
mails.head(8)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
5,2949,ham,Subject: ehronline web address change\r\nthis ...,0
6,2793,ham,Subject: spring savings certificate - take 30 ...,0
7,4185,spam,Subject: looking for medication ? we ` re the ...,1


In [4]:
mails.describe()

Unnamed: 0.1,Unnamed: 0,label_num
count,5171.0,5171.0
mean,2585.0,0.289886
std,1492.883452,0.453753
min,0.0,0.0
25%,1292.5,0.0
50%,2585.0,0.0
75%,3877.5,1.0
max,5170.0,1.0


## 2. Nettoyage des données

In [5]:
#vérification des valeurs manquantes
mails.isna().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [6]:
#suppression de la première colonne
mails.drop("Unnamed: 0", axis=1, inplace=True)

mails

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,ham,Subject: industrial worksheets for august 2000...,0


## 3. Prétraitement

In [7]:
vectorizer = CountVectorizer(stop_words='english', analyzer='word')

#récupération des valeurs de X et Y au niveau du dataframe
X = mails.loc[:, 'text']
y = mails.loc[:, 'label_num']

#découpage du jeu de données en trainning set et testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=10)

#construction du vocabulaire; permettra de quitter d'un mot à un réel
vectorizer.fit(X_train)

#affichage du vocabulaire
#print(vectorizer.get_feature_names())

#longueur du vocabulaire
print(len(vectorizer.get_feature_names()))

#permet de transformer les mots en réels
#A chaque mail on associe le vecteur de réels correspondant
X_train_2 = vectorizer.transform(X_train)
X_train_2 = X_train_2.toarray()

X_test_2 = vectorizer.transform(X_test)
X_test_2 = X_test_2.toarray()

print(X_train_2)

print(X_train.shape)

44268
[[0 3 0 ... 0 0 0]
 [3 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(4136,)


## 4. Model

In [None]:
params={'C':[1, 2, 3], 'penalty':['l2'], 'solver':['liblinear', 'lbfgs']}

lr = LogisticRegression()

#permet de choisir les meilleurs paramètres à utiliser pour une régression logistique
clf = GridSearchCV(lr, params, cv=4)

clf.fit(X_train_2, y_train)

#il en ressort que les meilleurs paramètres sont C:1, penalty:l2, et solver: liblinear

In [8]:
#construction du modèle de régression linéaire avec les meilleurs paramètres trouvés
lr = LogisticRegression(C=1, penalty='l2', solver='liblinear')

#entraînement du modèle
lr.fit(X_train_2, y_train)

LogisticRegression(C=1, solver='liblinear')

## 5. Test

In [9]:
#test du modèle sur les données de test
print(lr.score(X_test_2, y_test))

0.9797101449275363


## 6. Deploiement

In [24]:
#fonction permettant de prédire si un mail est un ham ou un spam en indiquant le pourcentage de fiabilité

def prediction(cv:CountVectorizer,lr: LogisticRegression, mail):
    mail_2 = cv.transform([mail]).toarray()
    
    p = lr.predict_proba(mail_2.reshape(1, -1))[0]
    
    print("Ce mail est un ham à ",p[1],"%")
    print("Ce mail est un spam à",p[0],"%")

In [25]:
#test du déploiement
mail1 = '''Dear professor Alice, I'm writting to follow up on my email I sent earlier this week regarding a question I have about the topic assigment and exam. I look forward to hearing from you. Best regards, Michael Kumar'''

prediction(vectorizer, lr, mail1)

Ce mail est un ham à  0.6753793532492377 %
Ce mail est un spam à 0.3246206467507623 %


In [26]:
mail2='''We will give you $1,000 for sending an e-mail to your freinds. AB Mailing, Inc. is proud to anounce the start of a new contest. Each day until January, 31 1999, on lucky internet or AOL user whor forwards our advertisement to their friends will be randomly picked to receive $1,000! You could be the winner! Thank you for your time.'''
prediction(vectorizer, lr, mail2)

Ce mail est un ham à  0.19377426056711464 %
Ce mail est un spam à 0.8062257394328853 %
