In [4]:
import numpy as np
import pandas as pandas
from keras.utils.np_utils import to_categorical
from keras.datasets import mnist
from keras.models import Sequential
from keras import layers
import math

from numpy import loadtxt

# récupération des données
dataset = pandas.read_csv('Uses_Cases/Spam/Spam detection - For model creation.csv', delimiter=';')
dataset2 = loadtxt('Uses_Cases/Spam/Spam detection - For prediction.csv', delimiter=',',skiprows=1)
dataset3 = pandas.read_csv('Uses_Cases/Spam/Spam detection - For prediction.csv', delimiter=',')
train_labels = dataset['GOAL-Spam'].replace("No",0).replace("Yes",1).to_numpy()
train_images = dataset.drop(columns=['GOAL-Spam']).to_numpy()

test_images = dataset2[:,:-1]
test_labels = dataset2[:,-1]

# préparation des données : « aplatir » données en vecteurs de 57*57 et normaliser
train_images = train_images.reshape(len(train_images), 57)
train_images = train_images.astype('float32')/255
test_images = test_images.reshape(len(test_images), 57)
test_images = test_images.astype('float32')/255
# coder les labels dans le format one-hot
train_label = to_categorical(train_labels,2)# 2 est le nombre de classes
test_label = to_categorical(test_labels,2)

In [11]:
## MODELE 1
# architecture du réseau: sans couche cachée de 512 neurones et sans dropout.
network = Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(57,)))
network.add(layers.Dense(2,activation='softmax'))

# compilation du modèle de classification multiclasse
'binary_crossentropy'
network.compile(optimizer='rmsprop', loss='categorical_crossentropy',
metrics=['accuracy'])

# entrainement du réseau : appel de la fonction fit
network.fit(train_images, train_label, epochs=5, batch_size=128, verbose = 1,
validation_data=(test_images,test_label))
# évaluation du modèle
test_loss, test_acc = network.evaluate(test_images, test_label)
# affichage du résultat
print ('Acccuracy = ', test_acc)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Acccuracy =  0.7268446087837219


In [12]:
## MODELE 2
# architecture du réseau: de 512 neurones avec une couche cachée de 258 neurones
network = Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(57,)))
network.add(layers.Dense(258, activation='relu'))
network.add(layers.Dense(2,activation='softmax'))

network.compile(optimizer='rmsprop', loss='categorical_crossentropy',
metrics=['accuracy'])

network.fit(train_images, train_label, epochs=5, batch_size=128, verbose = 1,
validation_data=(test_images,test_label))
test_loss, test_acc = network.evaluate(test_images, test_label)
print ('Acccuracy = ', test_acc)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Acccuracy =  0.8178963661193848


In [2]:
## MODELE 3
# initialisation des variables les plus importantes
nblignes = len(dataset)
counts = dataset["GOAL-Spam"].value_counts()
nbYes = counts["Yes"]
nbNo = counts["No"]
pYes = nbYes/nblignes
pNo = nbNo/nblignes

# calcul des espérances et variances
outLabels =  ["Yes","No"]
labelsValues = {}
for r in outLabels:
    columns = dataset.columns.to_list()[1:]
    tmp = dataset.where(dataset["GOAL-Spam"]== r)
    labelsValues[r] = {}
    for c in columns:
        µ = 0
        for x in tmp[c]:
            µ+= x if not pandas.isnull(x) else 0
        µ/= counts[r]
        ss = 0
        for x in tmp[c]:
            ss += (x if not pandas.isnull(x) else 0-µ)**2
        ss/= counts[r]-1
        s = ss**0.5
        if s==0: s = 0.0000000000000000001
        labelsValues[r][c] = {"µ": µ,"s": s}

In [3]:
# utilisation des espérances et variances dans la loi normale et calcul des probabilités conditionnelles
result = [0]*len(dataset3)
for j in range(len(dataset3)):
    val = dataset3.iloc[j]
    dic = {}
    for r in ["Yes","No"]:
        columns = dataset3.columns.to_list()[:-1]
        fxs = [1]*(len(columns))
        for i,c in enumerate(columns):
            y = val[c]
            µ = labelsValues[r][c]["µ"]
            s = labelsValues[r][c]["s"]
            fxs[i] = (1/(s*(2*math.pi)**0.5)) * math.exp(-0.5  *  ((y-µ)/s)**2)
        p = 1
        for x in fxs:
            p*=x
        dic[r] = p
    tmp = [0,0]
    for x in dic:
        if dic[x] > tmp[1]:
            tmp[1] = dic[x]
            tmp[0] = x
    result[j] = tmp[0]

In [30]:
# évaluation du modèle et affichage du résultat
tmp = dataset3["Spam"].replace(0,"No").replace(1,"Yes")
x = 0
for i,r in enumerate(result):
    if r==tmp[i]:
        x+=1
x/=len(tmp)
print(x)

0.7660910518053375
