In [15]:
# data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

# Preprocessing
from imblearn.under_sampling import RandomUnderSampler #conda install conda-forge::imbalanced-learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from imblearn.under_sampling import RandomUnderSampler 

# Pipeline and model
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB

# Score of models
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score,f1_score, recall_score, balanced_accuracy_score, make_scorer, classification_report , roc_auc_score, ConfusionMatrixDisplay, classification_report,precision_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RED94\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RED94\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
#création d'une blacklist de mots de spam 
dfblacklist = pd.read_csv('spam_words.txt', header=None, on_bad_lines='skip' )
dfblacklist.rename(columns={0:'words'}, inplace=True)
dfblacklistList = dfblacklist['words'].tolist()

In [17]:
#fonction de mise en place du dataset 
def cree_df(url = "SMSSpamCollection.txt"):
    df = pd.read_csv(url, sep='\t', header=None )
    df.rename(columns={0:'type',1:'mail'}, inplace=True)
    return df

In [18]:
#fonction de préparation des données (preprocessing)
def prep(df): 

    #lowercase des message
    df['minuscule']=df['mail'].str.lower()

    #mise en place des tokens des message
    tokenizer = RegexpTokenizer(r"\b\w+\b|\d{2} \d{2} \d{2} \d{2} \d{2}")
    df['token'] = df['minuscule'].apply(lambda x: tokenizer.tokenize(x))

    #ajout d'un stopwords 
    stop = stopwords.words('english')
    df['without_stopwords']=df['token'].apply(lambda x: [word for word in x if word not in stop])

    #ajout d'un stemmer
    stemmer = PorterStemmer()
    df['PorterStemmer'] = df['without_stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])
    
    #regroupement du traitement des données
    df['clean'] = df['without_stopwords'].apply(lambda x: " ".join(x))
    return df

In [19]:
#fonction d'ajout des features afin d'améliorer la précision des modèles
def features(df):
    
    #ajout d'une feature "longueur du message"
    df['len']=df['mail'].str.len()

    #ajout d'une feature "nombre de mots"
# df['nombre_mots']=df['mail'].str.split().str.len()
    df['nombre_mots']=df['token'].str.len()

    #ajout d'une feature permettant de vérifier si présence d'hypertexte
    pattern = r"http\S+|www.\S+"
    df['http']=df['mail'].apply(lambda x : True if re.search(pattern, x) else False)

    #ajout d'une feature permettant de vérifier la présence de chiffre 
    pattern = r"/^[\(]?[\+]?(\d{2}|\d{3})[\)]?[\s]?((\d{6}|\d{8})|(\d{3}[\*\.\-\s]){3}|(\d{2}[\*\.\-\s]){4}|(\d{4}[\*\.\-\s]){2})|\d{8}|\d{10}|\d{12}$/"
    df['phone']=df['mail'].apply(lambda x : True if re.search(pattern, x) else False)
    
    #ajout d'une feature permettant de vérifier la présence de mail
    #pattern = r"[-A-Za-z0-9!#$%&'*+/=?^_`{|}~]+(?:\.[-A-Za-z0-9!#$%&'*+/=?^_`{|}~]+)*@(?:[A-Za-z0-9](?:[-A-Za-z0-9]*[A-Za-z0-9])?\.)+[A-Za-z0-9](?:[-A-Za-z0-9]*[A-Za-z0-9])?"
    #df['mail_compt']=df['mail'].apply(lambda x = 'True' : re.findall(pattern, x) else x = 'False')

    #ajout d'une feature permettant de vérifier la présence de mots blacklisté 
    df['blacklist']=df['token'].apply(lambda x: len([ word for word in x if word  in dfblacklistList]))
    return df

In [20]:
#fonction permettant la spéaration de la target des autres colonnes (création X et y)

def spliteur(df):
    X = df.drop(columns = ['type'], axis=1)
    y = df['type']
    rus = RandomUnderSampler(random_state=42)
    X_res, y_res = rus.fit_resample(X, y)
    return train_test_split(X_res, y_res, stratify=y_res, test_size=0.2, random_state=42)

In [21]:
dfModel = cree_df("SMSSpamCollection.txt")
dfModel = prep(dfModel)
dfModel = features(dfModel)
X_train, X_test, y_train, y_test = spliteur(dfModel)

In [22]:
class ClfSwitcher(BaseEstimator):

    def __init__(self, estimator = MultinomialNB()):

        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self


    def predict(self, X, y=None):
        return self.estimator.predict(X)


    def predict_proba(self, X):
        return self.estimator.predict_proba(X)


    def score(self, X, y):
        return self.estimator.score(X, y)

In [23]:
#fonction création du modèle 
def ModelCreateur(X_train,y_train,X_test,score,param1 = None,param2=None,param3=None):
    column_num  = ['len','nombre_mots','blacklist']
    column_bool = ['http','phone']
    
    #Transformation des variables texte
    transfo_text_TFid = Pipeline(steps=[
        ('Tfid', TfidfVectorizer(lowercase=False, decode_error='ignore', analyzer='char_wb', ngram_range=(2, 2)))
        
    ])

#Application des étapes sur tout le dataset
    if ClfSwitcher() == "ComplementNB()" or "MultinomialNB()":
        preparation = ColumnTransformer(
        transformers=[
        ('TFid&data', transfo_text_TFid , 'clean'), #TFIDF ne prend pas de listes comme arguments
            ('Scaler&data',MinMaxScaler(), column_num),
            ('BoolEncoder',OrdinalEncoder(), column_bool)
        ])
    else : 
        preparation = ColumnTransformer(
        transformers=[
        ('TFid&data', transfo_text_TFid , 'clean'), #TFIDF ne prend pas de listes comme arguments
            ('Scaler&data',RobustScaler(), column_num),
            ('BoolEncoder',OrdinalEncoder(), column_bool)
        ])
    
    #relie l'algorithme avec le modèle
    pipe = Pipeline([
    ('vectorizer', preparation),
    ('clf', ClfSwitcher()),
    ])


    parameters = [
    {
        'clf__estimator': [LogisticRegression()], # SVM if hinge loss / logreg if log loss
        'clf__estimator__penalty': ['l2'],
        'clf__estimator__tol': [0.0001],
        'clf__estimator__C': [0.1],
    },
    {
        'clf__estimator': [KNeighborsClassifier()],
        'clf__estimator__n_neighbors': [3,7],
        'clf__estimator__weights': ['uniform'],
    },
      {
        'clf__estimator': [ComplementNB()],
        'clf__estimator__alpha': [10],
        'clf__estimator__alpha': [10],
    },
      {
        'clf__estimator': [MultinomialNB()],
        'clf__estimator__alpha': [0.0,1.0,10],
    },
    {
        'clf__estimator': [BernoulliNB()],
        'clf__estimator__alpha': [0.0,1.0,10],
    },
     {
        'clf__estimator': [SVC()],
        'clf__estimator__kernel': ['rbf'],
        'clf__estimator__C': [10],
        'clf__estimator__degree':[3],
    },
     {
        'clf__estimator': [RidgeClassifier()],
        'clf__estimator__alpha': [0.1,1.0,10],
        'clf__estimator__max_iter': [None],
        'clf__estimator__tol': [0.0001],
    },
     {
        'clf__estimator': [RandomForestClassifier()],
        'clf__estimator__n_estimators': [200,300],
        'clf__estimator__max_depth': [10,30],
    },
      {
        'clf__estimator': [DecisionTreeClassifier()],
        'clf__estimator__max_depth': [10,30],
    },

    ]



    gscv  = GridSearchCV(pipe, parameters, cv=5, verbose=1 ,scoring = score  , n_jobs= -1)
    
    Bow= gscv.fit(X_train, y_train)
    """y_pred = gscv.predict(X_test)
    #print(y_pred)
    print(gscv.best_score_)
    print(gscv.best_params_)
    #print(classification_report(y_test, y_pred))"""
    return pd.DataFrame(Bow.cv_results_)

In [24]:
precision_scoring       = make_scorer(precision_score, pos_label='spam')
accuracy_scoring        = make_scorer(accuracy_score,  pos_label='spam')
f1_scoring              = make_scorer(f1_score, pos_label='spam')
recall_scoring          = make_scorer(recall_score,  pos_label='spam')

df_precision_scoring   = ModelCreateur(X_train, y_train,X_test,precision_scoring)
df_accuracy_scoring    = ModelCreateur(X_train, y_train,X_test,'accuracy')
df_f1_scoring          = ModelCreateur(X_train, y_train,X_test,f1_scoring)
df_recall_scoring      = ModelCreateur(X_train, y_train,X_test,recall_scoring)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits


 nan nan]


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [41]:
df_recall_scoring

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__estimator,param_clf__estimator__C,param_clf__estimator__penalty,param_clf__estimator__tol,param_clf__estimator__n_neighbors,param_clf__estimator__weights,...,param_clf__estimator__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.067565,0.013412,0.018495,0.00669,LogisticRegression(),0.1,l2,0.0001,,,...,,"{'clf__estimator': LogisticRegression(), 'clf_...",0.781818,0.870968,0.776,0.815126,0.833333,0.815449,0.034899,20
1,0.066095,0.011421,1.088862,0.093042,KNeighborsClassifier(),,,,3.0,uniform,...,,"{'clf__estimator': KNeighborsClassifier(), 'cl...",0.918182,0.975806,0.936,0.957983,0.958333,0.949261,0.020024,9
2,0.078058,0.004223,0.995519,0.011767,KNeighborsClassifier(),,,,7.0,uniform,...,,"{'clf__estimator': KNeighborsClassifier(), 'cl...",0.954545,0.967742,0.96,0.932773,0.958333,0.954679,0.011766,6
3,0.074749,0.003027,0.034553,0.003919,ComplementNB(alpha=10),,,,,,...,,"{'clf__estimator': ComplementNB(alpha=10), 'cl...",0.945455,0.975806,0.976,0.957983,0.966667,0.964382,0.011575,1
4,0.080726,0.006238,0.028732,0.00839,MultinomialNB(),,,,,,...,,"{'clf__estimator': MultinomialNB(), 'clf__esti...",0.963636,0.959677,0.976,0.957983,0.941667,0.959793,0.011039,5
5,0.089495,0.007454,0.034076,0.013735,MultinomialNB(),,,,,,...,,"{'clf__estimator': MultinomialNB(), 'clf__esti...",0.854545,0.927419,0.904,0.890756,0.891667,0.893678,0.023618,17
6,0.088337,0.010429,0.031672,0.011884,MultinomialNB(),,,,,,...,,"{'clf__estimator': MultinomialNB(), 'clf__esti...",0.945455,0.975806,0.976,0.957983,0.966667,0.964382,0.011575,1
7,0.084834,0.00595,0.028678,0.00871,BernoulliNB(),,,,,,...,,"{'clf__estimator': BernoulliNB(), 'clf__estima...",0.954545,0.975806,0.976,0.94958,0.95,0.961186,0.012142,4
8,0.101139,0.019347,0.028866,0.002931,BernoulliNB(),,,,,,...,,"{'clf__estimator': BernoulliNB(), 'clf__estima...",0.936364,0.951613,0.936,0.907563,0.941667,0.934641,0.014666,14
9,0.118423,0.014717,0.036388,0.011485,BernoulliNB(),,,,,,...,,"{'clf__estimator': BernoulliNB(), 'clf__estima...",0.881818,0.919355,0.88,0.857143,0.908333,0.88933,0.0221,19


In [26]:
#ModelCreateurV2(X_train, y_train,X_test,y_test)

In [27]:
"""def ModelPrep(pipe,model,param1 = None,param2=None,param3=None):
    parameters = { 'LogisticRegression()' : {'model__penalty': param1  if param1 else ['l2'] , 'model__tol': param2 if param2 else [0.0001], 'model__C': param3 if param3 else[1.0]},
                   'KNeighborsClassifier()' : {'model__n_neighbors': param1 if param1 else 5 , 'model__weights': param2 if param2 else 'uniform'},
                   'ComplementNB()': {'model__alpha': param1  if param1 else 1.0},
                   'SVC()' : {'model__kernel': param1  if param1 else 'rbf', 'model__C': param2 if param2 else [1.0], 'model__degree': param3 if param3 else 3},
                   'RidgeClassifier()' : {'model__alpha': param1  if param1 else 1.0, 'model__max_iter': param2 if param2 else None, 'model__tol': param3 if param3 else [0.0001]},
                   'RandomForestClassifier()' : {'model__n_estimators': param1  if param1 else 100, 'model__max_depth': param2 if param2 else None},
                   'DecisionTreeClassifier()' : {'model__max_depth': param1 if param1 else None}
                    }
        


    grid = GridSearchCV(pipe, parameters[str(model)], cv = 5, n_jobs =-1, verbose = 1)
    grid.fit(X_train, y_train)
    grid.best_score_
    grid.best_params_"""

"def ModelPrep(pipe,model,param1 = None,param2=None,param3=None):\n    parameters = { 'LogisticRegression()' : {'model__penalty': param1  if param1 else ['l2'] , 'model__tol': param2 if param2 else [0.0001], 'model__C': param3 if param3 else[1.0]},\n                   'KNeighborsClassifier()' : {'model__n_neighbors': param1 if param1 else 5 , 'model__weights': param2 if param2 else 'uniform'},\n                   'ComplementNB()': {'model__alpha': param1  if param1 else 1.0},\n                   'SVC()' : {'model__kernel': param1  if param1 else 'rbf', 'model__C': param2 if param2 else [1.0], 'model__degree': param3 if param3 else 3},\n                   'RidgeClassifier()' : {'model__alpha': param1  if param1 else 1.0, 'model__max_iter': param2 if param2 else None, 'model__tol': param3 if param3 else [0.0001]},\n                   'RandomForestClassifier()' : {'model__n_estimators': param1  if param1 else 100, 'model__max_depth': param2 if param2 else None},\n                   'Decis

In [28]:
#fonction permettant de connaître le score de notre modèle
def AfficherScores(y_test, y_pred,model,x):
    
    #affiche l'accuracy du modèle
    print("Accuracy:", accuracy_score(y_test, y_pred))
    #affiche la classification report du modèle
    print(classification_report(y_test, y_pred))
    #affiche les graphiques du modèle
    """plt.hist(model.decision_function(x), bins=50)
    plt.show()"""

In [29]:
# fonction qui affiche la matrice de confusion du modèle
def matrixconf(y_test,y_pred):
    #affiche la matrice de confusion du modèle
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

In [30]:
#fonction permettant de tester le modèle
def testModel(sms,model):
    #mise en place d'un input
    input_sms     = [sms]
    #transformation en dataframe afin de l'utiliser correctement
    df_sms        = pd.DataFrame(input_sms)
    df_sms.rename(columns={0:'mail'}, inplace=True)
    #mise en place du preprocessing
    df_sms        = prep(df_sms)
    #mise en place des feature
    df_sms        = features(df_sms)
    #prediction 
    result = model.predict(df_sms)
    return result

In [31]:
#dfModel[['type', 'phone']].boxplot(column='phone', by='type')
#dfModel[['type', 'http']].boxplot(column='http', by='type')
#dfModel[['type', 'nombre_mots']].boxplot(column='nombre_mots', by='type')
#dfModel[['type', 'len']].boxplot(column='len', by='type')
#dfModel[['type', 'blacklist']].boxplot(column='blacklist', by='type')


In [32]:

#pivot_table = dfModel[['type', 'phone']].pivot_table(index='type', columns='phone', aggfunc=len, fill_value=0)
#pivot_table.plot(kind='bar', stacked=True)

#plt.title('Nombre de Numéro de téléphone dans les SMS')
#plt.xlabel('SMS')
#plt.ylabel('Nombre')
#plt.show()

In [33]:
"""pivot_table = dfModel[['type', 'http']].pivot_table(index='type', columns='http', aggfunc=len, fill_value=0)
pivot_table.plot(kind='bar', stacked=True)

plt.title('Nombre de liens hypertextes dans les SMS')
plt.xlabel('SMS')
plt.ylabel('Nombre')
plt.show()"""

"pivot_table = dfModel[['type', 'http']].pivot_table(index='type', columns='http', aggfunc=len, fill_value=0)\npivot_table.plot(kind='bar', stacked=True)\n\nplt.title('Nombre de liens hypertextes dans les SMS')\nplt.xlabel('SMS')\nplt.ylabel('Nombre')\nplt.show()"

In [34]:
#mise en place des modèles
classifier1 = LogisticRegression()
classifier2 = KNeighborsClassifier()

classifier3 = ComplementNB()                                        
classifier4 = MultinomialNB()                                     
classifier5 = BernoulliNB()

classifier6 = SVC()                                                       

classifier7 = RidgeClassifier()          
classifier8 = RandomForestClassifier() 
classifier9 = DecisionTreeClassifier()                             

list_model = [classifier1,classifier2,classifier3,classifier4,classifier5,classifier6,classifier7,classifier8,classifier9]

In [35]:
"""for i in list_model:
    model_lm=ModelCreateur(X_train, y_train, i)
    ModelPrep(model_lm,i)
    print(i,':',testModel("Low-cost 08707509020",model_lm))
    # print('model utilisé:', i)
    y_pred = model_lm.predict(X_test)
    matrixconf(y_test,y_pred)
    AfficherScores(y_test, y_pred,model_lm,X_test)
    model_disp = RocCurveDisplay.from_estimator(model_lm,X_test,y_test)"""

'for i in list_model:\n    model_lm=ModelCreateur(X_train, y_train, i)\n    ModelPrep(model_lm,i)\n    print(i,\':\',testModel("Low-cost 08707509020",model_lm))\n    # print(\'model utilisé:\', i)\n    y_pred = model_lm.predict(X_test)\n    matrixconf(y_test,y_pred)\n    AfficherScores(y_test, y_pred,model_lm,X_test)\n    model_disp = RocCurveDisplay.from_estimator(model_lm,X_test,y_test)'