In [177]:
# data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

# Preprocessing
from imblearn.under_sampling import RandomUnderSampler #conda install conda-forge::imbalanced-learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from imblearn.under_sampling import RandomUnderSampler 

# Pipeline and model
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB

# Score of models
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score,f1_score, recall_score, balanced_accuracy_score, make_scorer, classification_report , roc_auc_score, ConfusionMatrixDisplay, classification_report,precision_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RED94\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RED94\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [178]:
#création d'une blacklist de mots de spam 
dfblacklist = pd.read_csv('spam_words.txt', header=None, on_bad_lines='skip' )
dfblacklist.rename(columns={0:'words'}, inplace=True)
dfblacklistList = dfblacklist['words'].tolist()

In [179]:
#fonction de mise en place du dataset 
def cree_df(url = "SMSSpamCollection.txt"):
    df = pd.read_csv(url, sep='\t', header=None )
    df.rename(columns={0:'type',1:'mail'}, inplace=True)
    return df

In [180]:
#fonction de préparation des données (preprocessing)
def prep(df): 

    #lowercase des message
    df['minuscule']=df['mail'].str.lower()

    #mise en place des tokens des message
    tokenizer = RegexpTokenizer(r"\b\w+\b|\d{2} \d{2} \d{2} \d{2} \d{2}")
    df['token'] = df['minuscule'].apply(lambda x: tokenizer.tokenize(x))

    #ajout d'un stopwords 
    stop = stopwords.words('english')
    df['without_stopwords']=df['token'].apply(lambda x: [word for word in x if word not in stop])

    #ajout d'un stemmer
    stemmer = PorterStemmer()
    df['PorterStemmer'] = df['without_stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])
    
    #regroupement du traitement des données
    df['clean'] = df['without_stopwords'].apply(lambda x: " ".join(x))
    return df

In [181]:
#fonction d'ajout des features afin d'améliorer la précision des modèles
def features(df):
    
    #ajout d'une feature "longueur du message"
    df['len']=df['mail'].str.len()

    #ajout d'une feature "nombre de mots"
# df['nombre_mots']=df['mail'].str.split().str.len()
    df['nombre_mots']=df['token'].str.len()

    #ajout d'une feature permettant de vérifier si présence d'hypertexte
    pattern = r"http\S+|www.\S+"
    df['http']=df['mail'].apply(lambda x : True if re.search(pattern, x) else False)

    #ajout d'une feature permettant de vérifier la présence de chiffre 
    pattern = r"/^[\(]?[\+]?(\d{2}|\d{3})[\)]?[\s]?((\d{6}|\d{8})|(\d{3}[\*\.\-\s]){3}|(\d{2}[\*\.\-\s]){4}|(\d{4}[\*\.\-\s]){2})|\d{8}|\d{10}|\d{12}$/"
    df['phone']=df['mail'].apply(lambda x : True if re.search(pattern, x) else False)
    
    #ajout d'une feature permettant de vérifier la présence de mail
    #pattern = r"[-A-Za-z0-9!#$%&'*+/=?^_`{|}~]+(?:\.[-A-Za-z0-9!#$%&'*+/=?^_`{|}~]+)*@(?:[A-Za-z0-9](?:[-A-Za-z0-9]*[A-Za-z0-9])?\.)+[A-Za-z0-9](?:[-A-Za-z0-9]*[A-Za-z0-9])?"
    #df['mail_compt']=df['mail'].apply(lambda x = 'True' : re.findall(pattern, x) else x = 'False')

    #ajout d'une feature permettant de vérifier la présence de mots blacklisté 
    df['blacklist']=df['token'].apply(lambda x: len([ word for word in x if word  in dfblacklistList]))
    return df

In [182]:
#fonction permettant la spéaration de la target des autres colonnes (création X et y)

def spliteur(df):
    X = df.drop(columns = ['type'], axis=1)
    y = df['type']
    rus = RandomUnderSampler(random_state=42)
    X_res, y_res = rus.fit_resample(X, y)
    return train_test_split(X_res, y_res, stratify=y_res, test_size=0.2, random_state=42)

In [183]:
dfModel = cree_df("SMSSpamCollection.txt")
dfModel = prep(dfModel)
dfModel = features(dfModel)
X_train, X_test, y_train, y_test = spliteur(dfModel)

In [184]:
class ClfSwitcher(BaseEstimator):

    def __init__(self, estimator = MultinomialNB()):

        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self


    def predict(self, X, y=None):
        return self.estimator.predict(X)


    def predict_proba(self, X):
        return self.estimator.predict_proba(X)


    def score(self, X, y):
        return self.estimator.score(X, y)

In [185]:
#fonction création du modèle 
def ModelCreateur(X_train,y_train,X_test,param1 = None,param2=None,param3=None):
    column_num  = ['len','nombre_mots','blacklist']
    column_bool = ['http','phone']
    
    #Transformation des variables texte
    transfo_text_TFid = Pipeline(steps=[
        ('Tfid', TfidfVectorizer(lowercase=False, decode_error='ignore', analyzer='char_wb', ngram_range=(2, 2)))
        
    ])

#Application des étapes sur tout le dataset
    if ClfSwitcher() == "ComplementNB()" or "MultinomialNB()":
        preparation = ColumnTransformer(
        transformers=[
        ('TFid&data', transfo_text_TFid , 'clean'), #TFIDF ne prend pas de listes comme arguments
            ('Scaler&data',MinMaxScaler(), column_num),
            ('BoolEncoder',OrdinalEncoder(), column_bool)
        ])
    else : 
        preparation = ColumnTransformer(
        transformers=[
        ('TFid&data', transfo_text_TFid , 'clean'), #TFIDF ne prend pas de listes comme arguments
            ('Scaler&data',RobustScaler(), column_num),
            ('BoolEncoder',OrdinalEncoder(), column_bool)
        ])
    
    #relie l'algorithme avec le modèle
    pipe = Pipeline([
    ('vectorizer', preparation),
    ('clf', ClfSwitcher()),
    ])


    parameters = [
    {
        'clf__estimator': [LogisticRegression()], # SVM if hinge loss / logreg if log loss
        'clf__estimator__penalty': ['l2'],
        'clf__estimator__tol': [0.0001],
        'clf__estimator__C': [0.1],
    },
    {
        'clf__estimator': [KNeighborsClassifier()],
        'clf__estimator__n_neighbors': [3,7],
        'clf__estimator__weights': ['uniform'],
    },
      {
        'clf__estimator': [ComplementNB()],
        'clf__estimator__alpha': [10],
        'clf__estimator__alpha': [10],
    },
      {
        'clf__estimator': [MultinomialNB()],
        'clf__estimator__alpha': [0.0,1.0,10],
    },
    {
        'clf__estimator': [BernoulliNB()],
        'clf__estimator__alpha': [0.0,1.0,10],
    },
     {
        'clf__estimator': [SVC()],
        'clf__estimator__kernel': ['rbf'],
        'clf__estimator__C': [10],
        'clf__estimator__degree':[3],
    },
     {
        'clf__estimator': [RidgeClassifier()],
        'clf__estimator__alpha': [0.1,1.0,10],
        'clf__estimator__max_iter': [None],
        'clf__estimator__tol': [0.0001],
    },
     {
        'clf__estimator': [RandomForestClassifier()],
        'clf__estimator__n_estimators': [200,300],
        'clf__estimator__max_depth': [10,30],
    },
      {
        'clf__estimator': [DecisionTreeClassifier()],
        'clf__estimator__max_depth': [10,30],
    },

    ]


    label_score  = make_scorer(precision_score, pos_label='spam')
    Scoring_list = {'Myrecall':label_score, 'Accuracy':'precision', 'roc':'roc_auc',}
    scoring = {'accuracy' : make_scorer(accuracy_score,  pos_label='spam'), 
       'recall' : make_scorer(recall_score, pos_label='spam' ), 
       'f1_score' : make_scorer(f1_score, pos_label='spam')}

    #custom_score = make_scorer(fbeta_score,pos_label="spam")
    gscv = GridSearchCV(pipe, parameters, cv=5, verbose=1 ,scoring = scoring , refit= label_score , n_jobs= -1)
    Bow = gscv.fit(X_train, y_train)
    y_pred = gscv.predict(X_test)
    #print(y_pred)
    print(gscv.best_score_)
    print(gscv.best_params_)
    #print(classification_report(y_test, y_pred))
    return pd.DataFrame(Bow.cv_results_)

In [186]:
dfClassifier = ModelCreateur(X_train, y_train,X_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 nan nan]


TypeError: _BaseScorer.__call__() missing 2 required positional arguments: 'X' and 'y_true'

In [None]:
dfClassifier

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__estimator,param_clf__estimator__C,param_clf__estimator__penalty,param_clf__estimator__tol,param_clf__estimator__n_neighbors,param_clf__estimator__weights,...,param_clf__estimator__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.063862,0.006536,0.0209,0.003244,LogisticRegression(),0.1,l2,0.0001,,,...,,"{'clf__estimator': LogisticRegression(), 'clf_...",0.988506,0.981818,1.0,1.0,0.990099,0.992085,0.007035,3
1,0.056815,0.002525,1.083421,0.085315,KNeighborsClassifier(),,,,3.0,uniform,...,,"{'clf__estimator': KNeighborsClassifier(), 'cl...",0.971154,0.98374,0.959016,0.95,0.934959,0.959774,0.016827,15
2,0.074717,0.006382,1.104941,0.092073,KNeighborsClassifier(),,,,7.0,uniform,...,,"{'clf__estimator': KNeighborsClassifier(), 'cl...",0.990566,0.991736,0.991736,0.956897,0.966387,0.979464,0.014864,10
3,0.107641,0.021361,0.032168,0.005117,ComplementNB(),,,,,,...,,"{'clf__estimator': ComplementNB(), 'clf__estim...",0.962963,0.930769,0.976,0.897638,0.95082,0.943638,0.027401,17
4,0.100612,0.015037,0.031465,0.005871,MultinomialNB(),,,,,,...,,"{'clf__estimator': MultinomialNB(), 'clf__esti...",0.876033,0.959677,0.897059,0.876923,0.933884,0.908715,0.033001,20
5,0.105538,0.024837,0.031581,0.004303,MultinomialNB(),,,,,,...,,"{'clf__estimator': MultinomialNB(), 'clf__esti...",0.979167,0.991379,1.0,0.954955,0.981651,0.98143,0.015165,7
6,0.099137,0.017529,0.033184,0.004536,MultinomialNB(),,,,,,...,,"{'clf__estimator': MultinomialNB(), 'clf__esti...",0.945455,0.930769,0.976,0.897638,0.95082,0.940136,0.025778,19
7,0.090036,0.009193,0.030779,0.003415,BernoulliNB(),,,,,,...,,"{'clf__estimator': BernoulliNB(), 'clf__estima...",0.9375,0.968,0.924242,0.933884,0.95,0.942725,0.015092,18
8,0.117224,0.021274,0.035607,0.007587,BernoulliNB(),,,,,,...,,"{'clf__estimator': BernoulliNB(), 'clf__estima...",0.980952,0.983333,0.991525,0.990826,0.933884,0.976104,0.021507,12
9,0.093592,0.010307,0.036898,0.009033,BernoulliNB(),,,,,,...,,"{'clf__estimator': BernoulliNB(), 'clf__estima...",0.979798,0.991304,0.990991,0.990291,0.973214,0.98512,0.007342,5


In [None]:
#ModelCreateurV2(X_train, y_train,X_test,y_test)

In [None]:
"""def ModelPrep(pipe,model,param1 = None,param2=None,param3=None):
    parameters = { 'LogisticRegression()' : {'model__penalty': param1  if param1 else ['l2'] , 'model__tol': param2 if param2 else [0.0001], 'model__C': param3 if param3 else[1.0]},
                   'KNeighborsClassifier()' : {'model__n_neighbors': param1 if param1 else 5 , 'model__weights': param2 if param2 else 'uniform'},
                   'ComplementNB()': {'model__alpha': param1  if param1 else 1.0},
                   'SVC()' : {'model__kernel': param1  if param1 else 'rbf', 'model__C': param2 if param2 else [1.0], 'model__degree': param3 if param3 else 3},
                   'RidgeClassifier()' : {'model__alpha': param1  if param1 else 1.0, 'model__max_iter': param2 if param2 else None, 'model__tol': param3 if param3 else [0.0001]},
                   'RandomForestClassifier()' : {'model__n_estimators': param1  if param1 else 100, 'model__max_depth': param2 if param2 else None},
                   'DecisionTreeClassifier()' : {'model__max_depth': param1 if param1 else None}
                    }
        


    grid = GridSearchCV(pipe, parameters[str(model)], cv = 5, n_jobs =-1, verbose = 1)
    grid.fit(X_train, y_train)
    grid.best_score_
    grid.best_params_"""

"def ModelPrep(pipe,model,param1 = None,param2=None,param3=None):\n    parameters = { 'LogisticRegression()' : {'model__penalty': param1  if param1 else ['l2'] , 'model__tol': param2 if param2 else [0.0001], 'model__C': param3 if param3 else[1.0]},\n                   'KNeighborsClassifier()' : {'model__n_neighbors': param1 if param1 else 5 , 'model__weights': param2 if param2 else 'uniform'},\n                   'ComplementNB()': {'model__alpha': param1  if param1 else 1.0},\n                   'SVC()' : {'model__kernel': param1  if param1 else 'rbf', 'model__C': param2 if param2 else [1.0], 'model__degree': param3 if param3 else 3},\n                   'RidgeClassifier()' : {'model__alpha': param1  if param1 else 1.0, 'model__max_iter': param2 if param2 else None, 'model__tol': param3 if param3 else [0.0001]},\n                   'RandomForestClassifier()' : {'model__n_estimators': param1  if param1 else 100, 'model__max_depth': param2 if param2 else None},\n                   'Decis

In [None]:
#fonction permettant de connaître le score de notre modèle
def AfficherScores(y_test, y_pred,model,x):
    
    #affiche l'accuracy du modèle
    print("Accuracy:", accuracy_score(y_test, y_pred))
    #affiche la classification report du modèle
    print(classification_report(y_test, y_pred))
    #affiche les graphiques du modèle
    """plt.hist(model.decision_function(x), bins=50)
    plt.show()"""

In [None]:
# fonction qui affiche la matrice de confusion du modèle
def matrixconf(y_test,y_pred):
    #affiche la matrice de confusion du modèle
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

In [None]:
#fonction permettant de tester le modèle
def testModel(sms,model):
    #mise en place d'un input
    input_sms     = [sms]
    #transformation en dataframe afin de l'utiliser correctement
    df_sms        = pd.DataFrame(input_sms)
    df_sms.rename(columns={0:'mail'}, inplace=True)
    #mise en place du preprocessing
    df_sms        = prep(df_sms)
    #mise en place des feature
    df_sms        = features(df_sms)
    #prediction 
    result = model.predict(df_sms)
    return result

In [None]:
#dfModel[['type', 'phone']].boxplot(column='phone', by='type')
#dfModel[['type', 'http']].boxplot(column='http', by='type')
#dfModel[['type', 'nombre_mots']].boxplot(column='nombre_mots', by='type')
#dfModel[['type', 'len']].boxplot(column='len', by='type')
#dfModel[['type', 'blacklist']].boxplot(column='blacklist', by='type')


In [None]:

#pivot_table = dfModel[['type', 'phone']].pivot_table(index='type', columns='phone', aggfunc=len, fill_value=0)
#pivot_table.plot(kind='bar', stacked=True)

#plt.title('Nombre de Numéro de téléphone dans les SMS')
#plt.xlabel('SMS')
#plt.ylabel('Nombre')
#plt.show()

In [None]:
"""pivot_table = dfModel[['type', 'http']].pivot_table(index='type', columns='http', aggfunc=len, fill_value=0)
pivot_table.plot(kind='bar', stacked=True)

plt.title('Nombre de liens hypertextes dans les SMS')
plt.xlabel('SMS')
plt.ylabel('Nombre')
plt.show()"""

"pivot_table = dfModel[['type', 'http']].pivot_table(index='type', columns='http', aggfunc=len, fill_value=0)\npivot_table.plot(kind='bar', stacked=True)\n\nplt.title('Nombre de liens hypertextes dans les SMS')\nplt.xlabel('SMS')\nplt.ylabel('Nombre')\nplt.show()"

In [None]:
#mise en place des modèles
classifier1 = LogisticRegression()
classifier2 = KNeighborsClassifier()

classifier3 = ComplementNB()                                        
classifier4 = MultinomialNB()                                     
classifier5 = BernoulliNB()

classifier6 = SVC()                                                       

classifier7 = RidgeClassifier()          
classifier8 = RandomForestClassifier() 
classifier9 = DecisionTreeClassifier()                             

list_model = [classifier1,classifier2,classifier3,classifier4,classifier5,classifier6,classifier7,classifier8,classifier9]

In [None]:
"""for i in list_model:
    model_lm=ModelCreateur(X_train, y_train, i)
    ModelPrep(model_lm,i)
    print(i,':',testModel("Low-cost 08707509020",model_lm))
    # print('model utilisé:', i)
    y_pred = model_lm.predict(X_test)
    matrixconf(y_test,y_pred)
    AfficherScores(y_test, y_pred,model_lm,X_test)
    model_disp = RocCurveDisplay.from_estimator(model_lm,X_test,y_test)"""

'for i in list_model:\n    model_lm=ModelCreateur(X_train, y_train, i)\n    ModelPrep(model_lm,i)\n    print(i,\':\',testModel("Low-cost 08707509020",model_lm))\n    # print(\'model utilisé:\', i)\n    y_pred = model_lm.predict(X_test)\n    matrixconf(y_test,y_pred)\n    AfficherScores(y_test, y_pred,model_lm,X_test)\n    model_disp = RocCurveDisplay.from_estimator(model_lm,X_test,y_test)'