In [52]:
# data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from imblearn.under_sampling import RandomUnderSampler 

# Pipeline and model
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB

from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB

# Score of models
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RED94\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RED94\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
#création d'une blacklist de mots de spam 
dfblacklist = pd.read_csv('blacklist.txt', header=None )
dfblacklist.rename(columns={0:'words'}, inplace=True)
dfblacklistList = dfblacklist['words'].tolist()

In [54]:
#fonction de mise en place du dataset 
def cree_df(url = "SMSSpamCollection.txt"):
    df = pd.read_csv(url, sep='\t', header=None )
    df.rename(columns={0:'type',1:'mail'}, inplace=True)
    return df

In [55]:
#fonction de préparation des données (preprocessing)
def prep(df): 

    #lowercase des message
    df['minuscule']=df['mail'].str.lower()

    #mise en place des tokens des message
    tokenizer = RegexpTokenizer(r"\b\w+\b|\d{2} \d{2} \d{2} \d{2} \d{2}")
    df['token'] = df['minuscule'].apply(lambda x: tokenizer.tokenize(x))

    #ajout d'un stopwords 
    stop = stopwords.words('english')
    df['without_stopwords']=df['token'].apply(lambda x: [word for word in x if word not in stop])

    #ajout d'un stemmer
    stemmer = PorterStemmer()
    df['PorterStemmer'] = df['without_stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])
    
    #regroupement du traitement des données
    df['clean'] = df['without_stopwords'].apply(lambda x: " ".join(x))
    return df

In [56]:
#fonction d'ajout des features afin d'améliorer la précision des modèles
def features(df):
    
    #ajout d'une feature "longueur du message"
    df['len']=df['mail'].str.len()

    #ajout d'une feature "nombre de mots"
# df['nombre_mots']=df['mail'].str.split().str.len()
    df['nombre_mots']=df['token'].str.len()

    #ajout d'une feature permettant de vérifier si présence d'hypertexte
    pattern = r"http\S+"
    df['http']=df['mail'].apply(lambda x : True if re.search(pattern, x) else False)

    #ajout d'une feature permettant de vérifier la présence de chiffre 
    pattern = r"/^[\(]?[\+]?(\d{2}|\d{3})[\)]?[\s]?((\d{6}|\d{8})|(\d{3}[\*\.\-\s]){3}|(\d{2}[\*\.\-\s]){4}|(\d{4}[\*\.\-\s]){2})|\d{8}|\d{10}|\d{12}$/"
    df['phone']=df['mail'].apply(lambda x : True if re.search(pattern, x) else False)
    
    #ajout d'une feature permettant de vérifier la présence de mail
    #pattern = r"[-A-Za-z0-9!#$%&'*+/=?^_`{|}~]+(?:\.[-A-Za-z0-9!#$%&'*+/=?^_`{|}~]+)*@(?:[A-Za-z0-9](?:[-A-Za-z0-9]*[A-Za-z0-9])?\.)+[A-Za-z0-9](?:[-A-Za-z0-9]*[A-Za-z0-9])?"
    #df['mail_compt']=df['mail'].apply(lambda x = 'True' : re.findall(pattern, x) else x = 'False')

    #ajout d'une feature permettant de vérifier la présence de mots blacklisté 
    df['blacklist']=df['token'].apply(lambda x: [ word for word in x if word  in dfblacklistList])
    return df

In [57]:
from imblearn.under_sampling import RandomUnderSampler #conda install conda-forge::imbalanced-learn

In [58]:
#fonction permettant la spéaration de la target des autres colonnes (création X et y)

def spliteur(df):
    X = df.drop(columns = ['type'], axis=1)
    y = df['type']
    rus = RandomUnderSampler(random_state=42)
    X_res, y_res = rus.fit_resample(X, y)
    #return train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)
    return train_test_split(X_res, y_res, stratify=y_res, test_size=0.2, random_state=42)

In [59]:
#fonction création du modèle 
def ModelCreateur(X_train, y_train, classifier):

    column_num  = ['len','nombre_mots']
    column_bool = ['http','phone']
    
    #Transformation des variables texte
    transfo_text_TFid = Pipeline(steps=[
        ('Tfid', TfidfVectorizer(lowercase=False, decode_error='ignore', analyzer='char_wb', ngram_range=(2, 2)))
        
    ])

#Application des étapes sur tout le dataset
    if classifier == "ComplementNB()" or "MultinomialNB()":
        preparation = ColumnTransformer(
        transformers=[
        ('TFid&data', transfo_text_TFid , 'clean'), #TFIDF ne prend pas de listes comme arguments
        # ('CountVect&data', transfo_text_CountVect , 'clean'),
            ('Scaler&data',MinMaxScaler(), column_num),
            ('BoolEncoder',OrdinalEncoder(), column_bool)
        ])
    else : 
        preparation = ColumnTransformer(
        transformers=[
        ('TFid&data', transfo_text_TFid , 'clean'), #TFIDF ne prend pas de listes comme arguments
        # ('CountVect&data', transfo_text_CountVect , 'clean'),
            ('Scaler&data',RobustScaler(), column_num),
            ('BoolEncoder',OrdinalEncoder(), column_bool)
        ])
    
    #relie l'algorithme avec le modèle
    model = Pipeline([
    ('vectorizer', preparation),
    ('classifier', classifier)
    ])
    #Fit le modèle
    model.fit(X_train, y_train)
    return model

In [60]:
#fonction permettant de connaître le score de notre modèle
def AfficherScores(y_test, y_pred,model,x):
    
    #affiche l'accuracy du modèle
    print("Accuracy:", accuracy_score(y_test, y_pred))
    #affiche la classification report du modèle
    print(classification_report(y_test, y_pred))
    #affiche les graphiques du modèle
    """plt.hist(model.decision_function(x), bins=50)
    plt.show()"""

In [61]:
# fonction qui affiche la matrice de confusion du modèle
def matrixconf(y_test,y_pred):
    #affiche la matrice de confusion du modèle
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

In [62]:
#fonction permettant de tester le modèle
def testModel(sms,model):
    #mise en place d'un input
    input_sms     = [sms]
    #transformation en dataframe afin de l'utiliser correctement
    df_sms        = pd.DataFrame(input_sms)
    df_sms.rename(columns={0:'mail'}, inplace=True)
    #mise en place du preprocessing
    df_sms        = prep(df_sms)
    #mise en place des feature
    df_sms        = features(df_sms)
    #prediction 
    result = model.predict(df_sms)
    return result

In [63]:
#mise en place des modèles
classifier1 = LogisticRegression(solver='liblinear', C=1e3)
classifier2 = KNeighborsClassifier(7)

classifier3 = ComplementNB()                                        
classifier4 = MultinomialNB()                                     
classifier5 = BernoulliNB(force_alpha=True)

classifier6 = SVC()                                     
classifier7 = SVC(gamma=2, C=1, random_state=42)                   

classifier8 = RidgeClassifier(tol=1e-2, solver="sparse_cg")          
classifier9 = RandomForestClassifier(max_depth=200, random_state=42) 
classifier10 = DecisionTreeClassifier()                             

list_model = [classifier1,classifier2,classifier3,classifier4,classifier5,classifier6,classifier7,classifier8,classifier9,classifier10]

In [64]:
dfModel = cree_df("SMSSpamCollection.txt")
dfModel = prep(dfModel)
dfModel = features(dfModel)
X_train, X_test, y_train, y_test = spliteur(dfModel)

In [65]:
model_lm=ModelCreateur(X_train, y_train, classifier3)
model_lm

In [66]:
input =  'Hi Nick. This is to remind you about the $75 minimum payment on your credit card ending in XXXX. Payment is due on 01/01. Pls visit order.com to make your payment'
for i in list_model:
    model_lm=ModelCreateur(X_train, y_train, i)
    print('model utilisé:', i)
    print(testModel(input,model_lm))

model utilisé: LogisticRegression(C=1000.0, solver='liblinear')
['ham']
model utilisé: KNeighborsClassifier(n_neighbors=7)
['spam']
model utilisé: ComplementNB()
['ham']
model utilisé: MultinomialNB()
['ham']
model utilisé: BernoulliNB(force_alpha=True)
['spam']
model utilisé: SVC()
['ham']
model utilisé: SVC(C=1, gamma=2, random_state=42)
['ham']
model utilisé: RidgeClassifier(solver='sparse_cg', tol=0.01)
['ham']
model utilisé: RandomForestClassifier(max_depth=200, random_state=42)
['spam']
model utilisé: DecisionTreeClassifier()
['spam']


In [67]:
"""for i in list_model:
    model_lm=ModelCreateur(X_train, y_train, i)
    print('model utilisé:', i)
    y_pred = testModel(X_test,model_lm)
    matrixconf(y_test,y_pred)
    AfficherScores(y_test, y_pred,model_lm,X_test)
    model_disp = RocCurveDisplay.from_estimator(model_lm,X_test,y_test)"""

"for i in list_model:\n    model_lm=ModelCreateur(X_train, y_train, i)\n    print('model utilisé:', i)\n    y_pred = testModel(X_test,model_lm)\n    matrixconf(y_test,y_pred)\n    AfficherScores(y_test, y_pred,model_lm,X_test)\n    model_disp = RocCurveDisplay.from_estimator(model_lm,X_test,y_test)"