In [41]:
#загрузка и импорт библиотек
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer,  TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
import re
import string
import spacy

In [None]:
# Read the e-mail dataset from CSV and preview its first rows.
data = pd.read_csv(filepath_or_buffer="spam_or_not_spam.csv")
data.head(n=5)

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [None]:
# Column clean-up and stop words.
# The columns are renamed first and then selected by their NEW names, so the
# cell can be executed repeatedly; selecting 'email'/'label' up front raises
# KeyError on a second run because those columns no longer exist.
# Missing e-mail bodies become empty strings: a bare astype(str) would turn
# NaN into the literal text "nan".
data = (
    data
    .rename(columns={'email': 'text', 'label': 'label "spam"'})
    [['text', 'label "spam"']]
    .assign(text=lambda frame: frame['text'].fillna('').astype(str))
)

# spaCy English pipeline; later cells use it for tokenisation and lemmatisation.
vocab = spacy.load("en_core_web_sm")
# Default stop-word collection of the loaded pipeline.
sw = vocab.Defaults.stop_words

## CountVectorizer

In [None]:
#загрузка и импорт библиотек
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report

In [None]:
# Tokenise and lemmatise every e-mail with spaCy. Stop words, punctuation and
# whitespace tokens are dropped; the remaining lemmas are lower-cased and
# joined with single spaces.
# The texts are streamed through `vocab.pipe`, which processes them in batches
# instead of invoking the whole pipeline once per row through `Series.apply`.
# The resulting list is assigned positionally, one entry per row of `data`.
data['cleaned_text_sm'] = [
    ' '.join(
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    )
    for doc in vocab.pipe(data['text'])
]

In [None]:
# Split the cleaned texts and labels into train/test parts with a fixed seed,
# then build count matrices: the vectorizer is fitted on the training texts
# only and merely applied to the test texts.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text_sm'],
    data['label "spam"'],
    random_state=1000,
)
vectorizer = CountVectorizer(min_df=0.001, max_df=0.8)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Baseline prediction: a classifier that always answers with the most frequent
# training class. zero_division=0 reports 0 for classes that are never predicted.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train_vectorized, y_train)
preds = dummy_clf.predict(X_test_vectorized)
print(classification_report(y_test, preds, zero_division=0))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91       622
           1       0.00      0.00      0.00       128

    accuracy                           0.83       750
   macro avg       0.41      0.50      0.45       750
weighted avg       0.69      0.83      0.75       750



In [None]:
# Logistic regression fitted on the count features and scored on the test split.
lr = LogisticRegression()
lr.fit(X_train_vectorized, y_train)
result = lr.predict(X_test_vectorized)
print(classification_report(y_test, result))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       622
           1       1.00      0.95      0.97       128

    accuracy                           0.99       750
   macro avg       0.99      0.97      0.98       750
weighted avg       0.99      0.99      0.99       750



## TfidfTransformer и TfidfVectorizer


In [None]:
# Token counts -> TF-IDF re-weighting -> logistic regression, trained directly
# on the cleaned training texts and scored on the test texts.
p = Pipeline(
    [
        ('counter', CountVectorizer(min_df=0.001, max_df=0.8)),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression()),
    ]
)
p.fit(X_train, y_train)
prediction = p.predict(X_test)
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       622
           1       1.00      0.77      0.87       128

    accuracy                           0.96       750
   macro avg       0.98      0.88      0.92       750
weighted avg       0.96      0.96      0.96       750



In [None]:
# Tokenizer based on re.sub: strip unwanted characters, then split.
def custom_tokenize(text):
    """Return the whitespace-separated tokens of ``text``.

    Every character other than an ASCII letter or a space is deleted first
    (not replaced by a space), so words separated only by such characters
    are merged into one token.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z ]', '', text)
    tokens = letters_and_spaces.split()
    return tokens

In [None]:
# TfidfVectorizer (tokenisation and TF-IDF in one step, using the custom
# tokenizer) followed by logistic regression.
# The pipeline is bound to `p_tfidf` rather than `pd`, so the pandas alias is
# not overwritten, and the report is computed with this pipeline instead of
# the previously fitted `p`.
p_tfidf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(max_df=0.7, min_df=0.003, tokenizer=custom_tokenize)),
        ('clf', LogisticRegression())
    ]
).fit(X_train, y_train)
prediction = p_tfidf.predict(X_test)
print(classification_report(y_test, prediction))



              precision    recall  f1-score   support

           0       0.96      1.00      0.98       622
           1       1.00      0.78      0.88       128

    accuracy                           0.96       750
   macro avg       0.98      0.89      0.93       750
weighted avg       0.96      0.96      0.96       750



## Логистическая регрессия для TfidfVectorizer

In [None]:
# Unfitted base pipeline for the hyper-parameter search:
# TF-IDF features fed into logistic regression.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

In [None]:
# Search space for the TF-IDF + logistic-regression pipeline.
# Keys are written as "<pipeline step>__<parameter>".
parameter_grid = {}
parameter_grid["tfidf__max_df"] = np.linspace(0.3, 0.7, num=20)
parameter_grid["tfidf__min_df"] = [0.0, 0.001, 0.003, 0.005, 0.007]
parameter_grid["tfidf__ngram_range"] = ((1, 1), (1, 2))
parameter_grid["tfidf__norm"] = ("l1", "l2")
parameter_grid["clf__C"] = np.linspace(0.1, 1, num=10)

In [None]:
#обучение
%%time

grid_search = HalvingGridSearchCV(pipe,param_grid=parameter_grid,n_jobs=-1,verbose=1,cv=2,scoring='accuracy')
grid_search.fit(X_train, y_train)

n_iterations: 6
n_required_iterations: 8
n_possible_iterations: 6
min_resources_: 8
max_resources_: 2250
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 4000
n_resources: 8
Fitting 2 folds for each of 4000 candidates, totalling 8000 fits
----------
iter: 1
n_candidates: 1334
n_resources: 24
Fitting 2 folds for each of 1334 candidates, totalling 2668 fits
----------
iter: 2
n_candidates: 445
n_resources: 72
Fitting 2 folds for each of 445 candidates, totalling 890 fits
----------
iter: 3
n_candidates: 149
n_resources: 216
Fitting 2 folds for each of 149 candidates, totalling 298 fits
----------
iter: 4
n_candidates: 50
n_resources: 648
Fitting 2 folds for each of 50 candidates, totalling 100 fits
----------
iter: 5
n_candidates: 17
n_resources: 1944
Fitting 2 folds for each of 17 candidates, totalling 34 fits
CPU times: user 29.5 s, sys: 2.28 s, total: 31.8 s
Wall time: 5min 58s


In [None]:
#результат
logregtfidf=grid_search
logregtfidf.best_score_
result = logregtfidf.best_estimator_.predict(X_test)
print(classification_report(y_test, result))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       622
           1       1.00      0.83      0.91       128

    accuracy                           0.97       750
   macro avg       0.98      0.91      0.94       750
weighted avg       0.97      0.97      0.97       750



## DecisionTreeClassifier для TfidfVectorizer

In [36]:
# Unfitted base pipeline for the next search:
# TF-IDF features fed into a decision tree.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', DecisionTreeClassifier()),
])

In [37]:
# Search space for the TF-IDF + decision-tree pipeline.
# Keys are written as "<pipeline step>__<parameter>".
parameter_grid = {}
parameter_grid["tfidf__max_df"] = np.linspace(0.3, 0.7, num=10)
parameter_grid["tfidf__min_df"] = [0.0, 0.001, 0.003, 0.005, 0.007]
parameter_grid["tfidf__ngram_range"] = ((1, 1), (1, 2))
parameter_grid["tfidf__norm"] = ("l1", "l2")
parameter_grid["clf__criterion"] = ("gini", "entropy", "log_loss")

In [38]:
%%time

grid_search = HalvingGridSearchCV(pipe,param_grid=parameter_grid,n_jobs=-1,verbose=1,cv=2,scoring='accuracy')
grid_search.fit(X_train, y_train)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 9
max_resources_: 2250
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 600
n_resources: 9
Fitting 2 folds for each of 600 candidates, totalling 1200 fits
----------
iter: 1
n_candidates: 200
n_resources: 27
Fitting 2 folds for each of 200 candidates, totalling 400 fits
----------
iter: 2
n_candidates: 67
n_resources: 81
Fitting 2 folds for each of 67 candidates, totalling 134 fits
----------
iter: 3
n_candidates: 23
n_resources: 243
Fitting 2 folds for each of 23 candidates, totalling 46 fits
----------
iter: 4
n_candidates: 8
n_resources: 729
Fitting 2 folds for each of 8 candidates, totalling 16 fits
----------
iter: 5
n_candidates: 3
n_resources: 2187
Fitting 2 folds for each of 3 candidates, totalling 6 fits
CPU times: user 4.03 s, sys: 291 ms, total: 4.32 s
Wall time: 57.5 s


In [39]:
# Keep the fitted decision-tree search under `result` and display its best
# cross-validation score as the cell output.
result = grid_search
grid_search.best_score_

0.944647758462946

## Naive Bayes

In [42]:
# Unfitted base pipeline for the last search:
# raw token counts fed into multinomial naive Bayes.
pipe = Pipeline([
    ('count_vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

In [43]:
# Search space for the CountVectorizer + naive-Bayes pipeline.
# Keys are written as "<pipeline step>__<parameter>".
parameter_grid = {}
parameter_grid["count_vect__max_df"] = np.linspace(0.3, 0.7, num=10)
parameter_grid["count_vect__min_df"] = [0.003, 0.005, 0.007]
parameter_grid["count_vect__ngram_range"] = ((1, 1), (1, 2))

In [44]:
%%time

grid_search = HalvingGridSearchCV(pipe,param_grid=parameter_grid,n_jobs=-1,verbose=1,cv=2,scoring='accuracy')
grid_search.fit(X_train, y_train)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 83
max_resources_: 2250
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 60
n_resources: 83
Fitting 2 folds for each of 60 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 20
n_resources: 249
Fitting 2 folds for each of 20 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 7
n_resources: 747
Fitting 2 folds for each of 7 candidates, totalling 14 fits
----------
iter: 3
n_candidates: 3
n_resources: 2241
Fitting 2 folds for each of 3 candidates, totalling 6 fits
CPU times: user 1.12 s, sys: 125 ms, total: 1.25 s
Wall time: 23.1 s


In [45]:
# Keep the fitted naive-Bayes search under `result` and display its best
# cross-validation score as the cell output.
result = grid_search
grid_search.best_score_

0.9772321428571429