In [None]:
import time
start_time = time.time()

In [36]:
import pandas as pd
import numpy as np
import re
import string
import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB

from sklearn.metrics import classification_report, accuracy_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, KFold, cross_val_score, train_test_split, StratifiedKFold, GridSearchCV

import matplotlib.pyplot as plt

In [2]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Загрузка датасета "Spam or not spam":

In [3]:
data = pd.read_csv("/content/spam_or_not_spam.csv")
data.style.set_properties(**{'text-align': 'left'})
data.head(5)

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


Оценим баланс классов:

In [4]:
data.label.value_counts()

0    2500
1     500
Name: label, dtype: int64

Загрузка набора английских стоп-слов из библиотеки Spacy:

In [5]:
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words
print(f'Spacy english stopwords size: {len(stopwords)}', end='\n\n')
', '.join(stopwords)

Spacy english stopwords size: 326



"'s, forty, make, whereupon, will, until, whereas, off, part, seeming, a, elsewhere, everything, yet, formerly, one, unless, his, thereupon, fifty, with, serious, regarding, enough, although, latterly, really, never, nobody, throughout, seems, call, do, herself, more, again, full, nevertheless, because, becomes, hereby, ten, ’ll, sixty, why, first, namely, whatever, how, well, thru, beyond, sometime, hundred, above, themselves, itself, wherever, always, either, over, together, say, being, from, any, made, ca, it, then, around, bottom, that, ever, somehow, 've, before, are, whole, another, eleven, else, within, where, there, each, often, when, since, though, and, nor, hereupon, however, via, thereafter, ‘m, once, into, ourselves, used, such, others, just, show, hers, if, former, moreover, to, put, through, could, some, thus, become, nine, anyone, third, on, was, quite, did, several, can, see, upon, front, which, anyway, would, during, under, perhaps, than, otherwise, ‘s, except, seemed,

Удаление пустых и Nan строк из датасета:

In [6]:
data.drop(data[data.email == ' '].index, inplace=True)
data.drop(data[data['email'].isnull()].index, inplace=True)

Лемматизация и токенизация при помощи Spacy:

In [7]:
%%time
data['cleaned_text'] = data['email'].str.replace('NUMBER', '') # удаляем слово "NUMBER" из текста, так как оно заменяет собой все числа и встречается чаще любого другого слова
data['cleaned_text'] = data['cleaned_text'].apply(lambda x: ' '.join(token.lemma_.lower() for token in nlp(x) if
        not token.is_stop
        and not token.is_punct # удаление пунктуации
        and not token.is_digit # удаление цифр
        and not token.like_email # удаление почтовых адресов
        and not token.like_num # удаление чисел, в том числе в виде текста
        and not token.is_space # удаление пробельных символов
    )
)

data.sample(3, random_state=1)

CPU times: user 2min 34s, sys: 1.24 s, total: 2min 35s
Wall time: 2min 59s


Unnamed: 0,email,label,cleaned_text
748,at NUMBER NUMBER pm NUMBER on NUMBER NUMBER NU...,0,pm tom write green say spot owl hadn t exist i...
2883,pocket the newest NUMBER year annuity pocket ...,1,pocket new year annuity pocket new year annuit...
1391,justin mason jm jmason org NUMBER NUMBER NUMBE...,0,justin mason jm jmason org point aim rescore a...


Разобьем выборку на тренировочную и тестовую часть, зафиксируем random_state:

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['label'], shuffle=True, random_state=1234, train_size=0.7)

# Векторизация при помощи CountVectorizer

In [9]:
vectorizer = CountVectorizer()

X_train_bow_vectorized = vectorizer.fit_transform(X_train)
X_test_bow_vectorized = vectorizer.transform(X_test)

In [10]:
len(vectorizer.get_feature_names_out())

24072

In [11]:
vectorizer.get_feature_names_out()[2550:2650]

array(['bring', 'brique', 'britain', 'britan', 'britannia', 'british',
       'britney', 'briton', 'britons', 'brklisttmgepveytvbet', 'brksalrq',
       'brmodyyfnrlnvopxxmo', 'broad', 'broadband', 'broadcast',
       'broadcaster', 'broadcasting', 'broaden', 'broadening', 'broadly',
       'broadsheet', 'broadway', 'brocard', 'brochure', 'brochureware',
       'broil', 'broinn', 'broken', 'broker', 'brokerage', 'bromide',
       'bronson', 'bronze', 'broo', 'brooklyn', 'brooks', 'broom',
       'brose', 'brother', 'brotherhood', 'brotherton', 'brough', 'brown',
       'browse', 'browser', 'browsing', 'brruexlswclc', 'bruce', 'bruceg',
       'bruise', 'brunei', 'brunet', 'bruno', 'brunswick', 'brush',
       'brussels', 'brutal', 'brutality', 'brutalize', 'brutally',
       'brute', 'bryan', 'bs', 'bsbdfcrxnkmlt', 'bsd', 'bsddb', 'bsh',
       'bsmtp', 'bsp', 'bspyjykcbbe', 'bssejvecagazaaadhbuvqdkqomklnhix',
       'bssqftuds', 'bst', 'bstringfield',
       'bsuvzlvftsylsxzfqsttymckk

Представление текстов в виде мешка слов:

In [12]:
bag_count_vect = pd.DataFrame(X_train_bow_vectorized.toarray(), columns=vectorizer.get_feature_names_out())

# Векторизация при помощи TfidfVectorizer

In [13]:
tfidf_vectorizer = TfidfVectorizer()

X_train_tfidf_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf_vectorized = tfidf_vectorizer.transform(X_test)

In [14]:
tfidf_vectorizer.get_feature_names_out()[1550:1650]

array(['avoir', 'avoisinante', 'avow',
       'avqcntxrhbjdjqipajpzxfacapkwkxqqvzs',
       'avrzlnrjrgslmdrwavgcnrwinhflritlzp', 'avyncusmrtgtlwlfkbyfn',
       'aw', 'await', 'awake', 'award', 'aware', 'awareness', 'away',
       'awcbwunahmxo', 'awdgvlaxitmbe', 'awe', 'awesome', 'awful',
       'awfully', 'awhile', 'awk', 'awkpxuovkyi',
       'awkuolcaljxrjlmlapmntvbpk', 'awkward', 'awl',
       'awubpwmbspxhjfohaeqlqgcgxhqeeuesnvuwgvmjfksanrb',
       'awubpxnfowyzbxegeqlfnwcfaknapokbgjjqqqehgiwfdsaopj',
       'awubpxzdpxhjfohaeqiaacgvmtbvktprjmjqtoyzyifzusan',
       'awubpycpxhjfohaeqlasgcfzhsqmsvuygqjwgldwzkpihmanrur',
       'awubpyyyspxhjfohaeqlqnqcgyvfnvywkirrghhdisbonconaopcr',
       'awubpzayrijkhjbjytpqeqiyzgcg', 'awvsuujywwmnpqrjwqmjplyvtqzhzzhk',
       'axe', 'axel', 'axf', 'axfr', 'axlbjrrsyyrtxoyrcuybydztzzxeos',
       'axle', 'axpgx', 'axtfrpxfcmuwyaljocwfir',
       'axwvyalyzqcyzkaplpturlkltbljy',
       'axycduvaomverzvrvzjsizbryjxwaqkdovuxtjxhvusxiif', 'axzn'

In [15]:
bag_tfidf_vect = pd.DataFrame(X_train_tfidf_vectorized.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

Посмотрим на TF-IDF слова "yes":

In [16]:
bag_tfidf_vect[bag_tfidf_vect['yes'] > 0].sort_values(by=["yes"], ascending=False)['yes']

1702    0.183271
1738    0.161631
237     0.151156
207     0.139732
1778    0.133211
          ...   
964     0.010084
1745    0.010070
1305    0.009293
1285    0.005142
1565    0.003704
Name: yes, Length: 170, dtype: float64

In [17]:
bag_tfidf_vect.shape, bag_count_vect.shape, data.shape

((2097, 24072), (2097, 24072), (2997, 3))

# Кросс-валидация:

In [41]:
classifiers = [DecisionTreeClassifier(),
               LogisticRegression(),
               BernoulliNB(),
               MultinomialNB(),
               ComplementNB()]

pipe_dict = {0: "DecisionTreeClassifier", 1: "LogisticRegression", 2: "Naive Bayes Bernoulli", 3: "Naive Bayes Multinomial", 4: "Complement Naive Bayes"}

In [42]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2786)

Оценки для различных моделей и способов векторизации:

In [43]:
print('CountVectorizer')
for i, model in enumerate(classifiers):
    cv_score = cross_val_score(model, X_train_bow_vectorized, y_train, scoring="accuracy", cv=skf)
    print("%s: %f " % (pipe_dict[i], cv_score.mean()))

CountVectorizer
DecisionTreeClassifier: 0.951832 
LogisticRegression: 0.981402 
Naive Bayes Bernoulli: 0.866955 
Naive Bayes Multinomial: 0.984265 
Complement Naive Bayes: 0.982359 


In [44]:
print("TfidfVectorizer")
for i, model in enumerate(classifiers):
    cv_score = cross_val_score(model, X_train_tfidf_vectorized, y_train, scoring="accuracy", cv=skf)
    print("%s: %f " % (pipe_dict[i], cv_score.mean()))

TfidfVectorizer
DecisionTreeClassifier: 0.955168 
LogisticRegression: 0.944213 
Naive Bayes Bernoulli: 0.866955 
Naive Bayes Multinomial: 0.882215 
Complement Naive Bayes: 0.939922 


Для CountVectorizer возьму Naive Bayes Multinomial, а для TfidfVectorizer Complement Naive Bayes, так они показывают лучшие результаты

## Обучение моделей:

Выполним подбор гипер-параметров через пайплайн сразу для векторизации при помощи CountVectorizer и моделей:

### LR + CountVectorizer

In [None]:
pipe_counter_lr = Pipeline(
    steps=[
        ('counter', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression())
    ]
)

In [None]:
parameter_grid = {'counter__max_df': np.arange(0.5, 1, 0.1),
                  'counter__min_df': np.arange(0.001, 0.01, 0.001),
                  "counter__ngram_range": ((1, 1), (1, 2)),
                  "tfidf__norm": ("l1", "l2"),
                  "clf__C": [0.001, 0.01, 0.1, 1, 10, 100]}

In [None]:
%%time

grid_search_counter_lr = HalvingGridSearchCV(
    pipe_counter_lr,
    param_grid=parameter_grid,
    n_jobs=-1,
    verbose=1,
    cv=skf,
    scoring='accuracy',
    random_state=42,
)

grid_search_counter_lr.fit(X_train, y_train);

n_iterations: 5
n_required_iterations: 7
n_possible_iterations: 5
min_resources_: 20
max_resources_: 2097
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1080
n_resources: 20
Fitting 5 folds for each of 1080 candidates, totalling 5400 fits
----------
iter: 1
n_candidates: 360
n_resources: 60
Fitting 5 folds for each of 360 candidates, totalling 1800 fits
----------
iter: 2
n_candidates: 120
n_resources: 180
Fitting 5 folds for each of 120 candidates, totalling 600 fits
----------
iter: 3
n_candidates: 40
n_resources: 540
Fitting 5 folds for each of 40 candidates, totalling 200 fits
----------
iter: 4
n_candidates: 14
n_resources: 1620
Fitting 5 folds for each of 14 candidates, totalling 70 fits
CPU times: user 25.3 s, sys: 2.4 s, total: 27.7 s
Wall time: 6min 50s


In [None]:
grid_search_counter_lr.best_params_

{'clf__C': 100,
 'counter__max_df': 0.7,
 'counter__min_df': 0.007,
 'counter__ngram_range': (1, 2),
 'tfidf__norm': 'l2'}

Результаты для LogisticRegression + CountVectorizer:

In [None]:
lr_counter = grid_search_counter_lr.best_estimator_.predict(X_test)
accuracy_lr_counter = accuracy_score(y_test, lr_counter)
recall_lr_counter = recall_score(y_test, lr_counter)

print(classification_report(y_test, lr_counter))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       748
           1       0.99      0.95      0.97       152

    accuracy                           0.99       900
   macro avg       0.99      0.98      0.98       900
weighted avg       0.99      0.99      0.99       900



### DecisionTree + CountVectorizer

In [None]:
pipe_counter_dclf = Pipeline(
    steps=[
        ('counter', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', DecisionTreeClassifier())
    ]
)

In [None]:
parameter_grid = {'counter__max_df': np.arange(0.5, 1, 0.1),
                  'counter__min_df': np.arange(0.001, 0.01, 0.001),
                  "counter__ngram_range": [(1, 2)],
                  "tfidf__norm": ("l1", "l2"),
                  "clf__max_depth": [8, 9, 10, 12, 14, 16],
                  "clf__min_samples_split": [5, 10, 15],
                  "clf__criterion": ['gini', 'entropy'],
                  'clf__max_features': ['sqrt']
}

In [None]:
%%time

grid_search_counter_dclf = HalvingGridSearchCV(
    pipe_counter_dclf,
    param_grid=parameter_grid,
    n_jobs=-1,
    cv=skf,
    scoring='accuracy',
    random_state=42,
)

grid_search_counter_dclf.fit(X_train, y_train);

CPU times: user 3.6 s, sys: 372 ms, total: 3.98 s
Wall time: 1min 4s


In [None]:
grid_search_counter_dclf.best_params_

{'clf__criterion': 'entropy',
 'clf__max_depth': 14,
 'clf__max_features': 'sqrt',
 'clf__min_samples_split': 15,
 'counter__max_df': 0.9,
 'counter__min_df': 0.009,
 'counter__ngram_range': (1, 2),
 'tfidf__norm': 'l1'}

Результаты для DecisionTreeClassifier + CountVectorizer:

In [None]:
dclf_counter = grid_search_counter_dclf.best_estimator_.predict(X_test)
accuracy_dclf_counter = accuracy_score(y_test, dclf_counter)
recall_dclf_counter = recall_score(y_test, dclf_counter)

print(classification_report(y_test, dclf_counter))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       748
           1       0.84      0.74      0.79       152

    accuracy                           0.93       900
   macro avg       0.89      0.85      0.87       900
weighted avg       0.93      0.93      0.93       900



### Naive Bayes + CountVectorizer

In [28]:
pipe_counter_nb = Pipeline(
    steps=[
        ('counter', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB())
    ]
)

In [29]:
parameter_grid = {'counter__max_df': np.arange(0.5, 1, 0.1),
                  'counter__min_df': np.arange(0.005, 0.01, 0.001),
                  "counter__ngram_range": ((1, 1), (1, 2)),
                  "tfidf__norm": ("l1", "l2"),
                  'clf__alpha': [0.1, 0.5, 1.0, 2.0, 5.0],
                  'clf__fit_prior': [True, False]
}

In [30]:
%%time

grid_search_counter_nb = HalvingGridSearchCV(
    pipe_counter_nb,
    param_grid=parameter_grid,
    n_jobs=-1,
    cv=skf,
    scoring='accuracy',
    random_state=42,
)

grid_search_counter_nb.fit(X_train, y_train);

CPU times: user 20.5 s, sys: 2.06 s, total: 22.6 s
Wall time: 4min 44s


In [31]:
grid_search_counter_nb.best_params_

{'clf__alpha': 0.1,
 'clf__fit_prior': False,
 'counter__max_df': 0.6,
 'counter__min_df': 0.005,
 'counter__ngram_range': (1, 2),
 'tfidf__norm': 'l2'}

Результаты для Naive Bayes + CountVectorizer:

In [32]:
nb_counter = grid_search_counter_nb.best_estimator_.predict(X_test)
accuracy_nb_counter = accuracy_score(y_test, nb_counter)
recall_nb_counter = recall_score(y_test, nb_counter)

print(classification_report(y_test, nb_counter))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       748
           1       0.91      0.97      0.94       152

    accuracy                           0.98       900
   macro avg       0.95      0.97      0.96       900
weighted avg       0.98      0.98      0.98       900



Подбор гипер-параметров через пайплайн для векторизации при помощи TfidfVectorizer и моделей:

### LR + TfidfVectorizer


In [None]:
pipe_tf_lr = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression())
    ]
)

In [None]:
parameter_grid = {'tfidf__max_df': np.arange(0.5, 1, 0.1),
                  'tfidf__min_df': np.arange(0.005, 0.01, 0.001),
                  "tfidf__ngram_range": ((1, 1), (1, 2)),
                  "tfidf__norm": ("l1", "l2"),
                  "clf__C": [0.001, 0.01, 0.1, 1, 10, 100]
}

In [None]:
%%time

grid_search_tf_lr = HalvingGridSearchCV(
    pipe_tf_lr,
    param_grid=parameter_grid,
    n_jobs=-1,
    cv=skf,
    scoring='accuracy',
    random_state=42,
)

grid_search_tf_lr.fit(X_train, y_train);

CPU times: user 13.4 s, sys: 1.2 s, total: 14.6 s
Wall time: 3min 28s


In [None]:
grid_search_tf_lr.best_params_

{'clf__C': 100,
 'tfidf__max_df': 0.7,
 'tfidf__min_df': 0.007,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2'}

Результаты для LogisticRegression + TfidfVectorizer:

In [None]:
lr_tf = grid_search_tf_lr.best_estimator_.predict(X_test)
accuracy_lr_tf = accuracy_score(y_test, lr_tf)
recall_lr_tf = recall_score(y_test, lr_tf)

print(classification_report(y_test, lr_tf))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       748
           1       0.99      0.95      0.97       152

    accuracy                           0.99       900
   macro avg       0.99      0.98      0.98       900
weighted avg       0.99      0.99      0.99       900



### DecisionTree + TfidfVectorizer



In [None]:
pipe_tf_dclf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', DecisionTreeClassifier())
    ]
)

In [None]:
parameter_grid = {'tfidf__max_df': np.arange(0.5, 1, 0.1),
                  'tfidf__min_df': np.arange(0.005, 0.01, 0.001),
                  "tfidf__ngram_range": ((1, 1), (1, 2)),
                  "tfidf__norm": ("l1", "l2"),
                  "clf__max_depth": [8, 9, 10, 12, 14, 16],
                  "clf__min_samples_split": [5, 10, 15],
                  "clf__criterion": ['gini'],
                  'clf__max_features': ['sqrt', 'log2']
}

⬇ Не запускать, работает 21 минуту 27 секунд



In [None]:
%%time

grid_search_tf_dclf = HalvingGridSearchCV(
    pipe_tf_dclf,
    param_grid=parameter_grid,
    n_jobs=-1,
    cv=skf,
    scoring='accuracy',
    random_state=42,
)

grid_search_tf_dclf.fit(X_train, y_train);

CPU times: user 54.7 s, sys: 4.63 s, total: 59.4 s
Wall time: 14min 23s


In [None]:
grid_search_tf_dclf.best_params_

{'clf__criterion': 'gini',
 'clf__max_depth': 16,
 'clf__max_features': 'sqrt',
 'clf__min_samples_split': 5,
 'tfidf__max_df': 0.5,
 'tfidf__min_df': 0.006,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2'}

Результаты для DecisionTree + TfidfVectorizer:

In [None]:
dclf_tf = grid_search_tf_dclf.best_estimator_.predict(X_test)
accuracy_dclf_tf = accuracy_score(y_test, dclf_tf)
recall_dclf_tf = recall_score(y_test, dclf_tf)

print(classification_report(y_test, dclf_tf))

              precision    recall  f1-score   support

           0       0.92      0.97      0.95       748
           1       0.83      0.61      0.70       152

    accuracy                           0.91       900
   macro avg       0.88      0.79      0.82       900
weighted avg       0.91      0.91      0.91       900



### Naive Bayes + TfidfVectorizer



In [45]:
pipe_tf_nb = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', ComplementNB())
    ]
)

In [46]:
parameter_grid = {'tfidf__max_df': np.arange(0.5, 1, 0.1),
                  'tfidf__min_df': np.arange(0.005, 0.01, 0.001),
                  "tfidf__ngram_range": ((1, 1), (1, 2)),
                  "tfidf__norm": ("l1", "l2"),
                  'clf__alpha': [0.1, 0.5, 1.0, 2.0, 5.0],
                  'clf__fit_prior': [True, False]
}

In [47]:
%%time

grid_search_tf_nb = HalvingGridSearchCV(
    pipe_tf_nb,
    param_grid=parameter_grid,
    n_jobs=-1,
    cv=skf,
    scoring='accuracy',
    random_state=42,
)

grid_search_tf_nb.fit(X_train, y_train);

CPU times: user 15.7 s, sys: 1.53 s, total: 17.2 s
Wall time: 4min 55s


In [49]:
grid_search_tf_nb.best_params_

{'clf__alpha': 5.0,
 'clf__fit_prior': True,
 'tfidf__max_df': 0.6,
 'tfidf__min_df': 0.008,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2'}

Результаты для Naive Bayes + TfidfVectorizer:

In [50]:
nb_tf = grid_search_tf_nb.best_estimator_.predict(X_test)
accuracy_nb_tf = accuracy_score(y_test, nb_tf)
recall_nb_tf = recall_score(y_test, nb_tf)

print(classification_report(y_test, nb_tf))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       748
           1       0.86      0.95      0.91       152

    accuracy                           0.97       900
   macro avg       0.93      0.96      0.94       900
weighted avg       0.97      0.97      0.97       900



## Сравнение моделей

Сравним все полученные модели более наглядно:

In [51]:
print("CountVectorizer")
print("")
print("Accuracy")
print(f'LogisticRegression: {round(accuracy_lr_counter, 4)}, DecisionTreeClasifier: {round(accuracy_dclf_counter, 4)}, Naive Bayes: {round(accuracy_nb_counter, 4)}')
print("")
print("Recall")
print(f'LogisticRegression: {round(recall_lr_counter, 4)}, DecisionTreeClasifier: {round(recall_dclf_counter, 4)}, Naive Bayes: {round(recall_nb_counter, 4)}')

CountVectorizer

Accuracy


NameError: ignored

In [52]:
print("TfidfVectorizer")
print("")
print("Accuracy")
print(f'LogisticRegression: {round(accuracy_lr_tf, 4)}, DecisionTreeClasifier: {round(accuracy_dclf_tf, 4)}, Naive Bayes: {round(accuracy_nb_tf, 4)}')
print("")
print("Recall")
print(f'LogisticRegression: {round(recall_lr_tf, 4)}, DecisionTreeClasifier: {round(recall_dclf_tf, 4)}, Naive Bayes: {round(recall_nb_tf, 4)}')

TfidfVectorizer

Accuracy


NameError: ignored

In [None]:
end_time = time.time()

execution_time = end_time - start_time
print(f"Время выполнения ноутбука: {execution_time // 60:.0f} минут(ы) {(execution_time - (execution_time // 60) * 60):.0f} секунд(ы)")

Время выполнения ноутбука: 1.00 минут 55.884095668792725
