# Установка библиотек

In [None]:
!pip install pandas
!pip install nltk
!pip install scikit-learn
!pip install spacy

In [None]:
!python -m spacy download en_core_web_sm

In [3]:
import pandas as pd
import numpy as np
import itertools

import spacy
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, classification_report

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Загрузка данных

In [4]:
data = pd.read_csv('spam_or_not_spam.csv')
data.dropna(inplace=True)
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
Index: 2999 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   2999 non-null   object
 1   label   2999 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 70.3+ KB


Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


# Обработка текстовых данных

In [5]:
nlp = spacy.load("en_core_web_sm")
data['text'] = data['email'].apply(
    lambda x: ' '.join(
        token.lemma_.lower() for token in nlp(x) if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    )
)

X = data['text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3000)

# Векторизация текстовых данных с использованием CountVectorizer и TfidfVectorizer

In [6]:
count_vectorizer = CountVectorizer(max_df=0.7, min_df=0.003, stop_words='english')
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

tfidf_vectorizer = TfidfVectorizer(max_df=0.7, min_df=0.003, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Сравнение способов векторизации на примере классификатора MultinomialNB

In [7]:
nb_count = MultinomialNB()
nb_count.fit(X_train_count, y_train)
y_pred_count = nb_count.predict(X_test_count)

nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = nb_tfidf.predict(X_test_tfidf)

f1_count = f1_score(y_test, y_pred_count, average='weighted')
f1_tfidf = f1_score(y_test, y_pred_tfidf, average='weighted')

report_count = classification_report(y_test, y_pred_count)
report_tfidf = classification_report(y_test, y_pred_tfidf)

print("CountVectorizer F1 Score:", f1_count)
print("TfidfVectorizer F1 Score:", f1_tfidf)

print("CountVectorizer classification report:")
print(report_count)

print("TfidfVectorizer classification report:")
print(report_tfidf)

CountVectorizer F1 Score: 0.9812696932972873
TfidfVectorizer F1 Score: 0.9667954545454545
CountVectorizer classification report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       630
           1       0.95      0.93      0.94       120

    accuracy                           0.98       750
   macro avg       0.97      0.96      0.97       750
weighted avg       0.98      0.98      0.98       750

TfidfVectorizer classification report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       630
           1       0.98      0.82      0.89       120

    accuracy                           0.97       750
   macro avg       0.97      0.91      0.94       750
weighted avg       0.97      0.97      0.97       750


In [8]:
best_vectorizer = "CountVectorizer" if f1_count > f1_tfidf else "TfidfVectorizer"
print(f"Best vectorization method: {best_vectorizer}")

Best vectorization method: CountVectorizer


# Каждая модель обучается с разными комбинациями параметров, а затем оценивается с использованием кросс-валидации

In [9]:
models = {
    "DecisionTree": (DecisionTreeClassifier(random_state=1000), {
        'max_depth': [None, 10, 20, 30, 50, 100],
        'min_samples_split': [2, 5, 10, 20, 50],
        'min_samples_leaf': [1, 2, 4, 8],
        'criterion': ["gini", "entropy"]
    }),
    "LogisticRegression": (LogisticRegression(max_iter=10000), {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
        'solver': ['liblinear', 'lbfgs', 'newton-cg'],
        'class_weight': [None, 'balanced']
    }),
    "NaiveBayes": (MultinomialNB(), {
        'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0]
    })
}

vectorizers = {
    "CountVectorizer": X_train_count,
    "TfidfVectorizer": X_train_tfidf
}

results = {}

combinations = list(itertools.product(vectorizers.items(), models.items()))

for (vectorizer_name, X_train_vec), (model_name, (model, param_grid)) in combinations:
    print(f"Training {model_name} using {vectorizer_name}...")

    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
    grid_search.fit(X_train_vec, y_train)
    best_model = grid_search.best_estimator_

    cv_scores = cross_val_score(best_model, X_train_vec, y_train, cv=5)
    mean_cv_score = np.mean(cv_scores)

    results[f"{model_name} with {vectorizer_name}"] = {
        'best_model': best_model,
        'best_params': grid_search.best_params_,
        'cv_scores': cv_scores,
        'mean_cv_score': mean_cv_score
    }

for model_name, result in results.items():
    print(f"{model_name} Best parameters: {result['best_params']}")
    print(f"{model_name} Cross-validation scores: {result['cv_scores']}")
    print(f"{model_name} Mean cross-validation score: {result['mean_cv_score']}")

Training DecisionTree using CountVectorizer...
Training LogisticRegression using CountVectorizer...
Training NaiveBayes using CountVectorizer...
Training DecisionTree using TfidfVectorizer...
Training LogisticRegression using TfidfVectorizer...
Training NaiveBayes using TfidfVectorizer...
DecisionTree with CountVectorizer Best parameters: {'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 2}
DecisionTree with CountVectorizer Cross-validation scores: [0.96222222 0.94222222 0.95333333 0.95777778 0.94654788]
DecisionTree with CountVectorizer Mean cross-validation score: 0.9524206879485275
LogisticRegression with CountVectorizer Best parameters: {'C': 0.1, 'class_weight': 'balanced', 'solver': 'liblinear'}
LogisticRegression with CountVectorizer Cross-validation scores: [0.98222222 0.98666667 0.98444444 0.97777778 0.98663697]
LogisticRegression with CountVectorizer Mean cross-validation score: 0.9835496164315763
NaiveBayes with CountVectorizer Best parameter

# Сравнение качества обученных моделей

In [10]:
best_f1 = 0  
best_model_name = "" 

for model_name, result in results.items():
    best_model = result['best_model']
    X_test_vec = X_test_count if "CountVectorizer" in model_name else X_test_tfidf
    y_pred = best_model.predict(X_test_vec)
    f1 = f1_score(y_test, y_pred, average='weighted')

    if f1 > best_f1:
        best_f1 = f1
        best_model_name = model_name

    report = classification_report(y_test, y_pred)

    print(f"{model_name} Test f1: {f1}")
    print(f"{model_name} Test classification report:")
    print(report)

DecisionTree with CountVectorizer Test f1: 0.9549647542486961
DecisionTree with CountVectorizer Test classification report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       630
           1       0.91      0.81      0.85       120

    accuracy                           0.96       750
   macro avg       0.94      0.90      0.91       750
weighted avg       0.96      0.96      0.95       750

LogisticRegression with CountVectorizer Test f1: 0.9946666666666667
LogisticRegression with CountVectorizer Test classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       630
           1       0.98      0.98      0.98       120

    accuracy                           0.99       750
   macro avg       0.99      0.99      0.99       750
weighted avg       0.99      0.99      0.99       750

NaiveBayes with CountVectorizer Test f1: 0.9824541768430038
NaiveBayes with CountVecto

In [11]:
print(f"The best model is: {best_model_name} with test F1 score: {best_f1}")

The best model is: LogisticRegression with CountVectorizer with test F1 score: 0.9946666666666667
