In [249]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("spam_assassin.csv")

In [250]:
dataset.head()

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [251]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5796 non-null   object
 1   target  5796 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.7+ KB


In [252]:
dataAmount = 1500
data, target = dataset.text[0:dataAmount], dataset.target[0:dataAmount]
print((sum(target), len(target) - sum(target)))

(494, 1006)


In [253]:
#create extra features
import re
from sklearn.feature_extraction.text import TfidfVectorizer

def addFeature(X, feature_to_add):
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')


len_feature = [len(str) for str in data]
digitsCount_feature = [len(re.findall('[0-9]', string)) for string in data]
letterCount_feature = [len(re.findall('[A-z]', string)) for string in data]
_Count_feature = [string.count(r'_') for string in data]
dollarCount_feature = [string.count(r'$') for string in data]
questionMarkCount_feature = [string.count(r'?') for string in data]
exclamationMarkCount_feature = [string.count(r'!') for string in data]

tfidf_vectorizer = TfidfVectorizer()
data_vectorized = tfidf_vectorizer.fit_transform(data)
dataTransformed = addFeature(data_vectorized, [len_feature, digitsCount_feature,  letterCount_feature, _Count_feature, dollarCount_feature, questionMarkCount_feature, exclamationMarkCount_feature])


In [254]:
# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(dataTransformed, target, test_size=0.2, random_state=42)

In [255]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print((X_train_smote.shape[0], y_train_smote.shape[0]))

(1606, 1606)


In [256]:
#dummy Classifier
from sklearn.dummy import DummyClassifier

clf = DummyClassifier(strategy="stratified") 
clf.fit(X_train_smote, y_train_smote)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.4866666666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.47      0.55       203
           1       0.32      0.53      0.40        97

    accuracy                           0.49       300
   macro avg       0.50      0.50      0.48       300
weighted avg       0.56      0.49      0.50       300



In [257]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_smote, y_train_smote)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.9766666666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       203
           1       0.98      0.95      0.96        97

    accuracy                           0.98       300
   macro avg       0.98      0.97      0.97       300
weighted avg       0.98      0.98      0.98       300



In [258]:
#gradient boosting 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

clf = GradientBoostingClassifier(random_state = 42)

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.25],
    'max_depth': [3, 10],
}
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train_smote, y_train_smote)

# Вывод лучших параметров и точности
print("Лучшие параметры:", grid_search.best_params_)
print("Лучшая точность (CV):", grid_search.best_score_)

# Использование лучшей модели для прогнозирования
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Лучшие параметры: {'learning_rate': 0.25, 'max_depth': 3, 'n_estimators': 100}
Лучшая точность (CV): 0.9925349741684565
Accuracy: 0.9966666666666667
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       203
           1       1.00      0.99      0.99        97

    accuracy                           1.00       300
   macro avg       1.00      0.99      1.00       300
weighted avg       1.00      1.00      1.00       300



In [259]:
#Cat Boost Classifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score

clf = CatBoostClassifier(iterations=50, learning_rate=0.2, depth=3, random_state=42, verbose=0)
clf.fit(X_train_smote, y_train_smote)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.99
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       203
           1       1.00      0.97      0.98        97

    accuracy                           0.99       300
   macro avg       0.99      0.98      0.99       300
weighted avg       0.99      0.99      0.99       300



In [260]:
#ADA boost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score

clf = AdaBoostClassifier(random_state=42)
clf.fit(X_train_smote, y_train_smote)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)



Accuracy: 0.9933333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       203
           1       1.00      0.98      0.99        97

    accuracy                           0.99       300
   macro avg       1.00      0.99      0.99       300
weighted avg       0.99      0.99      0.99       300



In [261]:
# Extra Trees Classifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score

clf = ExtraTreesClassifier(random_state=42)
clf.fit(X_train_smote, y_train_smote)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.9766666666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       203
           1       0.98      0.95      0.96        97

    accuracy                           0.98       300
   macro avg       0.98      0.97      0.97       300
weighted avg       0.98      0.98      0.98       300



In [262]:
#Квадратичный дискриминантный анализ
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train_smote, y_train_smote)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

TypeError: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.