In [1]:
#load dataset
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("spam_assassin.csv")

In [2]:
dataset.head()

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5796 non-null   object
 1   target  5796 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.7+ KB


In [4]:
dataAmount = 3115 #len(dataset.target)
data, target = dataset.text[0:dataAmount], dataset.target[0:dataAmount]
print("Amount of spam and not spam items")
print((sum(target), len(target) - sum(target)))

Amount of spam and not spam items
(1011, 2104)


In [5]:
#create extra features
import re
from sklearn.feature_extraction.text import TfidfVectorizer

def addFeature(X, feature_to_add):
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')


len_feature = [len(str) for str in data]
digitsCount_feature = [len(re.findall('[0-9]', string)) for string in data]
letterCount_feature = [len(re.findall('[A-z]', string)) for string in data]
_Count_feature = [string.count(r'_') for string in data]
dollarCount_feature = [string.count(r'$') for string in data]
questionMarkCount_feature = [string.count(r'?') for string in data]
exclamationMarkCount_feature = [string.count(r'!') for string in data]

#vectorization 
tfidf_vectorizer = TfidfVectorizer()
data_vectorized = tfidf_vectorizer.fit_transform(data)
dataTransformed = addFeature(data_vectorized, [len_feature, digitsCount_feature,  letterCount_feature, _Count_feature, dollarCount_feature, questionMarkCount_feature, exclamationMarkCount_feature])

In [6]:
# Devide data on train and test part
X_train, X_test, y_train, y_test = train_test_split(dataTransformed, target, test_size=0.3, random_state=42)

In [7]:
#Removing class imbalance
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(sum(y_train), y_train.shape[0] - sum(y_train)) #amount iems marked as spam and as not spam
print(sum(y_train_smote), y_train_smote.shape[0] - sum(y_train_smote))

720 1460
1460 1460


In [8]:
#dummy Classifierfrom sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix



clf = DummyClassifier(strategy="stratified") 
clf.fit(X_train_smote, y_train_smote)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.5240641711229946
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.52      0.60       644
           1       0.34      0.54      0.41       291

    accuracy                           0.52       935
   macro avg       0.52      0.53      0.51       935
weighted avg       0.60      0.52      0.54       935

Confusion Matrix:
[[333 311]
 [134 157]]


In [9]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_smote, y_train_smote)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9850267379679144
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       644
           1       0.98      0.97      0.98       291

    accuracy                           0.99       935
   macro avg       0.98      0.98      0.98       935
weighted avg       0.99      0.99      0.99       935

Confusion Matrix:
[[638   6]
 [  8 283]]


In [10]:
#gradient boosting 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

clf = GradientBoostingClassifier(random_state = 42)

param_grid = {
    'n_estimators': [100],
    'learning_rate': [0.1, 0.25],
    'max_depth': [3, 10],
}
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train_smote, y_train_smote)

print("Best param:", grid_search.best_params_)
print("Best accuracy (CV):", grid_search.best_score_)

best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best param: {'learning_rate': 0.25, 'max_depth': 3, 'n_estimators': 100}
Best accuracy (CV): 0.991095890410959
Accuracy: 0.9967914438502674
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       644
           1       0.99      1.00      0.99       291

    accuracy                           1.00       935
   macro avg       1.00      1.00      1.00       935
weighted avg       1.00      1.00      1.00       935

Confusion Matrix:
[[642   2]
 [  1 290]]


In [11]:
#Cat Boost Classifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

clf = CatBoostClassifier(iterations=50, learning_rate=0.2, depth=3, loss_function='MultiClass', random_state=42, verbose=0)
clf.fit(X_train_smote, y_train_smote)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9914438502673797
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       644
           1       0.98      0.99      0.99       291

    accuracy                           0.99       935
   macro avg       0.99      0.99      0.99       935
weighted avg       0.99      0.99      0.99       935

Confusion Matrix:
[[638   6]
 [  2 289]]


In [12]:
#ADA boost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

clf = AdaBoostClassifier(random_state=42)
clf.fit(X_train_smote, y_train_smote)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Accuracy: 0.9957219251336898
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       644
           1       0.99      1.00      0.99       291

    accuracy                           1.00       935
   macro avg       0.99      1.00      1.00       935
weighted avg       1.00      1.00      1.00       935

Confusion Matrix:
[[641   3]
 [  1 290]]


In [13]:
# Extra Trees Classifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

clf = ExtraTreesClassifier(random_state=42, max_depth=120, min_samples_split=2, min_samples_leaf=1, class_weight='balanced')
#берем данные с дисбалансом классов. Метод подходит для набора данных с дисбалансом
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9850267379679144
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       644
           1       0.99      0.96      0.98       291

    accuracy                           0.99       935
   macro avg       0.99      0.98      0.98       935
weighted avg       0.99      0.99      0.98       935

Confusion Matrix:
[[641   3]
 [ 11 280]]


In [14]:
#Квадратичный дискриминантный анализ. Смысла нет, но я попробовал
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train_smote.toarray(), y_train_smote)

y_pred = clf.predict(X_test.toarray())

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)



Accuracy: 0.34010695187165774
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.04      0.08       644
           1       0.32      1.00      0.49       291

    accuracy                           0.34       935
   macro avg       0.66      0.52      0.28       935
weighted avg       0.79      0.34      0.21       935



In [15]:
#Light GBM
import lightgbm as lgb

clf = lgb.LGBMClassifier(num_leaves=13, learning_rate=0.06, n_estimators=100)
clf.fit(X_train_smote, y_train_smote)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


[LightGBM] [Info] Number of positive: 1460, number of negative: 1460
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198810
[LightGBM] [Info] Number of data points in the train set: 2920, number of used features: 4607
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Accuracy: 0.9957219251336898
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       644
           1       0.99      1.00      0.99       291

    accuracy                           1.00       935
   macro avg       0.99      1.00      1.00       935
weighted avg       1.00      1.00      1.00       935

Confusion Matrix:
[[641   3]
 [  1 290]]


In [16]:
# SVC
from sklearn.svm import SVC

clf = SVC(kernel='linear', random_state=42)
clf.fit(X_train_smote, y_train_smote)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.8556149732620321
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.90      0.90       644
           1       0.77      0.76      0.77       291

    accuracy                           0.86       935
   macro avg       0.83      0.83      0.83       935
weighted avg       0.85      0.86      0.86       935

Confusion Matrix:
[[579  65]
 [ 70 221]]


In [17]:
#K neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)
dataScalered = scaler.fit_transform(dataTransformed)

X_train_scalered, X_test_scalered, y_train_s, y_test_s = train_test_split(dataScalered, target, test_size=0.3, random_state=42)

smote = SMOTE(random_state=42)
X_train_scalered_smote, y_train_scalered_smote = smote.fit_resample(X_train_scalered, y_train_s)


knn = KNeighborsClassifier(n_neighbors = 20)
knn.fit(X_train_scalered_smote, y_train_scalered_smote)

y_pred = knn.predict(X_test_scalered)

accuracy = accuracy_score(y_test_s, y_pred)
report = classification_report(y_test_s, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print("Confusion Matrix:")
print(confusion_matrix(y_test_s, y_pred))

Accuracy: 0.31122994652406416
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       644
           1       0.31      1.00      0.47       291

    accuracy                           0.31       935
   macro avg       0.16      0.50      0.24       935
weighted avg       0.10      0.31      0.15       935

Confusion Matrix:
[[  0 644]
 [  0 291]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=10, random_state=42)
clf.fit(X_train_smote, y_train_smote)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9679144385026738
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       644
           1       0.96      0.94      0.95       291

    accuracy                           0.97       935
   macro avg       0.97      0.96      0.96       935
weighted avg       0.97      0.97      0.97       935

Confusion Matrix:
[[632  12]
 [ 18 273]]


In [19]:
#Extreem gradient boosting
import xgboost as xgb

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train_smote, y_train_smote)

# Оценка модели
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.9967914438502674
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       644
           1       0.99      1.00      0.99       291

    accuracy                           1.00       935
   macro avg       0.99      1.00      1.00       935
weighted avg       1.00      1.00      1.00       935

Confusion Matrix:
[[641   3]
 [  0 291]]


In [20]:
#Stacking
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

base_classifiers = [
    ('svm', SVC(probability=True)),
    ('dt', DecisionTreeClassifier()),
    ('ext', ExtraTreesClassifier())
]

meta_classifier = LogisticRegression()

stack_model = StackingClassifier(estimators=base_classifiers, final_estimator=meta_classifier, cv=5)
stack_model.fit(X_train_smote, y_train_smote)
y_pred = stack_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9871657754010695
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       644
           1       0.99      0.97      0.98       291

    accuracy                           0.99       935
   macro avg       0.99      0.98      0.98       935
weighted avg       0.99      0.99      0.99       935

Confusion Matrix:
[[640   4]
 [  8 283]]
