In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('combined_data.csv', encoding='latin-1')

df.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-eng')

df = pd.read_csv('combined_data.csv', encoding='latin-1')

df = df.drop_duplicates(subset='text')
df = df.dropna()


def clean_basic(text):
    """Только нижний регистр и удаление лишних пробелов"""
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def clean_no_stopwords(text):
    """Без стоп-слов, но с пунктуацией"""
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text).strip()
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

def clean_full(text):
    """Полная очистка: без пунктуации, чисел, стоп-слов"""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

def clean_with_lemmatization(text):
    """Полная очистка + лемматизация"""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

def clean_with_stemming(text):
    """Полная очистка + стемминг"""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

def clean_keep_special_chars(text):
    """Сохраняем специальные символы ($, !, ?), которые могут быть важны для спама"""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s\$\!\?\*\%\.]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/artemsotnikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/artemsotnikov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Error loading omw-eng: Package 'omw-eng' not found in
[nltk_data]     index


In [3]:
df['text_basic'] = df['text'].apply(clean_basic)
df['text_no_stopwords'] = df['text'].apply(clean_no_stopwords)
df['text_full'] = df['text'].apply(clean_full)
df['text_lemmatized'] = df['text'].apply(clean_with_lemmatization)
df['text_stemmed'] = df['text'].apply(clean_with_stemming)
df['text_special_chars'] = df['text'].apply(clean_keep_special_chars)

sample_idx = 10
print("Оригинал:")
print(df['text'].iloc[sample_idx])
print("\n" + "="*50 + "\n")

print("Только нижний регистр:")
print(df['text_basic'].iloc[sample_idx])
print("\n" + "-"*30 + "\n")

print("Без стоп-слов:")
print(df['text_no_stopwords'].iloc[sample_idx])
print("\n" + "-"*30 + "\n")

print("Полная очистка:")
print(df['text_full'].iloc[sample_idx])
print("\n" + "-"*30 + "\n")

print("С лемматизацией:")
print(df['text_lemmatized'].iloc[sample_idx])
print("\n" + "-"*30 + "\n")

print("Со стеммингом:")
print(df['text_stemmed'].iloc[sample_idx])
print("\n" + "-"*30 + "\n")

print("Со спецсимволами:")
print(df['text_special_chars'].iloc[sample_idx])

Оригинал:
my dear fellow do you feel insecure about your penis size you need our new improved megadik penis enlargement pills please read on did you know megadik was featured in leading mens magazines such as fhm maxim plus many others and rated no escapenumber choice for penis enlargement Â» gain escapenumber inches in length Â» increase your penis width girth by upto escapenumber Â» produce stronger rock hard erections Â» escapenumber safe to take with no side effects Â» doctor approved and recommended Â» fast shipping worldwide you have nothing to lose just a lot to gain http slasy net regards escapelong remains in escapelong use escapenumberf milliescapenumberns escapenumberf peescapenumberple in this wescapenumberrld i am escapelong man and alescapenumberng with all thescapenumberse milliescapenumberns although far from perfect especially in that it precludes a vast waldron


Только нижний регистр:
my dear fellow do you feel insecure about your penis size you need our new improved

In [4]:
df.head()

Unnamed: 0,label,text,text_basic,text_no_stopwords,text_full,text_lemmatized,text_stemmed,text_special_chars
0,1,ounce feather bowl hummingbird opec moment ala...,ounce feather bowl hummingbird opec moment ala...,ounce feather bowl hummingbird opec moment ala...,ounce feather bowl hummingbird opec moment ala...,ounce feather bowl hummingbird opec moment ala...,ounc feather bowl hummingbird opec moment alab...,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...,wulvob get your medircations online qnb ikud v...,wulvob get medircations online qnb ikud viagra...,wulvob get medircations online qnb ikud viagra...,wulvob get medircations online qnb ikud viagra...,wulvob get medirc onlin qnb ikud viagra escape...,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...,computer connection from cnn com wednesday esc...,computer connection cnn com wednesday escapenu...,computer connection cnn com wednesday escapenu...,computer connection cnn com wednesday escapenu...,comput connect cnn com wednesday escapenumb ma...,computer connection from cnn com wednesday esc...
3,1,university degree obtain a prosperous future m...,university degree obtain a prosperous future m...,university degree obtain prosperous future mon...,university degree obtain prosperous future mon...,university degree obtain prosperous future mon...,univers degre obtain prosper futur money earn ...,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...,thanks for all your answers guys i know i shou...,thanks answers guys know checked rsync manual ...,thanks answers guys know checked rsync manual ...,thanks answer guy know checked rsync manual wo...,thank answer guy know check rsync manual would...,thanks for all your answers guys i know i shou...


In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec
import spacy
import warnings
warnings.filterwarnings('ignore')

nltk.download('punkt')
nltk.download('punkt_tab')

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

preprocess_cols = [
    'text_basic',
    'text_no_stopwords',
    'text_full',
    'text_lemmatized',
    'text_stemmed',
    'text_special_chars'
]

models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVM (linear)': LinearSVC(max_iter=2000, dual=False, random_state=42, C=1.0),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42)
}

embeddings = ['Word2Vec', 'spaCy', 'TF-IDF']

all_results = {}

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/artemsotnikov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/artemsotnikov/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

def run_experiment(col_name, df):
    print(f"Эксперимент для предобработки: {col_name.upper()}")

    print("Обучение Word2Vec...")
    tokens = df[col_name].apply(word_tokenize).tolist()
    w2v_model = Word2Vec(sentences=tokens, vector_size=150, window=5, min_count=2, workers=4, epochs=10)

    def avg_w2v(tokens_list):
        vecs = [w2v_model.wv[t] for t in tokens_list if t in w2v_model.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(150)

    X_w2v = np.array([avg_w2v(t) for t in tokens])

    print("  spaCy векторы...")
    def safe_spacy_vector(text):
        text = str(text).strip()
        if not text:
            return np.zeros(96)
        doc = nlp(text)
        return doc.vector if len(doc.vector) > 0 else np.zeros(96)

    X_spacy = np.array([safe_spacy_vector(text) for text in df[col_name]])

    print("  TF-IDF (sparse)...")
    tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
    X_tfidf_sparse = tfidf.fit_transform(df[col_name])

    y = df['label']
    results = {}

    for emb_name, X in [('Word2Vec', X_w2v), ('spaCy', X_spacy)]:
        print(f"  → {emb_name}")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        results[emb_name] = {}

        model_list = [
            ('LogisticRegression', LogisticRegression(max_iter=1000)),
            ('SVM (linear)', LinearSVC(max_iter=2000, dual=False, random_state=42)),
            ('RandomForest', RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42))
        ]

        for name, model in model_list:
            model.fit(X_train, y_train)
            pred = model.predict(X_test)
            acc = accuracy_score(y_test, pred)
            results[emb_name][name] = acc
            print(f"     {name}: {acc:.4f}")

    print(f"  → TF-IDF (sparse)")
    X_train, X_test, y_train, y_test = train_test_split(
        X_tfidf_sparse, y, test_size=0.2, random_state=42, stratify=y
    )
    results['TF-IDF'] = {}

    tfidf_models = [
        ('LogisticRegression', LogisticRegression(max_iter=1000, solver='saga')),
        ('SVM (linear)', LinearSVC(max_iter=2000, dual=False, random_state=42))
    ]

    for name, model in tfidf_models:
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        acc = accuracy_score(y_test, pred)
        results['TF-IDF'][name] = acc
        print(f"     {name}: {acc:.4f}")

    results['TF-IDF']['RandomForest'] = None

    return results

In [11]:
for col in preprocess_cols:
    all_results[col] = run_experiment(col, df)


Эксперимент для предобработки: TEXT_BASIC

Обучение Word2Vec...


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


  spaCy векторы...
  TF-IDF (sparse)...
  → Word2Vec
     LogisticRegression: 0.9714
     SVM (linear): 0.9703
     RandomForest: 0.9840
  → spaCy
     LogisticRegression: 0.8537
     SVM (linear): 0.8537
     RandomForest: 0.9380
  → TF-IDF (sparse)
     LogisticRegression: 0.9853
     SVM (linear): 0.9901

Эксперимент для предобработки: TEXT_NO_STOPWORDS

Обучение Word2Vec...


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


  spaCy векторы...
  TF-IDF (sparse)...
  → Word2Vec
     LogisticRegression: 0.9729
     SVM (linear): 0.9731
     RandomForest: 0.9860
  → spaCy
     LogisticRegression: 0.8120
     SVM (linear): 0.8116
     RandomForest: 0.9179
  → TF-IDF (sparse)
     LogisticRegression: 0.9861
     SVM (linear): 0.9884

Эксперимент для предобработки: TEXT_FULL

Обучение Word2Vec...


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


  spaCy векторы...
  TF-IDF (sparse)...
  → Word2Vec
     LogisticRegression: 0.9725
     SVM (linear): 0.9723
     RandomForest: 0.9838
  → spaCy
     LogisticRegression: 0.8104
     SVM (linear): 0.8091
     RandomForest: 0.8934
  → TF-IDF (sparse)
     LogisticRegression: 0.9844
     SVM (linear): 0.9876

Эксперимент для предобработки: TEXT_LEMMATIZED

Обучение Word2Vec...


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


  spaCy векторы...
  TF-IDF (sparse)...
  → Word2Vec
     LogisticRegression: 0.9721
     SVM (linear): 0.9711
     RandomForest: 0.9840
  → spaCy
     LogisticRegression: 0.8090
     SVM (linear): 0.8081
     RandomForest: 0.8939
  → TF-IDF (sparse)
     LogisticRegression: 0.9839
     SVM (linear): 0.9875

Эксперимент для предобработки: TEXT_STEMMED

Обучение Word2Vec...


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


  spaCy векторы...
  TF-IDF (sparse)...
  → Word2Vec
     LogisticRegression: 0.9711
     SVM (linear): 0.9702
     RandomForest: 0.9826
  → spaCy
     LogisticRegression: 0.7974
     SVM (linear): 0.7956
     RandomForest: 0.8809
  → TF-IDF (sparse)
     LogisticRegression: 0.9843
     SVM (linear): 0.9882

Эксперимент для предобработки: TEXT_SPECIAL_CHARS

Обучение Word2Vec...


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


  spaCy векторы...
  TF-IDF (sparse)...
  → Word2Vec
     LogisticRegression: 0.9697
     SVM (linear): 0.9695
     RandomForest: 0.9851
  → spaCy
     LogisticRegression: 0.8585
     SVM (linear): 0.8575
     RandomForest: 0.9343
  → TF-IDF (sparse)
     LogisticRegression: 0.9839
     SVM (linear): 0.9898


In [14]:
print(f"{'Предобработка':<22} {'Word2Vec (best)':<18} {'spaCy (best)':<16} {'TF-IDF (best)':<16}")
print("-" * 72)

for col in preprocess_cols:
    row = f"{col:<22}"

    # Word2Vec
    w2v_accs = [v for v in all_results[col]['Word2Vec'].values() if v is not None]
    best_w2v = max(w2v_accs) if w2v_accs else 0
    row += f"{best_w2v:.4f} ({list(all_results[col]['Word2Vec'].keys())[w2v_accs.index(best_w2v)]})".ljust(18)

    # spaCy
    spacy_accs = [v for v in all_results[col]['spaCy'].values() if v is not None]
    best_spacy = max(spacy_accs) if spacy_accs else 0
    row += f"{best_spacy:.4f} ({list(all_results[col]['spaCy'].keys())[spacy_accs.index(best_spacy)]})".ljust(16)

    # TF-IDF
    tfidf_accs = [v for v in all_results[col]['TF-IDF'].values() if v is not None]
    best_tfidf = max(tfidf_accs) if tfidf_accs else 0
    best_tfidf_model = list(all_results[col]['TF-IDF'].keys())[tfidf_accs.index(best_tfidf)]
    row += f"{best_tfidf:.4f} ({best_tfidf_model})".ljust(16)

    print(row)

print("ЛУЧШИЕ РЕЗУЛЬТАТЫ ПО КАЖДОЙ ПРЕДОБРАБОТКЕ И ГЛОБАЛЬНЫЙ ПОБЕДИТЕЛЬ")

best_global_acc = 0
best_global_combo = None

for col in preprocess_cols:
    print(f"\n{col.upper()}:")
    for emb in ['Word2Vec', 'spaCy', 'TF-IDF']:
        if emb in all_results[col]:
            for model_name, acc in all_results[col][emb].items():
                if acc is not None:
                    print(f"  • {emb} + {model_name}: {acc:.4f}")
                    if acc > best_global_acc:
                        best_global_acc = acc
                        best_global_combo = (col, emb, model_name)


print(f"   {best_global_combo[0]} + {best_global_combo[1]} + {best_global_combo[2]}")
print(f"   Accuracy: {best_global_acc:.4f}")

Предобработка          Word2Vec (best)    spaCy (best)     TF-IDF (best)   
------------------------------------------------------------------------
text_basic            0.9840 (RandomForest)0.9380 (RandomForest)0.9901 (SVM (linear))
text_no_stopwords     0.9860 (RandomForest)0.9179 (RandomForest)0.9884 (SVM (linear))
text_full             0.9838 (RandomForest)0.8934 (RandomForest)0.9876 (SVM (linear))
text_lemmatized       0.9840 (RandomForest)0.8939 (RandomForest)0.9875 (SVM (linear))
text_stemmed          0.9826 (RandomForest)0.8809 (RandomForest)0.9882 (SVM (linear))
text_special_chars    0.9851 (RandomForest)0.9343 (RandomForest)0.9898 (SVM (linear))

ЛУЧШИЕ РЕЗУЛЬТАТЫ ПО КАЖДОЙ ПРЕДОБРАБОТКЕ И ГЛОБАЛЬНЫЙ ПОБЕДИТЕЛЬ

TEXT_BASIC:
  • Word2Vec + LogisticRegression: 0.9714
  • Word2Vec + SVM (linear): 0.9703
  • Word2Vec + RandomForest: 0.9840
  • spaCy + LogisticRegression: 0.8537
  • spaCy + SVM (linear): 0.8537
  • spaCy + RandomForest: 0.9380
  • TF-IDF + LogisticRegression: 0.9