### Импорт библиотек

In [None]:
# Imports and setup
import os

import pandas as pd
import numpy as np
import re
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Load the spaCy pipeline "en_core_web_sm" once at notebook level;
# preprocess_text calls it on every message for stop-word flags and lemmas.
nlp = spacy.load("en_core_web_sm")

Note: you may need to restart the kernel to use updated packages.


### Загрузка данных для обучения и тестирования

In [2]:
# Root of the dataset. The loop below reads <root>/<folder>/{spam,ham}/<file>.
dataset_path = "E:/ML_NLP/spam_ham_dataset"

data = []

# Walk every sub-folder of the dataset root (originally noted as folders 1/ - 6/).
# os.listdir returns entries in arbitrary order, so each listing is sorted to
# keep the DataFrame row order (and any seeded split built on it) reproducible.
for folder in sorted(os.listdir(dataset_path)):
    folder_path = os.path.join(dataset_path, folder)
    if not os.path.isdir(folder_path):
        continue

    for label in ['spam', 'ham']:
        label_path = os.path.join(folder_path, label)
        if not os.path.isdir(label_path):
            continue

        for filename in sorted(os.listdir(label_path)):
            file_path = os.path.join(label_path, filename)
            try:
                # errors='ignore' drops undecodable bytes instead of raising,
                # so only filesystem-level failures can occur here.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
            except OSError as e:
                # Best effort: report the unreadable entry and keep going.
                print(f"Ошибка с файлом {file_path}: {e}")
            else:
                data.append({'text': text, 'label': label})

# Convert to a DataFrame; explicit columns keep the schema even if nothing was read.
df = pd.DataFrame(data, columns=['text', 'label'])
df.head()


Unnamed: 0,text,label
0,Subject: dobmeos with hgh my energy level has ...,spam
1,Subject: your prescription is ready . . oxwq s...,spam
2,Subject: get that new car 8434\npeople nowthe ...,spam
3,"Subject: await your response\ndear partner ,\n...",spam
4,"Subject: coca cola , mbna america , nascar par...",spam


### Функция предобработки текста

In [3]:
def preprocess_text(text: str) -> str:
    """Normalize one raw e-mail into a space-separated string of lemmas.

    Steps: collapse whitespace runs, replace every character that is not an
    ASCII letter or apostrophe with a space, drop a leading ``Subject`` header
    word, lowercase, run the notebook-level spaCy pipeline ``nlp``, and keep
    the lemmas of alphabetic tokens that are not stop words.

    Args:
        text: Raw message text.

    Returns:
        The kept lemmas joined by single spaces (empty string if none remain).
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"[^a-zA-Z']", ' ', text)

    # By now the header colon has already been turned into a space, so only the
    # word itself has to be removed. Leading whitespace is tolerated because a
    # message starting with a newline, BOM or any other non-letter begins with a
    # space at this point; \b keeps words such as "Subjective" intact.
    text = re.sub(r'^\s*Subject\b\s*', '', text, flags=re.IGNORECASE)

    # Lowercase and run the spaCy pipeline
    doc = nlp(text.lower())

    # Drop stop words and non-alphabetic tokens, keep lemmas
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return ' '.join(tokens)

# Clean every message and store the result in a new column
df['text_cleaned'] = df['text'].map(preprocess_text)

# Quick side-by-side look at raw and cleaned text
df.loc[:, ['text', 'text_cleaned']].head()

Unnamed: 0,text,text_cleaned
0,Subject: dobmeos with hgh my energy level has ...,dobmeo hgh energy level go stukm introduce doc...
1,Subject: your prescription is ready . . oxwq s...,prescription ready oxwq s f e low cost prescri...
2,Subject: get that new car 8434\npeople nowthe ...,new car people nowthe weather climate particul...
3,"Subject: await your response\ndear partner ,\n...",await response dear partner team government of...
4,"Subject: coca cola , mbna america , nascar par...",coca cola mbna america nascar partner otcbb im...


### Векторизация очищенного текста

In [8]:
# TF-IDF features over the cleaned text, capped at 3000 terms.
# fit_transform here learns vocabulary and IDF weights from all rows of df.
cleaned_texts = df['text_cleaned']
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(cleaned_texts)

# Encode class labels as integers (ham -> 0, spam -> 1)
label_codes = {'ham': 0, 'spam': 1}
y = df['label'].map(label_codes)

### Деление на обучающую и тестовую выборки

In [9]:
# Split the cleaned texts (not the already-vectorized matrix) so that the
# TF-IDF vocabulary and IDF weights can be learned from training rows only;
# fitting on all rows before splitting leaks test-set statistics into training.
# Same test_size / random_state / stratify as before, so the row partition and
# y_train / y_test are unchanged.
train_texts, test_texts, y_train, y_test = train_test_split(
    df['text_cleaned'], y, test_size=0.2, random_state=42, stratify=y
)

# Re-fit the existing vectorizer on the training texts, then only transform
# the held-out texts with that fitted vocabulary.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)