In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

import nltk
import string
from nltk.corpus import stopwords
# The stop-word list ships as a separate NLTK corpus. Fetch it on first use so
# the notebook also runs in an environment where it was never downloaded.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
# Set of the single ASCII punctuation characters from the standard library.
punctuation = set(string.punctuation)
from nltk.tokenize import TreebankWordTokenizer

import fasttext
from sklearn.neighbors import KNeighborsClassifier

import torch
from sentence_transformers import SentenceTransformer
from sklearn.neural_network import MLPClassifier
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
import evaluate
from datasets import Dataset, DatasetDict

In [None]:
# Load the tab-separated corpus; the file has no header row, so the two
# column names are supplied explicitly.
spamhamdata = pd.read_csv(
    "spamhamdata.xls",
    sep='\t',
    header=None,
    names=['label', 'text'],
)

In [None]:
# Preview the first five rows of the loaded frame.
spamhamdata.head(n=5)

In [None]:
# Encode the string labels as integers (ham -> 0, spam -> 1).
# The column is overwritten in place, so the mapping is guarded: applying it a
# second time to already-encoded values would turn every label into NaN.
label_to_id = {'ham': 0, 'spam': 1}
if not pd.api.types.is_numeric_dtype(spamhamdata['label']):
    spamhamdata['label'] = spamhamdata['label'].map(label_to_id)

In [None]:
# TF-IDF + NB (наивный Байес)

In [None]:
# Independent copy for the Naive Bayes experiment, so columns added below do
# not appear on the shared frame.
dataNB = spamhamdata.copy(deep=True)

In [None]:
# Dedicated handle for the Treebank tokenizer. `preprocess` binds to this name
# rather than to the generic `tokenizer`, because the notebook later rebinds
# `tokenizer` to a Hugging Face tokenizer; re-running a preprocessing cell after
# that point would otherwise silently tokenize with the wrong object.
treebank_tokenizer = TreebankWordTokenizer()
tokenizer = treebank_tokenizer  # kept so existing references to `tokenizer` still work

def preprocess(text):
    """Lower-case `text`, tokenize it and drop stop words and punctuation tokens.

    Args:
        text: Raw message; any value is accepted and converted with `str()`.

    Returns:
        The remaining tokens joined by single spaces.
    """
    words = treebank_tokenizer.tokenize(str(text).lower())
    # `punctuation` holds single characters, so only tokens that are exactly
    # one punctuation mark are removed here.
    kept_words = [word for word in words if word not in stop_words and word not in punctuation]
    return " ".join(kept_words)

In [None]:
# Clean every message element-wise and store the result next to the raw text.
dataNB['text_clean'] = dataNB['text'].map(preprocess)

In [None]:
# Spot-check one cleaned message (row label 42).
dataNB.loc[42, 'text_clean']

In [None]:
# Hold out 20% of the cleaned messages for evaluation; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataNB['text_clean'],
    dataNB['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the TF-IDF vocabulary and weights from the training texts only, then
# project the test texts into the same feature space.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit the multinomial Naive Bayes model; the trailing expression displays the
# fitted estimator as the cell output.
classifier = MultinomialNB().fit(X_train_tfidf, y_train)
classifier

In [None]:
# Predict labels for the held-out TF-IDF features.
y_pred = classifier.predict(X=X_test_tfidf)

In [None]:
# Share of held-out messages whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Точность NB: {accuracy}')

In [None]:
# FastText + KNN

In [None]:
# Independent copy for the fastText + KNN experiment.
dataKNN = spamhamdata.copy(deep=True)

In [None]:
# Same cleaning step as in the Naive Bayes section, applied to this copy.
dataKNN['text_clean'] = dataKNN['text'].map(preprocess)

In [None]:
# fastText is trained with supervision, so its training file must contain only
# training rows. Writing every row would let the embeddings see the labels of
# the messages that are later held out to score the KNN classifier (leakage).
#
# A non-stratified split is determined by the row count, test_size and
# random_state alone. Using the same values as the split applied to the
# embedding matrix further down therefore selects exactly the same training
# rows. Keep the two calls in sync.
row_positions = np.arange(len(dataKNN))
ft_train_positions, _ = train_test_split(row_positions, test_size=0.2, random_state=42)
ft_train_rows = dataKNN.iloc[ft_train_positions]

with open("ft_data.txt", "w", encoding="utf-8") as f:
    # One example per line, written as "__label__<class> <text>".
    for label, text in zip(ft_train_rows['label'], ft_train_rows['text_clean']):
        f.write(f"__label__{label} {text}\n")

In [None]:
# Train the supervised fastText model on the labelled file ft_data.txt.
model = fasttext.train_supervised(
    input="ft_data.txt",
    lr=0.5,
    epoch=25,
    wordNgrams=2,
    bucket=200000,
    dim=50,
    loss='ova',
)

In [None]:
# Embed every cleaned message with the fastText model (one vector per row) and
# pull the labels out as a plain array.
X = np.array(list(map(model.get_sentence_vector, dataKNN['text_clean'])))
y = dataKNN['label'].to_numpy()

In [None]:
# Hold out 20% of the embedded messages for evaluation, using the same
# test_size and seed as the Naive Bayes split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a k-nearest-neighbours classifier with the library defaults on the
# training embeddings, then score it on the held-out ones.
knn_clf = KNeighborsClassifier().fit(X_train, y_train)
knn_clf_pred_res = knn_clf.predict(X_test)
knn_clf_accuracy = accuracy_score(y_true=y_test, y_pred=knn_clf_pred_res)

print(f'Точность KNN: {knn_clf_accuracy}')

In [None]:
# BERT + FNN

In [None]:
# Independent copy for the BERT fine-tuning experiment.
dataFNN = spamhamdata.copy(deep=True)

In [None]:
# Display the copied frame as the cell output.
dataFNN

In [None]:
# Stratified 80/20 split, so both partitions keep the same ham/spam ratio.
train_df, test_df = train_test_split(dataFNN, test_size=0.2, stratify=dataFNN["label"], random_state=42)

# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra "__index_level_0__" column in the datasets.
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False)
})

# NOTE: from this point on `tokenizer` refers to the BERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    """Tokenize the "text" field of a batch with truncation enabled.

    No padding is requested here; batches are padded by the data collator
    created below.
    """
    return tokenizer(example["text"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load the BERT checkpoint for sequence classification with two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a (logits, labels) pair.

    The predicted class of each example is the index of its largest logit.
    """
    logits, labels = eval_pred
    return accuracy.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

In [None]:
# Fine-tuning configuration. fp16 mixed precision needs a CUDA device, so it is
# enabled only when one is present; hard-coding it makes the notebook fail on
# CPU-only machines.
training_args = TrainingArguments(
    output_dir="./bert-finetune",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

In [None]:
# Wire the model, tokenized splits, collator and metric into a Trainer, then
# run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Evaluate on the Trainer's eval dataset and report the accuracy produced by
# compute_metrics (exposed under the "eval_accuracy" key).
results = trainer.evaluate()
print(f"Точность BERT: {results['eval_accuracy']}")