In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

import nltk
import string
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)
from nltk.tokenize import TreebankWordTokenizer

import fasttext
from sklearn.neighbors import KNeighborsClassifier

from sentence_transformers import SentenceTransformer
from sklearn.neural_network import MLPClassifier




In [2]:
spamhamdata = pd.read_csv("spamhamdata.csv", sep='\t', header=None, names=['label', 'text'])

In [3]:
spamhamdata.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
spamhamdata['label'] = spamhamdata['label'].map({'ham': 0, 'spam': 1})

In [5]:
#TF-IDF + NB (наивный баес)

In [6]:
dataNB = spamhamdata.copy()

In [7]:
tokenizer = TreebankWordTokenizer()

def preprocess(text):
    text = str(text).lower()
    words = tokenizer.tokenize(text)
    filtered_words = [word for word in words if word not in stop_words and word not in punctuation]
    return " ".join(filtered_words) 

In [8]:
dataNB['text_clean'] = dataNB['text'].apply(preprocess)

In [9]:
dataNB['text_clean'][42]

'07732584351 rodger burns msg tried call reply sms free nokia mobile free camcorder. please call 08000930705 delivery tomorrow'

In [10]:
X_train, X_test, y_train, y_test = train_test_split(dataNB['text_clean'], dataNB['label'], test_size=0.2, random_state=42)

In [11]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

In [12]:
y_pred = classifier.predict(X_test_tfidf)

In [13]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Точность NB: {accuracy}')

Точность NB: 0.979372197309417


In [14]:
# FastText + KNN

In [15]:
dataKNN = spamhamdata.copy()

In [16]:
dataKNN['text_clean'] = dataKNN['text'].apply(preprocess)

In [17]:
dataKNN['text_clean'][42]

'07732584351 rodger burns msg tried call reply sms free nokia mobile free camcorder. please call 08000930705 delivery tomorrow'

In [18]:
with open("ft_data.txt", "w", encoding="utf-8") as f:
    for label, text in zip(dataKNN['label'], dataKNN['text_clean']):
        f.write(f"__label__{label} {text}\n")

In [19]:
model = fasttext.train_supervised(input="ft_data.txt", lr=0.5, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='ova')

In [20]:
X = np.array([model.get_sentence_vector(text) for text in dataKNN['text_clean']])
y = dataKNN['label'].values

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
knn_clf_pred_res = knn_clf.predict(X_test)
knn_clf_accuracy = accuracy_score(y_test, knn_clf_pred_res)

print(f'Точность KNN: {knn_clf_accuracy}')

Точность KNN: 1.0


In [23]:
# BERT + FNN

In [24]:
dataFNN = spamhamdata.copy()

In [25]:
dataFNN['text_clean'] = dataFNN['text'].apply(preprocess)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(dataFNN['text'], dataFNN['label'], test_size=0.2, random_state=42)

In [27]:
model = SentenceTransformer('bert-base-uncased')

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


In [28]:
X_train_emb = model.encode(X_train.tolist(), convert_to_tensor=True, show_progress_bar=True)
X_test_emb = model.encode(X_test.tolist(), convert_to_tensor=True, show_progress_bar=True)

Batches: 100%|███████████████████████████████████████████████████████████████████████| 140/140 [02:37<00:00,  1.12s/it]
Batches: 100%|█████████████████████████████████████████████████████████████████████████| 35/35 [00:39<00:00,  1.14s/it]


In [29]:
fnn_clf = MLPClassifier(hidden_layer_sizes=(5,), max_iter=3000, random_state=42)
fnn_clf.fit(X_train_emb, y_train)
fnn_clf_pred_res = fnn_clf.predict(X_test_emb)
fnn_clf_accuracy = accuracy_score(y_test, fnn_clf_pred_res)

print(f'Точность FNN: {fnn_clf_accuracy}')

Точность FNN: 0.9919282511210762
