In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

import nltk
import string
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)
from nltk.tokenize import TreebankWordTokenizer

import fasttext
from sklearn.neighbors import KNeighborsClassifier

from sentence_transformers import SentenceTransformer
from sklearn.neural_network import MLPClassifier
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
import evaluate
from datasets import Dataset, DatasetDict

In [20]:
spamhamdata = pd.read_csv("spamhamdata.xls", sep='\t', header=None, names=['label', 'text'])

In [21]:
spamhamdata.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [22]:
spamhamdata['label'] = spamhamdata['label'].map({'ham': 0, 'spam': 1})

In [23]:
#TF-IDF + NB (наивный баес)

In [24]:
dataNB = spamhamdata.copy()

In [25]:
tokenizer = TreebankWordTokenizer()

def preprocess(text):
    text = str(text).lower()
    words = tokenizer.tokenize(text)
    filtered_words = [word for word in words if word not in stop_words and word not in punctuation]
    return " ".join(filtered_words)

In [26]:
dataNB['text_clean'] = dataNB['text'].apply(preprocess)

In [27]:
dataNB['text_clean'][42]

'07732584351 rodger burns msg tried call reply sms free nokia mobile free camcorder. please call 08000930705 delivery tomorrow'

In [28]:
X_train, X_test, y_train, y_test = train_test_split(dataNB['text_clean'], dataNB['label'], test_size=0.2, random_state=42)

In [29]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

In [30]:
y_pred = classifier.predict(X_test_tfidf)

In [31]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Точность NB: {accuracy}')

Точность NB: 0.979372197309417


In [32]:
# FastText + KNN

In [33]:
dataKNN = spamhamdata.copy()

In [34]:
dataKNN['text_clean'] = dataKNN['text'].apply(preprocess)

In [35]:
with open("ft_data.txt", "w", encoding="utf-8") as f:
    for label, text in zip(dataKNN['label'], dataKNN['text_clean']):
        f.write(f"__label__{label} {text}\n")

In [36]:
model = fasttext.train_supervised(input="ft_data.txt", lr=0.5, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='ova')

In [37]:
X = np.array([model.get_sentence_vector(text) for text in dataKNN['text_clean']])
y = dataKNN['label'].values

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
knn_clf_pred_res = knn_clf.predict(X_test)
knn_clf_accuracy = accuracy_score(y_test, knn_clf_pred_res)

print(f'Точность KNN: {knn_clf_accuracy}')

Точность KNN: 1.0


In [40]:
# BERT + FNN

In [41]:
dataFNN = spamhamdata.copy()

In [42]:
dataFNN

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [43]:
dataset = Dataset.from_pandas(dataFNN)

train_df, test_df = train_test_split(dataFNN, test_size=0.2, stratify=dataFNN["label"], random_state=42)

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(test_df)
})

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/4457 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

In [44]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
training_args = TrainingArguments(
    output_dir="./bert-finetune",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    fp16=True,
    report_to="none"
)

In [46]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Step,Training Loss
10,0.4545
20,0.2556
30,0.1188
40,0.1088
50,0.0601
60,0.094
70,0.0486
80,0.0675
90,0.0277
100,0.03


TrainOutput(global_step=837, training_loss=0.030979045405271802, metrics={'train_runtime': 179.2838, 'train_samples_per_second': 74.58, 'train_steps_per_second': 4.669, 'total_flos': 429273395151540.0, 'train_loss': 0.030979045405271802, 'epoch': 3.0})

In [47]:
results = trainer.evaluate()
print(f"Точность BERT: {results['eval_accuracy']}")

Точность BERT: 0.9919282511210762
