In [1]:
import torch
torch.cuda.is_available()

True

In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB

from datasets import Dataset, load_dataset


import transformers
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoTokenizer

from catboost import CatBoostClassifier

In [3]:
train_df = pd.read_csv('train_spam.csv')
test_df = pd.read_csv('test_spam.csv')

In [4]:
train_df['text_type'].value_counts()

text_type
ham     11469
spam     4809
Name: count, dtype: int64

In [5]:
train_df

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...
...,...,...
16273,spam,if you are interested in binary options tradin...
16274,spam,dirty pictureblyk on aircel thanks you for bei...
16275,ham,or you could do this g on mon 1635465 sep 1635...
16276,ham,insta reels par 80 गंद bhara pada hai 👀 kuch b...


In [6]:
train_df['text'].isna().sum()

0

In [7]:
test_df['text'].isna().sum()

0

In [8]:
train_df['label'] = train_df['text_type'].map({'spam' : 1, 'ham' : 0})

In [10]:
train_df[['text', 'label']].to_csv('train_spam_labeled.csv', index=False)

# Предсказание только нулей

In [11]:
train_df = pd.read_csv('train_spam_labeled.csv')

In [12]:
print('Accuracy: {}'.format(accuracy_score(train_df['label'], [0]*train_df.shape[0])))
print('ROC-AUC: {}'.format(roc_auc_score(train_df['label'], [0]*train_df.shape[0])))

Accuracy: 0.7045705860670844
ROC-AUC: 0.5


# Наивный байесовский классификатор

In [32]:
train_df = pd.read_csv('train_spam_labeled.csv')

In [33]:
X = np.array(train_df['text'])
y = np.array(train_df['label'])

cv = CountVectorizer()
X = cv.fit_transform(X)
X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                test_size=0.2, 
                                                random_state=42)

model = BernoulliNB()
model.fit(X_train, y_train)
preds = model.predict(X_val)
preds_train = model.predict(X_train)

In [34]:
print('Accuracy train: {}'.format(accuracy_score(preds_train, y_train)))
print('ROC-AUC train: {}'.format(roc_auc_score(preds_train, y_train)))

print('\nAccuracy val: {}'.format(accuracy_score(preds, y_val)))
print('ROC-AUC val: {}'.format(roc_auc_score(preds, y_val)))

Accuracy train: 0.959299646751651
ROC-AUC train: 0.969308190031278

Accuracy val: 0.9333538083538083
ROC-AUC val: 0.9401154401154401


 # CatBoost text_features

In [73]:
train_df = pd.read_csv('train_spam_labeled.csv')

In [74]:
X_train, X_val, y_train, y_val = train_test_split(train_df['text'], train_df['label'], test_size=0.25)

In [75]:
X_train = pd.DataFrame(X_train)
X_val = pd.DataFrame(X_val)

In [88]:
classifier = CatBoostClassifier(
    iterations=1000,
    depth=6
)

In [92]:
classifier.fit(pd.DataFrame(X_train), y_train, text_features=['text'], verbose=100)

Learning rate set to 0.029987
0:	learn: 0.6586587	total: 34.5ms	remaining: 34.4s
100:	learn: 0.1598501	total: 4.76s	remaining: 42.4s
200:	learn: 0.1433659	total: 9.11s	remaining: 36.2s
300:	learn: 0.1340389	total: 13.4s	remaining: 31.2s
400:	learn: 0.1244669	total: 17.7s	remaining: 26.4s
500:	learn: 0.1164736	total: 22.1s	remaining: 22s
600:	learn: 0.1099377	total: 26.4s	remaining: 17.5s
700:	learn: 0.1041181	total: 30.8s	remaining: 13.1s
800:	learn: 0.0991969	total: 35.2s	remaining: 8.74s
900:	learn: 0.0951632	total: 39.7s	remaining: 4.37s
999:	learn: 0.0915061	total: 44.1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x20f03070400>

In [93]:
preds = classifier.predict(X_val)
preds_train = classifier.predict(X_train)

In [94]:
print('Accuracy train: {}'.format(accuracy_score(preds_train, y_train)))
print('ROC-AUC train: {}'.format(roc_auc_score(preds_train, y_train)))

print('\nAccuracy val: {}'.format(accuracy_score(preds, y_val)))
print('ROC-AUC val: {}'.format(roc_auc_score(preds, y_val)))

Accuracy train: 0.9810779816513762
ROC-AUC train: 0.9809947081195941

Accuracy val: 0.957985257985258
ROC-AUC val: 0.9561640588245331


 # Distilbert

In [50]:
raw_dataset = load_dataset('csv', data_files='train_spam_labeled.csv', column_names=['text', 'labels'], skiprows=1)

Generating train split: 0 examples [00:00, ? examples/s]

In [51]:
dataset = raw_dataset['train'].train_test_split(test_size=0.2)

In [52]:
len(dataset['train']), len(dataset['test'])

(13022, 3256)

In [53]:
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [54]:
train_tok = dataset['train'].map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length', max_length=512), batched=True)
test_tok = dataset['test'].map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length', max_length=512), batched=True)

Map:   0%|          | 0/13022 [00:00<?, ? examples/s]

Map:   0%|          | 0/3256 [00:00<?, ? examples/s]

In [55]:
train_tok.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_tok.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [56]:
# model = AutoModelForSequenceClassification.from_pretrained('model1')
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [57]:
model.to('cuda');

In [58]:
training_args = TrainingArguments(
    'test-trainer',
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 5,
    learning_rate=2e-5,
    weight_decay = 0.01
)

trainer = Trainer(
    model,
    training_args,
    train_dataset = train_tok,
    eval_dataset = test_tok,
    tokenizer = tokenizer
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
trainer.train()

Step,Training Loss
500,0.1891
1000,0.0977
1500,0.0594
2000,0.0347
2500,0.0211
3000,0.0135
3500,0.005
4000,0.0035


Checkpoint destination directory test-trainer\checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory test-trainer\checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=4070, training_loss=0.05229256150177714, metrics={'train_runtime': 11932.6118, 'train_samples_per_second': 5.456, 'train_steps_per_second': 0.341, 'total_flos': 8624952326492160.0, 'train_loss': 0.05229256150177714, 'epoch': 5.0})

In [19]:
trainer.save_model('model1')

In [59]:
predictions = trainer.predict(test_tok)
preds = np.argmax(predictions.predictions, axis=-1)

In [60]:
predictions_train = trainer.predict(train_tok)
preds_train = np.argmax(predictions_train.predictions, axis=-1)

In [62]:
print('Accuracy train: {}'.format(accuracy_score(preds_train, train_tok['labels'])))
print('ROC-AUC train: {}'.format(roc_auc_score(preds_train, train_tok['labels'])))
print('\nAccuracy val: {}'.format(accuracy_score(preds, test_tok['labels'])))
print('ROC-AUC val: {}'.format(roc_auc_score(preds, test_tok['labels'])))

Accuracy train: 0.9939333435724159
ROC-AUC train: 0.9936336634597254

Accuracy val: 0.9938574938574939
ROC-AUC val: 0.9935458776058693


Distilbert показал наибольший ROC-AUC на валидационной выборке, так что для скоринга тестовой выборки применим его, хотя при обучении и на инфересне трансформерная модель работает гораздо дольше чем остальные кондидаты.

In [63]:
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [64]:
model = AutoModelForSequenceClassification.from_pretrained('model1');
trainer = Trainer(
    model,
    tokenizer = tokenizer
);

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [65]:
raw_dataset = load_dataset('csv', data_files='test_spam.csv', column_names=['text'], skiprows=1)

In [66]:
set_tok = raw_dataset['train'].map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length', max_length=512), batched=True)

In [67]:
set_tok.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [68]:
test_preds = trainer.predict(set_tok)
test_preds = np.argmax(test_preds.predictions, axis=-1)

In [69]:
test_df['score'] = test_preds

In [70]:
test_df['score'] = test_df['score'].map({1 : 'spam', 0 : 'ham'})

In [71]:
test_df

Unnamed: 0,text,score
0,j jim whitehead ejw cse ucsc edu writes j you ...,ham
1,original message from bitbitch magnesium net p...,ham
2,java for managers vince durasoft who just taug...,ham
3,there is a youtuber name saiman says,ham
4,underpriced issue with high return on equity t...,spam
...,...,...
4065,husband to wifetum meri zindagi hoorwifeor kya...,ham
4066,baylor enron case study cindy yes i shall co a...,ham
4067,boring as compared to tp,ham
4068,hellogorgeous hows u my fone was on charge lst...,ham


In [72]:
test_df[['score', 'text']].to_csv('test_scores.csv', index=False)