In [3]:
%%capture
!pip install -q gensim
!pip install -q fasttext

In [4]:
%%capture
!python3 -m spacy download en_core_web_sm

In [1]:
import time

In [5]:
import pandas as pd
import numpy as np
import re
import string
import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB

from sklearn.metrics import classification_report, accuracy_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, KFold, cross_val_score, train_test_split, StratifiedKFold, GridSearchCV

import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import gensim.models
import fasttext

In [6]:
nlp = spacy.load("en_core_web_sm")

In [7]:
data = pd.read_csv("spam_or_not_spam.csv")
data.style.set_properties(**{'text-align': 'left'})
data.head(5)

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


Проведем предобработку:

In [8]:
data.drop(data[data.email == ' '].index, inplace=True)
data.drop(data[data['email'].isnull()].index, inplace=True)

In [9]:
data = data.reset_index(drop=True)

In [10]:
%%time
data['cleaned_text'] = data['email'].str.replace('NUMBER', '') # удаляем слово "NUMBER" из текста, так как оно заменяет собой все числа и встречается чаще любого другого слова
data['cleaned_text'] = data['cleaned_text'].apply(lambda x: ' '.join(token.lemma_.lower() for token in nlp(x) if
        not token.is_stop
        and not token.is_punct # удаление пунктуации
        and not token.is_digit # удаление цифр
        and not token.like_email # удаление почтовых адресов
        and not token.like_num # удаление чисел, в том числе в виде текста
        and not token.is_space # удаление пробельных символов
    )
)

data.sample(3, random_state=1)

CPU times: user 1min 55s, sys: 964 ms, total: 1min 56s
Wall time: 2min 3s


Unnamed: 0,email,label,cleaned_text
748,at NUMBER NUMBER pm NUMBER on NUMBER NUMBER NU...,0,pm tom write green say spot owl hadn t exist i...
2881,pocket the newest NUMBER year annuity pocket ...,1,pocket new year annuity pocket new year annuit...
1391,justin mason jm jmason org NUMBER NUMBER NUMBE...,0,justin mason jm jmason org point aim rescore a...


In [11]:
data['text_tokens'] = data['cleaned_text'].apply(lambda x: gensim.utils.simple_preprocess(x))
data.head()

Unnamed: 0,email,label,cleaned_text,text_tokens
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0,date d aug chris garrigues cwg date fad deeped...,"[date, aug, chris, garrigues, cwg, date, fad, ..."
1,martin a posted tassos papadopoulos the greek ...,0,martin post tassos papadopoulo greek sculptor ...,"[martin, post, tassos, papadopoulo, greek, scu..."
2,man threatens explosion in moscow thursday aug...,0,man threaten explosion moscow thursday august ...,"[man, threaten, explosion, moscow, thursday, a..."
3,klez the virus that won t die already the most...,0,klez virus win t die prolific virus klez conti...,"[klez, virus, win, die, prolific, virus, klez,..."
4,in adding cream to spaghetti carbonara which ...,0,add cream spaghetti carbonara effect pasta mak...,"[add, cream, spaghetti, carbonara, effect, pas..."


Разделим данные:

In [145]:
X_train, X_test, y_train, y_test = train_test_split(data['text_tokens'], data['label'], shuffle=True, random_state=1234, train_size=0.7)

Обучим эмбеддинги:

#Skip-gram

In [146]:
%%time
model_skipgram = gensim.models.Word2Vec(X_train, sg=1, vector_size=250, window=7, min_count=10, epochs=40, seed=24, workers=4)

CPU times: user 2min 31s, sys: 380 ms, total: 2min 31s
Wall time: 1min 29s


In [147]:
vocab = list(model_skipgram.wv.key_to_index.keys())
print(len(vocab))

3678


In [148]:
model_skipgram.wv.most_similar(positive=['old'], topn=5)

[('tired', 0.3309100568294525),
 ('osdn', 0.31591665744781494),
 ('nightly', 0.2856985628604889),
 ('thai', 0.2796338200569153),
 ('nonspam', 0.27530238032341003)]

In [149]:
model_skipgram.wv.most_similar("work", topn=5)

[('roi', 0.3416286110877991),
 ('lirc', 0.319993257522583),
 ('useless', 0.30216652154922485),
 ('libdv', 0.2989464998245239),
 ('alter', 0.29254838824272156)]

In [150]:
model_skipgram.wv.most_similar(positive=['october'], topn=10)

[('tuesday', 0.5534021854400635),
 ('gnat', 0.4340027868747711),
 ('friday', 0.3982687294483185),
 ('pudge', 0.38874340057373047),
 ('august', 0.3527173399925232),
 ('monday', 0.3491170406341553),
 ('km', 0.3337840735912323),
 ('wednesday', 0.33151546120643616),
 ('lemann', 0.3266523778438568),
 ('sunday', 0.31926751136779785)]

In [151]:
model_skipgram.wv.similarity('world', 'city')

0.08855083

In [152]:
model_skipgram.wv.similarity('airport', 'city')

0.3152705

In [153]:
model_skipgram.wv.similarity('url', 'hyperlink')

0.2004054

#CBOW

In [154]:
%%time
model_cbow = gensim.models.Word2Vec(X_train, vector_size=250, window=5, min_count=10, sg=0, negative=5, epochs=40, seed=24, workers=4)

CPU times: user 32.1 s, sys: 218 ms, total: 32.4 s
Wall time: 22.6 s


In [155]:
vocab = list(model_cbow.wv.key_to_index.keys())
print(len(vocab))

3678


In [156]:
model_cbow.wv.most_similar(positive=['word'], topn=5)

[('translate', 0.4532121419906616),
 ('translation', 0.4048611521720886),
 ('commandment', 0.3919045031070709),
 ('meaning', 0.3912849724292755),
 ('evolve', 0.3797954320907593)]

In [157]:
similar_words = model_cbow.wv.most_similar("october", topn=10)
similar_words

[('tuesday', 0.8303964734077454),
 ('august', 0.7094030380249023),
 ('friday', 0.6627829074859619),
 ('pudge', 0.6449451446533203),
 ('wednesday', 0.636072039604187),
 ('monday', 0.6075277328491211),
 ('thursday', 0.592248797416687),
 ('december', 0.5869767069816589),
 ('september', 0.5831322073936462),
 ('gnat', 0.5738017559051514)]

In [158]:
model_cbow.wv.similarity('world', 'city')

0.13345008

In [159]:
model_skipgram.wv.similarity('url', 'hyperlink')

0.2004054

#FastText

In [160]:
with open('data.txt', 'w') as f:
  for i in X_train.index.to_list():
    f.write(' '.join(X_train[i]))

In [161]:
%%time
model_ft = fasttext.train_unsupervised('data.txt', wordNgrams=3, dim=300, ws=7, minCount=10, epoch=40, thread=4)

CPU times: user 18min 53s, sys: 3.49 s, total: 18min 57s
Wall time: 10min 52s


In [162]:
len(model_ft.words)

3665

In [163]:
model_ft.get_nearest_neighbors('work')

[(0.3664908707141876, 'way'),
 (0.3495029807090759, 'lirc'),
 (0.31936824321746826, 'transcode'),
 (0.3186422884464264, 'libdv'),
 (0.3175836503505707, 'xine'),
 (0.3103780448436737, 'source'),
 (0.3067995011806488, 'thing'),
 (0.3060365319252014, 'look'),
 (0.3028355538845062, 'maybe'),
 (0.29606249928474426, 'want')]

In [164]:
model_ft.get_nearest_neighbors('word')

[(0.35667896270751953, 'commandment'),
 (0.32435011863708496, 'meaning'),
 (0.2872738242149353, 'exact'),
 (0.2867744565010071, 'grep'),
 (0.2755102217197418, 'translation'),
 (0.2742385268211365, 'keyword'),
 (0.2711973190307617, 'translate'),
 (0.2665921449661255, 'milter'),
 (0.25998732447624207, 'mord'),
 (0.2530663311481476, 'parrot')]

In [165]:
model_ft.get_nearest_neighbors('october')

[(0.5577356219291687, 'tuesday'),
 (0.37248674035072327, 'gnat'),
 (0.33203431963920593, 'pudge'),
 (0.319052517414093, 'lemann'),
 (0.3123582899570465, 'september'),
 (0.31142935156822205, 'monday'),
 (0.3104938566684723, 'wednesday'),
 (0.30537980794906616, 'august'),
 (0.29852041602134705, 'friday'),
 (0.27601054310798645, 'km')]

Получим усредненные эмбеддинги для предложений:

In [166]:
token_vectors_skipgram_train = [np.array([model_skipgram.wv[token] for token in text if token in model_skipgram.wv]) for text in X_train]
token_vectors_cbow_train = [np.array([model_cbow.wv[token] for token in text if token in model_cbow.wv]) for text in X_train]
token_vectors_ft_train = [np.array([model_ft.get_word_vector(token) for token in text if token in model_ft]) for text in X_train]

In [167]:
token_vectors_skipgram_test = [np.array([model_skipgram.wv[token] for token in text if token in model_skipgram.wv]) for text in X_test]
token_vectors_cbow_test = [np.array([model_cbow.wv[token] for token in text if token in model_cbow.wv]) for text in X_test]
token_vectors_ft_test = [np.array([model_ft.get_word_vector(token) for token in text if token in model_ft]) for text in X_test]

In [168]:
mean_vectors_skipgram_train = [vectors.mean(axis=0) if vectors.size else np.zeros(250, float) for vectors in token_vectors_skipgram_train]
mean_vectors_cbow_train = [vectors.mean(axis=0) if vectors.size else np.zeros(250, float) for vectors in token_vectors_cbow_train]
mean_vectors_ft_train = [vectors.mean(axis=0) if vectors.size else np.zeros(250, float) for vectors in token_vectors_ft_train]

In [169]:
mean_vectors_skipgram_test = [vectors.mean(axis=0) if vectors.size else np.zeros(250, float) for vectors in token_vectors_skipgram_test]
mean_vectors_cbow_test = [vectors.mean(axis=0) if vectors.size else np.zeros(250, float) for vectors in token_vectors_cbow_test]
mean_vectors_ft_test = [vectors.mean(axis=0) if vectors.size else np.zeros(250, float) for vectors in token_vectors_ft_test]

In [170]:
X_train_SkipGram = mean_vectors_skipgram_train
X_train_CBOW = mean_vectors_cbow_train
X_train_fastText = mean_vectors_ft_train

X_test_SkipGram = mean_vectors_skipgram_test
X_test_CBOW = mean_vectors_cbow_test
X_test_fastText = mean_vectors_ft_test

Обучим модели:

In [171]:
lr_SkipGram = LogisticRegression(C=10, solver='lbfgs', max_iter=200)
lr_CBOW = LogisticRegression(C=10, solver='lbfgs', max_iter=200)
lr_fastText = LogisticRegression(C=10, solver='lbfgs', max_iter=200)

In [172]:
lr_SkipGram.fit(X_train_SkipGram, y_train);
lr_CBOW.fit(X_train_CBOW, y_train);
lr_fastText.fit(X_train_fastText, y_train);

In [173]:
y_pred_SkipGram = lr_SkipGram.predict(X_test_SkipGram)

accuracy = accuracy_score(y_test, y_pred_SkipGram)
recall = recall_score(y_test, y_pred_SkipGram)

print(classification_report(y_test, y_pred_SkipGram))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       748
           1       0.98      0.93      0.96       152

    accuracy                           0.99       900
   macro avg       0.98      0.97      0.97       900
weighted avg       0.99      0.99      0.99       900



In [174]:
y_pred_CBOW = lr_CBOW.predict(X_test_CBOW)

accuracy = accuracy_score(y_test, y_pred_CBOW)
recall = recall_score(y_test, y_pred_CBOW)

print(classification_report(y_test, y_pred_CBOW))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       748
           1       0.97      0.97      0.97       152

    accuracy                           0.99       900
   macro avg       0.98      0.98      0.98       900
weighted avg       0.99      0.99      0.99       900



In [175]:
y_pred_fastText = lr_fastText.predict(X_test_fastText)

accuracy = accuracy_score(y_test, y_pred_fastText)
recall = recall_score(y_test, y_pred_fastText)

print(classification_report(y_test, y_pred_fastText))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       748
           1       0.97      0.93      0.95       152

    accuracy                           0.98       900
   macro avg       0.98      0.96      0.97       900
weighted avg       0.98      0.98      0.98       900

