In [None]:
import time

In [None]:
import pandas as pd
import numpy as np
import re
import string
import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB

from sklearn.metrics import classification_report, accuracy_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, KFold, cross_val_score, train_test_split, StratifiedKFold, GridSearchCV

import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import gensim.models
import fasttext

In [None]:
%%capture
# Download the small English spaCy pipeline; it is loaded further down with spacy.load("en_core_web_sm").
!python3 -m spacy download en_core_web_sm

In [None]:
%%capture
!pip install -q gensim
!pip install -q fasttext

In [None]:
# Load the small English spaCy pipeline; `nlp` is applied to every e-mail in the cleaning step.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Load the raw corpus (columns shown in the preview: `email` text and `label`).
data = pd.read_csv("spam_or_not_spam.csv")
# A Styler does not modify the frame; its formatting is only visible when the
# Styler itself is the displayed value, so style the preview that is shown.
data.head(5).style.set_properties(**{'text-align': 'left'})

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


Проведем предобработку:

In [None]:
# Remove e-mails without usable text: missing values and strings that are empty
# or whitespace-only (not just a single space).
# Building a new frame instead of dropping in place keeps the cell safe to re-run.
email_is_missing = data['email'].isna()
email_is_blank = data['email'].str.strip().eq('')  # NaN rows compare as False here
data = data[~(email_is_missing | email_is_blank)].copy()

In [None]:
# Renumber the rows 0..n-1 after the empty e-mails were removed; the old index is discarded (drop=True).
data = data.reset_index(drop=True)

In [None]:
%%time
data['cleaned_text'] = data['email'].str.replace('NUMBER', '') # удаляем слово "NUMBER" из текста, так как оно заменяет собой все числа и встречается чаще любого другого слова
data['cleaned_text'] = data['cleaned_text'].apply(lambda x: ' '.join(token.lemma_.lower() for token in nlp(x) if
        not token.is_stop
        and not token.is_punct # удаление пунктуации
        and not token.is_digit # удаление цифр
        and not token.like_email # удаление почтовых адресов
        and not token.like_num # удаление чисел, в том числе в виде текста
        and not token.is_space # удаление пробельных символов
    )
)

data.sample(3, random_state=1)

CPU times: user 2min 13s, sys: 1.23 s, total: 2min 15s
Wall time: 2min 24s


Unnamed: 0,email,label,cleaned_text
748,at NUMBER NUMBER pm NUMBER on NUMBER NUMBER NU...,0,pm tom write green say spot owl hadn t exist i...
2881,pocket the newest NUMBER year annuity pocket ...,1,pocket new year annuity pocket new year annuit...
1391,justin mason jm jmason org NUMBER NUMBER NUMBE...,0,justin mason jm jmason org point aim rescore a...


In [None]:
# Split every cleaned e-mail into a list of tokens with gensim's simple_preprocess
# (called with its default settings).
data['text_tokens'] = data['cleaned_text'].apply(gensim.utils.simple_preprocess)
data.head()

Unnamed: 0,email,label,cleaned_text,text_tokens
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0,date d aug chris garrigues cwg date fad deeped...,"[date, aug, chris, garrigues, cwg, date, fad, ..."
1,martin a posted tassos papadopoulos the greek ...,0,martin post tassos papadopoulo greek sculptor ...,"[martin, post, tassos, papadopoulo, greek, scu..."
2,man threatens explosion in moscow thursday aug...,0,man threaten explosion moscow thursday august ...,"[man, threaten, explosion, moscow, thursday, a..."
3,klez the virus that won t die already the most...,0,klez virus win t die prolific virus klez conti...,"[klez, virus, win, die, prolific, virus, klez,..."
4,in adding cream to spaghetti carbonara which ...,0,add cream spaghetti carbonara effect pasta mak...,"[add, cream, spaghetti, carbonara, effect, pas..."


# Skip-gram

In [None]:
%time
model_skipgram = gensim.models.Word2Vec(data['text_tokens'], sg=1, vector_size=250, window=7, min_count=5, epochs=25, seed=24, workers=4)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.34 µs


In [None]:
# Number of distinct words in the skip-gram model's vocabulary.
vocab = list(model_skipgram.wv.key_to_index)
print(len(vocab))

7344


In [None]:
# Sanity check: the 5 words closest to "old" in the skip-gram embedding space.
model_skipgram.wv.most_similar(positive=['old'], topn=5)

[('tired', 0.4354839026927948),
 ('fashioned', 0.4057896137237549),
 ('sitescooper', 0.3701004087924957),
 ('osdn', 0.3610416054725647),
 ('alley', 0.34120550751686096)]

In [None]:
# Sanity check: the 5 words closest to "work" in the skip-gram embedding space.
model_skipgram.wv.most_similar("work", topn=5)

[('sdl', 0.3624294400215149),
 ('charm', 0.35912784934043884),
 ('gratis', 0.35736754536628723),
 ('defuse', 0.3547976016998291),
 ('greedy', 0.34491950273513794)]

In [None]:
# Sanity check: the 10 words closest to "word" in the skip-gram embedding space.
model_skipgram.wv.most_similar(positive=['word'], topn=10)

[('oshiwambo', 0.4756561815738678),
 ('hebrew', 0.4335041046142578),
 ('powerpoint', 0.41688233613967896),
 ('supremacist', 0.36844584345817566),
 ('probability', 0.36550742387771606),
 ('problematic', 0.36171770095825195),
 ('manuscript', 0.3571861684322357),
 ('marital', 0.35689008235931396),
 ('translate', 0.35241132974624634),
 ('admission', 0.35056623816490173)]

In [None]:
# Similarity score between "moscow" and "city" under the skip-gram model.
model_skipgram.wv.similarity('moscow', 'city')

0.15725909

In [None]:
# Similarity score between "moscow" and "world" under the skip-gram model.
model_skipgram.wv.similarity('moscow', 'world')

0.18526623

In [None]:
# Similarity score between "url" and "hyperlink" under the skip-gram model.
model_skipgram.wv.similarity('url', 'hyperlink')

0.18406992

# CBOW

In [None]:
%time
model_cbow = gensim.models.Word2Vec(data['text_tokens'], vector_size=250, window=5, min_count=10, sg=0, negative=5, epochs=25, seed=24, workers=4)

CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 7.15 µs


In [None]:
# Number of distinct words in the CBOW model's vocabulary.
vocab = list(model_cbow.wv.key_to_index)
print(len(vocab))

In [None]:
# Sanity check: the 5 words closest to "word" in the CBOW embedding space.
model_cbow.wv.most_similar(positive=['word'], topn=5)

[('hebrew', 0.5478679537773132),
 ('translation', 0.5221811532974243),
 ('translate', 0.5070224404335022),
 ('meaning', 0.4822438955307007),
 ('english', 0.43984872102737427)]

In [None]:
# Sanity check: the 10 words closest to "work" in the CBOW embedding space.
similar_words = model_cbow.wv.most_similar("work", topn=10)
similar_words

[('fine', 0.35781577229499817),
 ('instal', 0.313087522983551),
 ('libdv', 0.3058166205883026),
 ('vanilla', 0.3002563714981079),
 ('wouldn', 0.2934785783290863),
 ('folk', 0.29119470715522766),
 ('sdl', 0.2906181216239929),
 ('honesty', 0.2893317937850952),
 ('roll', 0.2866627275943756),
 ('ssh', 0.27963852882385254)]

In [None]:
# Similarity score between "world" and "city" under the CBOW model.
model_cbow.wv.similarity('world', 'city')

0.15326917

In [None]:
# Similarity score between "url" and "hyperlink" under the CBOW model
# (this section evaluates CBOW, so query model_cbow rather than model_skipgram).
model_cbow.wv.similarity('url', 'hyperlink')

0.2155585

# FastText

In [None]:
# Dump the corpus for fastText: one e-mail per line, tokens separated by spaces.
# The newline after each e-mail keeps the last token of one document from fusing
# with the first token of the next and preserves document boundaries.
with open('data.txt', 'w', encoding='utf-8') as f:
    for tokens in data['text_tokens']:
        f.write(' '.join(tokens) + '\n')

In [None]:
%%time
# Train unsupervised fastText embeddings (300 dimensions) on the corpus file written to data.txt.
model_ft = fasttext.train_unsupervised('data.txt', wordNgrams=3, dim=300, ws=7, minCount=10, epoch=10, thread=4)

CPU times: user 7min 30s, sys: 2.66 s, total: 7min 33s
Wall time: 4min 21s


In [None]:
# Number of words in the fastText model's vocabulary.
len(model_ft.words)

4487

In [None]:
# Sanity check: nearest neighbours of "work" in the fastText embedding space, as (score, word) pairs.
model_ft.get_nearest_neighbors('work')

[(0.39271971583366394, 'network'),
 (0.3888806104660034, 'working'),
 (0.3882054388523102, 'world'),
 (0.38391903042793274, 'worry'),
 (0.36031854152679443, 'worker'),
 (0.34090742468833923, 'worried'),
 (0.33292123675346375, 'framework'),
 (0.33260706067085266, 'won'),
 (0.32903751730918884, 'word'),
 (0.3258245587348938, 'try')]

In [None]:
# Sanity check: nearest neighbours of "word" in the fastText embedding space, as (score, word) pairs.
model_ft.get_nearest_neighbors('word')

[(0.4562312662601471, 'keyword'),
 (0.4219526946544647, 'wordinfo'),
 (0.41403719782829285, 'hebrew'),
 (0.39379000663757324, 'worried'),
 (0.3808000981807709, 'iiu'),
 (0.3730248510837555, 'doc'),
 (0.3587852716445923, 'mord'),
 (0.3545278310775757, 'meaning'),
 (0.353466659784317, 'translate'),
 (0.3468928635120392, 'password')]

Получим усредненные эмбеддинги для предложений:

In [None]:
# For every e-mail, stack the embeddings of its in-vocabulary tokens into one
# array (one row per token); out-of-vocabulary tokens are skipped.
token_vectors_skipgram = [np.array([model_skipgram.wv[token] for token in text if token in model_skipgram.wv]) for text in data['text_tokens']]
token_vectors_cbow = [np.array([model_cbow.wv[token] for token in text if token in model_cbow.wv]) for text in data['text_tokens']]

# Membership tests against the fastText word list scan the whole list for each
# token; a set gives the same answer with constant-time lookups.
ft_vocab = set(model_ft.words)
token_vectors_ft = [np.array([model_ft.get_word_vector(token) for token in text if token in ft_vocab]) for text in data['text_tokens']]

In [None]:
def _mean_vector(vectors, dim):
    """Average the token vectors of one e-mail.

    Returns the column-wise mean of `vectors`, or an all-zero vector of length
    `dim` when the e-mail has no in-vocabulary tokens (empty array).
    """
    return vectors.mean(axis=0) if vectors.size else np.zeros(dim, float)


# The fallback length must equal each model's embedding size (250 for the
# Word2Vec models, 300 for fastText); otherwise an empty e-mail would produce a
# feature row of a different length than the rest.
mean_vectors_skipgram = [_mean_vector(vectors, model_skipgram.wv.vector_size) for vectors in token_vectors_skipgram]
mean_vectors_cbow = [_mean_vector(vectors, model_cbow.wv.vector_size) for vectors in token_vectors_cbow]
mean_vectors_ft = [_mean_vector(vectors, model_ft.get_dimension()) for vectors in token_vectors_ft]

In [None]:
# Feature matrices: one averaged embedding per e-mail for each embedding model.
X_SkipGram = mean_vectors_skipgram
X_CBOW = mean_vectors_cbow
X_FT = mean_vectors_ft

# Target: the `label` column of the corpus.
y = data['label']

Разделим данные и обучим модели:

In [None]:
# The same settings (and seed) are used for all three feature sets.
# NOTE(review): test_size=0.7 leaves only 30% of the e-mails for training, and
# the split is not stratified by label — confirm both are intended.
split_kwargs = dict(shuffle=True, test_size=0.7, random_state=24)

X_train_SkipGram, X_test_SkipGram, y_train_SkipGram, y_test_SkipGram = train_test_split(
    X_SkipGram, y, **split_kwargs
)
X_train_CBOW, X_test_CBOW, y_train_CBOW, y_test_CBOW = train_test_split(
    X_CBOW, y, **split_kwargs
)
X_train_fastText, X_test_fastText, y_train_fastText, y_test_fastText = train_test_split(
    X_FT, y, **split_kwargs
)

In [None]:
# Class balance of the test split (count of e-mails per label).
y_test_SkipGram.value_counts()

0    1741
1     357
Name: label, dtype: int64

In [None]:
# One independent logistic-regression classifier per embedding type.
# Each name gets its own estimator: a chained assignment would bind all names
# to a single shared object, so fitting one would overwrite the others.
lr_SkipGram = LogisticRegression(C=10)
lr_CBOW = LogisticRegression(C=10)
lr_fastText = LogisticRegression(C=10)

In [None]:
# Fit each classifier on the training split of its own embedding features.
for clf, X_tr, y_tr in (
    (lr_SkipGram, X_train_SkipGram, y_train_SkipGram),
    (lr_CBOW, X_train_CBOW, y_train_CBOW),
    (lr_fastText, X_train_fastText, y_train_fastText),
):
    clf.fit(X_tr, y_tr)

In [None]:
# Evaluate the skip-gram classifier on its held-out split.
y_pred_SkipGram = lr_SkipGram.predict(X_test_SkipGram)

# Summary metrics are stored in the notebook namespace; they are not displayed here.
# recall_score is called with its defaults, i.e. recall of label 1.
accuracy = accuracy_score(y_true=y_test_SkipGram, y_pred=y_pred_SkipGram)
recall = recall_score(y_true=y_test_SkipGram, y_pred=y_pred_SkipGram)

# Per-class precision / recall / F1.
print(classification_report(y_true=y_test_SkipGram, y_pred=y_pred_SkipGram))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1741
           1       0.97      0.93      0.95       357

    accuracy                           0.98      2098
   macro avg       0.98      0.96      0.97      2098
weighted avg       0.98      0.98      0.98      2098



In [None]:
# Evaluate the CBOW classifier on its held-out split.
y_pred_CBOW = lr_CBOW.predict(X_test_CBOW)

# Summary metrics are stored in the notebook namespace; they are not displayed here.
# recall_score is called with its defaults, i.e. recall of label 1.
accuracy = accuracy_score(y_true=y_test_CBOW, y_pred=y_pred_CBOW)
recall = recall_score(y_true=y_test_CBOW, y_pred=y_pred_CBOW)

# Per-class precision / recall / F1.
print(classification_report(y_true=y_test_CBOW, y_pred=y_pred_CBOW))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1741
           1       0.95      0.94      0.94       357

    accuracy                           0.98      2098
   macro avg       0.97      0.96      0.96      2098
weighted avg       0.98      0.98      0.98      2098



In [None]:
# Evaluate the fastText classifier on its held-out split.
y_pred_fastText = lr_fastText.predict(X_test_fastText)

# Summary metrics are stored in the notebook namespace; they are not displayed here.
# recall_score is called with its defaults, i.e. recall of label 1.
accuracy = accuracy_score(y_true=y_test_fastText, y_pred=y_pred_fastText)
recall = recall_score(y_true=y_test_fastText, y_pred=y_pred_fastText)

# Per-class precision / recall / F1.
print(classification_report(y_true=y_test_fastText, y_pred=y_pred_fastText))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1741
           1       0.99      0.96      0.97       357

    accuracy                           0.99      2098
   macro avg       0.99      0.98      0.98      2098
weighted avg       0.99      0.99      0.99      2098

