# Разработка классификатора для определения спам/не спам сообщений

**Цель:** Разработать классификатор для определения спам/не спам сообщений с использованием различных методов предобработки текста и векторизации.

## 1. Подготовка данных

In [None]:
!pip install nltk



In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error, accuracy_score, f1_score, roc_auc_score
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from itertools import product

In [None]:
# Fetch the NLTK resources used below. Recent NLTK releases load the sentence
# tokenizer behind word_tokenize from 'punkt_tab', older ones from 'punkt',
# so both are requested to keep the notebook runnable on either.
for resource in ('wordnet', 'punkt', 'punkt_tab', 'omw-1.4', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Build a direct-download URL from the Google Drive sharing link: the file id
# is the second-to-last path segment of the sharing link.
share_link = 'https://drive.google.com/file/d/1kH6XOSCdI1FDshdSUQQztjDyLJG4umZF/view?usp=sharing'
file_id = share_link.split('/')[-2]
url = 'https://drive.google.com/uc?id=' + file_id

# The CSV is decoded as Latin-1.
data = pd.read_csv(url, encoding='latin1')

In [None]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [None]:
# Show the first rows in which at least one of the unnamed overflow columns
# holds a value.
overflow_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data[data[overflow_columns].notna().any(axis=1)].head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
95,spam,Your free ringtone is waiting to be collected....,PO Box 5249,"MK17 92H. 450Ppw 16""",
281,ham,\Wen u miss someone,the person is definitely special for u..... B...,why to miss them,"just Keep-in-touch\"" gdeve.."""
444,ham,\HEY HEY WERETHE MONKEESPEOPLE SAY WE MONKEYAR...,HOWU DOIN? FOUNDURSELF A JOBYET SAUSAGE?LOVE ...,,
671,spam,SMS. ac sun0819 posts HELLO:\You seem cool,"wanted to say hi. HI!!!\"" Stop? Send STOP to ...",,
710,ham,Height of Confidence: All the Aeronautics prof...,"this wont even start........ Datz confidence..""",,
899,spam,Your free ringtone is waiting to be collected....,PO Box 5249,"MK17 92H. 450Ppw 16""",
1038,ham,"Edison has rightly said, \A fool can ask more ...",GN,GE,"GNT:-)"""
1127,ham,"Height of \Oh shit....!!\"" situation: A guy th...",".;-):-D""",,
1266,ham,\Hey sorry I didntgive ya a a bellearlier hunny,just been in bedbut mite go 2 thepub l8tr if u...,,
1384,ham,"Storming msg: Wen u lift d phne, u say \HELLO\...","bt not his girlfrnd... G o o d n i g h t . . .@""",,


В неименованных столбцах содержатся какие-то комментарии, не представляющие интереса для нашего исследования. Поэтому их можно удалить из датасета.

Также можно переименовать переменные, чтобы не запутаться в дальнейшем.

И переведем в бинарный вид значения спам/не спам

In [None]:
# Keep only the label and the message text, under descriptive column names.
data = (
    data
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'is_spam', 'v2': 'text'})
)

In [None]:
data['is_spam'].unique()

array(['ham', 'spam'], dtype=object)

In [None]:
# Encode the label as 1 (spam) / 0 (ham). The explicit cast keeps the column
# integer-typed even when `replace` does not downcast the object column on its
# own, so the estimators later receive a numeric target. Re-running the cell
# is harmless: values that are already 0/1 are left unchanged.
data['is_spam'] = data['is_spam'].replace({'spam': 1, 'ham': 0}).astype('int64')

In [None]:
data.head()

Unnamed: 0,is_spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Проверим на дубли и удалим их в случае наличия.

In [None]:
data.duplicated().sum()

403

In [None]:
data = data.drop_duplicates(keep='first')

In [None]:
data.shape

(5169, 2)

У нас получилось 5169 сообщений, на которых мы будем строить модели.

Посмотрим, как распределена целевая переменная.

In [None]:
data['is_spam'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
is_spam,Unnamed: 1_level_1
0,0.87367
1,0.12633


Запомним, что у нас классы несбалансированы, чтобы потом интерпретировать модели.

## 2. Предобработка текста

### 2.1. Стемминг текстов сообщений

In [None]:
stemmer = SnowballStemmer('english')


def _stem_message(message):
    """Tokenize ``message`` and return its stemmed tokens joined by spaces."""
    return ' '.join(stemmer.stem(token) for token in word_tokenize(message))


# One stemmed string per message, stored next to the original text.
data['stems'] = data['text'].map(_stem_message)

### 2.2. Лемматизация текстов сообщений

In [None]:
# POS-aware lemmatization. WordNetLemmatizer treats a token as a noun unless a
# part of speech is supplied, which maps e.g. the verb "lives" to "life".
# The tagger resource has a different name in different NLTK releases, so both
# names are requested.
for tagger_resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(tagger_resource)

lemmatizer = WordNetLemmatizer()

# First letter of the tag returned by nltk.pos_tag -> WordNet part of speech.
# Anything not listed here is lemmatized as a noun, the lemmatizer's default.
TAG_PREFIX_TO_WORDNET_POS = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}


def lemmatize_message(text):
    """Tokenize and POS-tag ``text``; return its lemmas joined by spaces."""
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return ' '.join(
        lemmatizer.lemmatize(token, TAG_PREFIX_TO_WORDNET_POS.get(tag[:1], wordnet.NOUN))
        for token, tag in tagged_tokens
    )


data['lemmas'] = [lemmatize_message(text) for text in data['text']]

### 2.3. Исключение стоп-слов из текстов сообщений

In [None]:
stop_words = set(stopwords.words("english"))


def _drop_stop_words(message):
    """Tokenize ``message`` and keep tokens whose lowercase form is not a stop word."""
    kept_tokens = [
        token for token in nltk.word_tokenize(message)
        if token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)


# The original casing of the kept tokens is preserved; only the lookup is lowercased.
data['stopwords_out'] = data['text'].map(_drop_stop_words)


Посмотрим, что у нас получилось.

In [None]:
data.head(10)

Unnamed: 0,is_spam,text,stems,lemmas,stopwords_out
0,0,"Go until jurong point, crazy.. Available only ...","go until jurong point , crazi .. avail onli in...","Go until jurong point , crazy .. Available onl...","Go jurong point , crazy .. Available bugis n g..."
1,0,Ok lar... Joking wif u oni...,ok lar ... joke wif u oni ...,Ok lar ... Joking wif u oni ...,Ok lar ... Joking wif u oni ...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri in 2 a wkli comp to win fa cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry 2 wkly comp win FA Cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say so earli hor ... u c alreadi then sa...,U dun say so early hor ... U c already then sa...,U dun say early hor ... U c already say ...
4,0,"Nah I don't think he goes to usf, he lives aro...","nah i do n't think he goe to usf , he live aro...","Nah I do n't think he go to usf , he life arou...","Nah n't think goes usf , lives around though"
5,1,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey there darl it 's been 3 week 's no...,FreeMsg Hey there darling it 's been 3 week 's...,FreeMsg Hey darling 's 3 week 's word back ! '...
6,0,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me ....,Even my brother is not like to speak with me ....,Even brother like speak . treat like aids pate...
7,0,As per your request 'Melle Melle (Oru Minnamin...,as per your request mell mell ( oru minnaminun...,As per your request 'Melle Melle ( Oru Minnami...,per request 'Melle Melle ( Oru Minnaminunginte...
8,1,WINNER!! As a valued network customer you have...,winner ! ! as a valu network custom you have b...,WINNER ! ! As a valued network customer you ha...,WINNER ! ! valued network customer selected re...
9,1,Had your mobile 11 months or more? U R entitle...,had your mobil 11 month or more ? u r entitl t...,Had your mobile 11 month or more ? U R entitle...,mobile 11 months ? U R entitled Update latest ...


## 3. Векторизация текста

In [None]:
# Hold out 10% of the messages for testing.
# - random_state makes the split (and every metric below) reproducible;
# - stratify keeps the spam/ham ratio equal in both parts, which matters
#   because only about 13% of the messages are spam.
train, test = train_test_split(
    data,
    test_size=0.1,
    shuffle=True,
    random_state=42,
    stratify=data['is_spam'],
)
# drop=True discards the old row labels instead of adding them as an 'index' column.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Target vectors for the training and the test part, as NumPy arrays.
y = train['is_spam'].to_numpy()
y_test = test['is_spam'].to_numpy()

### 3.1. Мешок слов (Bag of Words)

#### 3.1.1. Для первичного текста

In [None]:
# Bag-of-words encoder shared by all text variants: terms found in fewer than
# 5 training documents or in more than 40% of them are excluded.
vectorizer = CountVectorizer(min_df=5, max_df=0.4)

# The vocabulary is learned on the training text only; the test text is encoded with it.
X_bag = vectorizer.fit_transform(train['text'])
X_bag_test = vectorizer.transform(test['text'])

In [None]:
# shapes of the train and test bag-of-words matrices
X_bag.shape, X_bag_test.shape

((4652, 1544), (517, 1544))

#### 3.1.2. Для текста после стемминга

In [None]:
# Re-fit the shared CountVectorizer on the stemmed training text, then encode
# the stemmed test text with that vocabulary.
X_bag_stemm = vectorizer.fit_transform(train['stems'])
X_bag_stemm_test = vectorizer.transform(test['stems'])

#### 3.1.3. Для текста после лемматизации

In [None]:
# Re-fit the shared CountVectorizer on the lemmatized training text, then
# encode the lemmatized test text with that vocabulary.
X_bag_lemmas = vectorizer.fit_transform(train['lemmas'])
X_bag_lemmas_test = vectorizer.transform(test['lemmas'])

#### 3.1.4. Для текста после исключения стоп-слов

In [None]:
# Re-fit the shared CountVectorizer on the stop-word-free training text, then
# encode the matching test text with that vocabulary.
X_bag_stop = vectorizer.fit_transform(train['stopwords_out'])
X_bag_stop_test = vectorizer.transform(test['stopwords_out'])

### 3.2. TF-IDF

#### 3.2.1. Для первичного текста

In [None]:
# TF-IDF encoder shared by all text variants: terms found in fewer than
# 5 training documents or in more than 40% of them are excluded.
vectorizer_tf = TfidfVectorizer(min_df=5, max_df=0.4)

# Vocabulary and IDF weights come from the training text only; the test text
# is encoded with them.
X_tf = vectorizer_tf.fit_transform(train['text'])
X_tf_test = vectorizer_tf.transform(test['text'])

In [None]:
# shapes of the train and test TF-IDF matrices
X_tf.shape, X_tf_test.shape

((4652, 1544), (517, 1544))

#### 3.2.2. Для текста после стемминга

In [None]:
# Re-fit the shared TfidfVectorizer on the stemmed training text, then encode
# the stemmed test text with it.
X_tf_stemm = vectorizer_tf.fit_transform(train['stems'])
X_tf_stemm_test = vectorizer_tf.transform(test['stems'])

#### 3.2.3. Для текста после лемматизации

In [None]:
# Re-fit the shared TfidfVectorizer on the lemmatized training text, then
# encode the lemmatized test text with it.
X_tf_lemmas = vectorizer_tf.fit_transform(train['lemmas'])
X_tf_lemmas_test = vectorizer_tf.transform(test['lemmas'])

#### 3.2.4. Для текста после исключения стоп-слов

In [None]:
# Re-fit the shared TfidfVectorizer on the stop-word-free training text, then
# encode the matching test text with it.
X_tf_stop = vectorizer_tf.fit_transform(train['stopwords_out'])
X_tf_stop_test = vectorizer_tf.transform(test['stopwords_out'])

## 4. Моделирование

In [None]:
# Helper that fits a model and reports its metrics on the test data
def get_metrics(X, y, X_test, y_test, model):
    """Fit ``model`` on the training data and score it on the test data.

    Args:
        X: Training features, passed to ``model.fit``.
        y: Training labels, passed to ``model.fit``.
        X_test: Test features, passed to ``predict`` and ``predict_proba``.
        y_test: True labels of the test samples.
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
            It is fitted in place.

    Returns:
        A tuple ``(accuracy, f1, roc_auc)``. Accuracy and F1 are computed from
        the predicted labels; ROC-AUC from the second column of
        ``predict_proba``.
    """
    model.fit(X, y)
    preds = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC-AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    roc_auc = roc_auc_score(y_test, positive_scores)
    return accuracy, f1, roc_auc

### 4.1. Модели на основе мешка слов

#### 4.1.1. Логистическая регрессия

In [None]:
# Logistic regression on bag-of-words features. Each text variant gets its own
# newly constructed estimator with the same settings.
logreg_params = {'C': 0.1}

# raw text
accuracy_lg_bag, f1_lg_bag, roc_auc_lg_bag = get_metrics(
    X_bag, y, X_bag_test, y_test, LogisticRegression(**logreg_params)
)
# text after stemming
accuracy_lg_bag_stemm, f1_lg_bag_stemm, roc_auc_lg_bag_stemm = get_metrics(
    X_bag_stemm, y, X_bag_stemm_test, y_test, LogisticRegression(**logreg_params)
)
# text after lemmatization
accuracy_lg_bag_lemmas, f1_lg_bag_lemmas, roc_auc_lg_bag_lemmas = get_metrics(
    X_bag_lemmas, y, X_bag_lemmas_test, y_test, LogisticRegression(**logreg_params)
)
# text after stop-word removal
accuracy_lg_bag_stop, f1_lg_bag_stop, roc_auc_lg_bag_stop = get_metrics(
    X_bag_stop, y, X_bag_stop_test, y_test, LogisticRegression(**logreg_params)
)

#### 4.1.2. KNN

In [None]:
# k-nearest neighbours (10 neighbours, cosine metric) on bag-of-words features.
# Each text variant gets its own newly constructed estimator.
knn_params = {'n_neighbors': 10, 'metric': 'cosine'}

# raw text
accuracy_knn_bag, f1_knn_bag, roc_auc_knn_bag = get_metrics(
    X_bag, y, X_bag_test, y_test, KNeighborsClassifier(**knn_params)
)
# text after stemming
accuracy_knn_bag_stemm, f1_knn_bag_stemm, roc_auc_knn_bag_stemm = get_metrics(
    X_bag_stemm, y, X_bag_stemm_test, y_test, KNeighborsClassifier(**knn_params)
)
# text after lemmatization
accuracy_knn_bag_lemmas, f1_knn_bag_lemmas, roc_auc_knn_bag_lemmas = get_metrics(
    X_bag_lemmas, y, X_bag_lemmas_test, y_test, KNeighborsClassifier(**knn_params)
)
# text after stop-word removal
accuracy_knn_bag_stop, f1_knn_bag_stop, roc_auc_knn_bag_stop = get_metrics(
    X_bag_stop, y, X_bag_stop_test, y_test, KNeighborsClassifier(**knn_params)
)

#### 4.1.3. Дерево решений

In [None]:
# Decision tree (depth <= 3) on bag-of-words features. random_state is fixed
# so that repeated runs yield the same tree and therefore the same metrics.
tree_params = {'max_depth': 3, 'random_state': 42}

# raw text
accuracy_dt_bag, f1_dt_bag, roc_auc_dt_bag = get_metrics(
    X_bag, y, X_bag_test, y_test, DecisionTreeClassifier(**tree_params)
)
# text after stemming
accuracy_dt_bag_stemm, f1_dt_bag_stemm, roc_auc_dt_bag_stemm = get_metrics(
    X_bag_stemm, y, X_bag_stemm_test, y_test, DecisionTreeClassifier(**tree_params)
)
# text after lemmatization
accuracy_dt_bag_lemmas, f1_dt_bag_lemmas, roc_auc_dt_bag_lemmas = get_metrics(
    X_bag_lemmas, y, X_bag_lemmas_test, y_test, DecisionTreeClassifier(**tree_params)
)
# text after stop-word removal
accuracy_dt_bag_stop, f1_dt_bag_stop, roc_auc_dt_bag_stop = get_metrics(
    X_bag_stop, y, X_bag_stop_test, y_test, DecisionTreeClassifier(**tree_params)
)

#### 4.1.4. RandomForest

In [None]:
# Random forest (100 trees, depth <= 20) on bag-of-words features.
# random_state is fixed so that repeated runs yield the same forest and
# therefore the same metrics.
forest_params = {'n_estimators': 100, 'max_depth': 20, 'random_state': 42}

# raw text
accuracy_rf_bag, f1_rf_bag, roc_auc_rf_bag = get_metrics(
    X_bag, y, X_bag_test, y_test, RandomForestClassifier(**forest_params)
)
# text after stemming
accuracy_rf_bag_stemm, f1_rf_bag_stemm, roc_auc_rf_bag_stemm = get_metrics(
    X_bag_stemm, y, X_bag_stemm_test, y_test, RandomForestClassifier(**forest_params)
)
# text after lemmatization
accuracy_rf_bag_lemmas, f1_rf_bag_lemmas, roc_auc_rf_bag_lemmas = get_metrics(
    X_bag_lemmas, y, X_bag_lemmas_test, y_test, RandomForestClassifier(**forest_params)
)
# text after stop-word removal
accuracy_rf_bag_stop, f1_rf_bag_stop, roc_auc_rf_bag_stop = get_metrics(
    X_bag_stop, y, X_bag_stop_test, y_test, RandomForestClassifier(**forest_params)
)

#### 4.1.5. Наивный байесовский классификатор

In [None]:
# Multinomial naive Bayes (alpha = 1) on bag-of-words features. Each text
# variant gets its own newly constructed estimator.
nb_params = {'alpha': 1.}

# raw text
accuracy_nb_bag, f1_nb_bag, roc_auc_nb_bag = get_metrics(
    X_bag, y, X_bag_test, y_test, MultinomialNB(**nb_params)
)
# text after stemming
accuracy_nb_bag_stemm, f1_nb_bag_stemm, roc_auc_nb_bag_stemm = get_metrics(
    X_bag_stemm, y, X_bag_stemm_test, y_test, MultinomialNB(**nb_params)
)
# text after lemmatization
accuracy_nb_bag_lemmas, f1_nb_bag_lemmas, roc_auc_nb_bag_lemmas = get_metrics(
    X_bag_lemmas, y, X_bag_lemmas_test, y_test, MultinomialNB(**nb_params)
)
# text after stop-word removal
accuracy_nb_bag_stop, f1_nb_bag_stop, roc_auc_nb_bag_stop = get_metrics(
    X_bag_stop, y, X_bag_stop_test, y_test, MultinomialNB(**nb_params)
)

### 4.2. Модели на основе TF-IDF

#### 4.2.1. Логистическая регрессия

In [None]:
# Logistic regression on TF-IDF features. Each text variant gets its own
# newly constructed estimator with the same settings.
logreg_params = {'C': 0.1}

# raw text
accuracy_lg_tf, f1_lg_tf, roc_auc_lg_tf = get_metrics(
    X_tf, y, X_tf_test, y_test, LogisticRegression(**logreg_params)
)
# text after stemming
accuracy_lg_tf_stemm, f1_lg_tf_stemm, roc_auc_lg_tf_stemm = get_metrics(
    X_tf_stemm, y, X_tf_stemm_test, y_test, LogisticRegression(**logreg_params)
)
# text after lemmatization
accuracy_lg_tf_lemmas, f1_lg_tf_lemmas, roc_auc_lg_tf_lemmas = get_metrics(
    X_tf_lemmas, y, X_tf_lemmas_test, y_test, LogisticRegression(**logreg_params)
)
# text after stop-word removal
accuracy_lg_tf_stop, f1_lg_tf_stop, roc_auc_lg_tf_stop = get_metrics(
    X_tf_stop, y, X_tf_stop_test, y_test, LogisticRegression(**logreg_params)
)

#### 4.2.2. KNN

In [None]:
# k-nearest neighbours (10 neighbours, cosine metric) on TF-IDF features.
# Each text variant gets its own newly constructed estimator.
knn_params = {'n_neighbors': 10, 'metric': 'cosine'}

# raw text
accuracy_knn_tf, f1_knn_tf, roc_auc_knn_tf = get_metrics(
    X_tf, y, X_tf_test, y_test, KNeighborsClassifier(**knn_params)
)
# text after stemming
accuracy_knn_tf_stemm, f1_knn_tf_stemm, roc_auc_knn_tf_stemm = get_metrics(
    X_tf_stemm, y, X_tf_stemm_test, y_test, KNeighborsClassifier(**knn_params)
)
# text after lemmatization
accuracy_knn_tf_lemmas, f1_knn_tf_lemmas, roc_auc_knn_tf_lemmas = get_metrics(
    X_tf_lemmas, y, X_tf_lemmas_test, y_test, KNeighborsClassifier(**knn_params)
)
# text after stop-word removal
accuracy_knn_tf_stop, f1_knn_tf_stop, roc_auc_knn_tf_stop = get_metrics(
    X_tf_stop, y, X_tf_stop_test, y_test, KNeighborsClassifier(**knn_params)
)

#### 4.2.3. Дерево решений

In [None]:
# Decision tree (depth <= 3) on TF-IDF features. random_state is fixed so that
# repeated runs yield the same tree and therefore the same metrics.
tree_params = {'max_depth': 3, 'random_state': 42}

# raw text
accuracy_dt_tf, f1_dt_tf, roc_auc_dt_tf = get_metrics(
    X_tf, y, X_tf_test, y_test, DecisionTreeClassifier(**tree_params)
)
# text after stemming
accuracy_dt_tf_stemm, f1_dt_tf_stemm, roc_auc_dt_tf_stemm = get_metrics(
    X_tf_stemm, y, X_tf_stemm_test, y_test, DecisionTreeClassifier(**tree_params)
)
# text after lemmatization
accuracy_dt_tf_lemmas, f1_dt_tf_lemmas, roc_auc_dt_tf_lemmas = get_metrics(
    X_tf_lemmas, y, X_tf_lemmas_test, y_test, DecisionTreeClassifier(**tree_params)
)
# text after stop-word removal
accuracy_dt_tf_stop, f1_dt_tf_stop, roc_auc_dt_tf_stop = get_metrics(
    X_tf_stop, y, X_tf_stop_test, y_test, DecisionTreeClassifier(**tree_params)
)

#### 4.2.4. RandomForest

In [None]:
# Random forest (100 trees, depth <= 20) on TF-IDF features. random_state is
# fixed so that repeated runs yield the same forest and therefore the same
# metrics.
forest_params = {'n_estimators': 100, 'max_depth': 20, 'random_state': 42}

# raw text
accuracy_rf_tf, f1_rf_tf, roc_auc_rf_tf = get_metrics(
    X_tf, y, X_tf_test, y_test, RandomForestClassifier(**forest_params)
)
# text after stemming
accuracy_rf_tf_stemm, f1_rf_tf_stemm, roc_auc_rf_tf_stemm = get_metrics(
    X_tf_stemm, y, X_tf_stemm_test, y_test, RandomForestClassifier(**forest_params)
)
# text after lemmatization
accuracy_rf_tf_lemmas, f1_rf_tf_lemmas, roc_auc_rf_tf_lemmas = get_metrics(
    X_tf_lemmas, y, X_tf_lemmas_test, y_test, RandomForestClassifier(**forest_params)
)
# text after stop-word removal
accuracy_rf_tf_stop, f1_rf_tf_stop, roc_auc_rf_tf_stop = get_metrics(
    X_tf_stop, y, X_tf_stop_test, y_test, RandomForestClassifier(**forest_params)
)

#### 4.2.5. Наивный байесовский классификатор

In [None]:
# Multinomial naive Bayes (alpha = 1) on TF-IDF features. Each text variant
# gets its own newly constructed estimator.
nb_params = {'alpha': 1.}

# raw text
accuracy_nb_tf, f1_nb_tf, roc_auc_nb_tf = get_metrics(
    X_tf, y, X_tf_test, y_test, MultinomialNB(**nb_params)
)
# text after stemming
accuracy_nb_tf_stemm, f1_nb_tf_stemm, roc_auc_nb_tf_stemm = get_metrics(
    X_tf_stemm, y, X_tf_stemm_test, y_test, MultinomialNB(**nb_params)
)
# text after lemmatization
accuracy_nb_tf_lemmas, f1_nb_tf_lemmas, roc_auc_nb_tf_lemmas = get_metrics(
    X_tf_lemmas, y, X_tf_lemmas_test, y_test, MultinomialNB(**nb_params)
)
# text after stop-word removal
accuracy_nb_tf_stop, f1_nb_tf_stop, roc_auc_nb_tf_stop = get_metrics(
    X_tf_stop, y, X_tf_stop_test, y_test, MultinomialNB(**nb_params)
)

## 5. Сравнительный анализ

In [None]:
# Labels of the three experiment dimensions. product() below iterates them as
# vectorization (outer) -> model -> text processing (inner), and the metric
# lists are written in exactly that order: one row per (vectorization, model),
# four text-processing variants per row.
vectorization_approaches = ['Bag of Words', 'TF-IDF']
models = ['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'Naive Bayes']
text_processing_methods = ['No Processing', 'Stemming', 'Lemmatization', 'Stop Words Removal']

accuracies = [
    accuracy_lg_bag, accuracy_lg_bag_stemm, accuracy_lg_bag_lemmas, accuracy_lg_bag_stop,
    accuracy_knn_bag, accuracy_knn_bag_stemm, accuracy_knn_bag_lemmas, accuracy_knn_bag_stop,
    accuracy_dt_bag, accuracy_dt_bag_stemm, accuracy_dt_bag_lemmas, accuracy_dt_bag_stop,
    accuracy_rf_bag, accuracy_rf_bag_stemm, accuracy_rf_bag_lemmas, accuracy_rf_bag_stop,
    accuracy_nb_bag, accuracy_nb_bag_stemm, accuracy_nb_bag_lemmas, accuracy_nb_bag_stop,
    accuracy_lg_tf, accuracy_lg_tf_stemm, accuracy_lg_tf_lemmas, accuracy_lg_tf_stop,
    accuracy_knn_tf, accuracy_knn_tf_stemm, accuracy_knn_tf_lemmas, accuracy_knn_tf_stop,
    accuracy_dt_tf, accuracy_dt_tf_stemm, accuracy_dt_tf_lemmas, accuracy_dt_tf_stop,
    accuracy_rf_tf, accuracy_rf_tf_stemm, accuracy_rf_tf_lemmas, accuracy_rf_tf_stop,
    accuracy_nb_tf, accuracy_nb_tf_stemm, accuracy_nb_tf_lemmas, accuracy_nb_tf_stop,
]
f1_scores = [
    f1_lg_bag, f1_lg_bag_stemm, f1_lg_bag_lemmas, f1_lg_bag_stop,
    f1_knn_bag, f1_knn_bag_stemm, f1_knn_bag_lemmas, f1_knn_bag_stop,
    f1_dt_bag, f1_dt_bag_stemm, f1_dt_bag_lemmas, f1_dt_bag_stop,
    f1_rf_bag, f1_rf_bag_stemm, f1_rf_bag_lemmas, f1_rf_bag_stop,
    f1_nb_bag, f1_nb_bag_stemm, f1_nb_bag_lemmas, f1_nb_bag_stop,
    f1_lg_tf, f1_lg_tf_stemm, f1_lg_tf_lemmas, f1_lg_tf_stop,
    f1_knn_tf, f1_knn_tf_stemm, f1_knn_tf_lemmas, f1_knn_tf_stop,
    f1_dt_tf, f1_dt_tf_stemm, f1_dt_tf_lemmas, f1_dt_tf_stop,
    f1_rf_tf, f1_rf_tf_stemm, f1_rf_tf_lemmas, f1_rf_tf_stop,
    f1_nb_tf, f1_nb_tf_stemm, f1_nb_tf_lemmas, f1_nb_tf_stop,
]
roc_aucs = [
    roc_auc_lg_bag, roc_auc_lg_bag_stemm, roc_auc_lg_bag_lemmas, roc_auc_lg_bag_stop,
    roc_auc_knn_bag, roc_auc_knn_bag_stemm, roc_auc_knn_bag_lemmas, roc_auc_knn_bag_stop,
    roc_auc_dt_bag, roc_auc_dt_bag_stemm, roc_auc_dt_bag_lemmas, roc_auc_dt_bag_stop,
    roc_auc_rf_bag, roc_auc_rf_bag_stemm, roc_auc_rf_bag_lemmas, roc_auc_rf_bag_stop,
    roc_auc_nb_bag, roc_auc_nb_bag_stemm, roc_auc_nb_bag_lemmas, roc_auc_nb_bag_stop,
    roc_auc_lg_tf, roc_auc_lg_tf_stemm, roc_auc_lg_tf_lemmas, roc_auc_lg_tf_stop,
    roc_auc_knn_tf, roc_auc_knn_tf_stemm, roc_auc_knn_tf_lemmas, roc_auc_knn_tf_stop,
    roc_auc_dt_tf, roc_auc_dt_tf_stemm, roc_auc_dt_tf_lemmas, roc_auc_dt_tf_stop,
    roc_auc_rf_tf, roc_auc_rf_tf_stemm, roc_auc_rf_tf_lemmas, roc_auc_rf_tf_stop,
    roc_auc_nb_tf, roc_auc_nb_tf_stemm, roc_auc_nb_tf_lemmas, roc_auc_nb_tf_stop,
]

combinations = list(product(vectorization_approaches, models, text_processing_methods))

# Every combination needs exactly one value of each metric; a mismatch means a
# list above is out of sync with the label lists.
if not (len(combinations) == len(accuracies) == len(f1_scores) == len(roc_aucs)):
    raise ValueError('metric lists do not match the number of experiment combinations')

# Pair each combination with its metrics positionally, without consuming the lists.
result = [
    [approach, model, method, accuracy, f1, roc_auc]
    for (approach, model, method), accuracy, f1, roc_auc
    in zip(combinations, accuracies, f1_scores, roc_aucs)
]
results_df = pd.DataFrame.from_records(
    result,
    columns=['Vectorization Approach', 'Model', 'Text Processing Method',
             'Accuracy', 'F1', 'ROC-AUC'],
)
metric_columns = ['Accuracy', 'F1', 'ROC-AUC']
results_df[metric_columns] = results_df[metric_columns].round(3)

# Best configurations first, ranked by F1.
results_df.sort_values(by='F1', ascending=False)

Unnamed: 0,Vectorization Approach,Model,Text Processing Method,Accuracy,F1,ROC-AUC
16,Bag of Words,Naive Bayes,No Processing,0.994,0.974,0.991
18,Bag of Words,Naive Bayes,Lemmatization,0.994,0.974,0.994
17,Bag of Words,Naive Bayes,Stemming,0.994,0.974,0.993
19,Bag of Words,Naive Bayes,Stop Words Removal,0.986,0.94,0.994
33,TF-IDF,Random Forest,Stemming,0.986,0.935,0.994
39,TF-IDF,Naive Bayes,Stop Words Removal,0.985,0.926,0.997
38,TF-IDF,Naive Bayes,Lemmatization,0.985,0.926,0.998
37,TF-IDF,Naive Bayes,Stemming,0.985,0.926,0.997
36,TF-IDF,Naive Bayes,No Processing,0.985,0.926,0.996
3,Bag of Words,Logistic Regression,Stop Words Removal,0.983,0.916,0.994


**Вывод:**

Мы видим, что лучшими метриками обладает наивный байесовский классификатор. Причём на значения метрик, хоть и не очень сильно, влияет то, какое сочетание векторизации и предобработки мы выбираем. Лучше всего работает векторизация TF-IDF без предобработки и с простым исключением стоп-слов. Если применять мешок слов, то лучше выбрать лемматизацию или не делать предобработку вовсе.

Из-за несбалансированности классов дерево решений, а также логистическая регрессия при векторизации TF-IDF показали наихудший результат, особенно в части F1-score.