In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('train_spam.csv')
df_test = pd.read_csv('test_spam.csv')



In [4]:
df.shape

(16278, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16278 entries, 0 to 16277
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text_type  16278 non-null  object
 1   text       16278 non-null  object
dtypes: object(2)
memory usage: 254.5+ KB


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4070 entries, 0 to 4069
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4070 non-null   object
dtypes: object(1)
memory usage: 31.9+ KB


In [7]:
df.head()

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...


In [8]:
df['text_type'].value_counts()

ham     11469
spam     4809
Name: text_type, dtype: int64

In [9]:
df.groupby('text_type').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
text_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,11469,11458,SPAM ALERT 🚔 User: Username: @DillyBubbl...,7
spam,4809,4809,plzz visit my website moviesgodml to get all m...,1


In [10]:
# Convert every row to lower register
df['text'] = df['text'].apply(lambda x: x.lower())
df_test['text'] = df_test['text'].apply(lambda x: x.lower())

In [11]:
df['text']

0        make sure alex knows his birthday is over in f...
1        a resume for john lavorato thanks vince i will...
2        plzz visit my website moviesgodml to get all m...
3        urgent your mobile number has been awarded wit...
4        overview of hr associates analyst project per ...
                               ...                        
16273    if you are interested in binary options tradin...
16274    dirty pictureblyk on aircel thanks you for bei...
16275    or you could do this g on mon 1635465 sep 1635...
16276    insta reels par 80 गंद bhara pada hai 👀 kuch b...
16277    alex s paper comments 1 in the sentence betwee...
Name: text, Length: 16278, dtype: object

### ПРЕДОБРАБОТКА ТЕКСТА СООБЩЕНИЙ

In [12]:
import re
from nltk.stem import WordNetLemmatizer
import sklearn
import codecs
import pymorphy2
import seaborn as sns
from nltk.stem.snowball import SnowballStemmer

In [13]:
df

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...
...,...,...
16273,spam,if you are interested in binary options tradin...
16274,spam,dirty pictureblyk on aircel thanks you for bei...
16275,ham,or you could do this g on mon 1635465 sep 1635...
16276,ham,insta reels par 80 गंद bhara pada hai 👀 kuch b...


In [14]:
df['text_type'] = df['text_type'].replace({'ham':0,'spam':1})

In [15]:
from nltk.corpus import stopwords
stopWords = set(stopwords.words('russian'))

def df_preprocess(text):    
    reg = re.compile('[^a-zA-Z0-9 ]') 
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'cite', text)
    text = re.sub('@[^\s]+', 'user', text)
    text = reg.sub(' ', text)
    
    # Лемматизация
    # morph = pymorphy2.MorphAnalyzer()
    # text =[morph.parse(word)[0].normal_form for word in text.split()]

    # Стемминг
    # stemmer = SnowballStemmer("english")
    # text =[stemmer.stem(word) for word in text.split()]

    #Стемминг с удалением стопслов
    stemmer = SnowballStemmer("english")
    text = ' '.join([stemmer.stem(word) for word in text.split() if word not in stopWords])

    return text

In [16]:
#Проверка лемматизации на 1м примере
print(df['text'].iloc[0])
%time print(df_preprocess(df['text'].iloc[0]))

make sure alex knows his birthday is over in fifteen minutes as far as youre concerned
make sure alex know his birthday is over in fifteen minut as far as your concern
CPU times: total: 0 ns
Wall time: 0 ns


In [17]:
# предобработка текста и лемматизация датасета потребовала бы +- 48 минут

In [18]:
#Проверка стемминга на 1м примере
print(df['text'].iloc[0])
%time print(df_preprocess(df['text'].iloc[0]))

make sure alex knows his birthday is over in fifteen minutes as far as youre concerned
make sure alex know his birthday is over in fifteen minut as far as your concern
CPU times: total: 0 ns
Wall time: 0 ns


In [19]:
# предобработка текста и стемминг датасета потребовала бы меньше минуты

In [20]:
#Стемминг с удалением стоп-слов займет меньше минуты времени

In [21]:
df['text'] = df['text'].apply(df_preprocess)

In [22]:
df.head()

Unnamed: 0,text_type,text
0,0,make sure alex know his birthday is over in fi...
1,0,a resum for john lavorato thank vinc i will ge...
2,1,plzz visit my websit moviesgodml to get all mo...
3,1,urgent your mobil number has been award with a...
4,0,overview of hr associ analyst project per davi...


In [23]:
df_test['text'] = df_test['text'].apply(df_preprocess)

In [24]:
X_train = df['text']
X_test = df_test['text']
y_train = df['text_type']

### Использую векторизатор BOW с биграмами

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 2))

X_train_BOW_bi = vectorizer.fit_transform(X_train)
X_test_BOW_bi = vectorizer.transform(X_test)

print(X_train_BOW_bi.shape, X_test_BOW_bi.shape)
     

(16278, 334737) (4070, 334737)


### Использую векторизатор TF-IDF с биграмами

In [73]:
from sklearn.feature_extraction.text import  TfidfVectorizer
vectorizer_TFIDF = TfidfVectorizer(ngram_range=(1,2))

X_train_TFIDF_bi = vectorizer_TFIDF.fit_transform(X_train)
X_test_TFIDF_bi = vectorizer_TFIDF.transform(X_test)

print(X_train_TFIDF_bi.shape, X_test_TFIDF_bi.shape)

(16278, 38065) (4070, 38065)


### Сделаю отбор фичей для каждого типа векторизации

In [27]:
from sklearn.feature_selection import SelectFromModel

from sklearn.svm import LinearSVC

In [28]:
lsvc = LinearSVC(C = .5,max_iter=10000) # C = 0.5
selective_model = SelectFromModel(lsvc,  max_features  = None)

In [29]:
X_train_BOW_bi_select_features = selective_model.fit_transform(X_train_BOW_bi,y_train)
X_test_BOW_bi_select_features = selective_model.transform(X_test_BOW_bi)
print('New shapes: ', X_train_BOW_bi_select_features.shape, X_test_BOW_bi_select_features.shape)

New shapes:  (16278, 66155) (4070, 66155)


In [30]:
X_train_TF_IDF_bi_select_features = selective_model.fit_transform(X_train_TFIDF_bi,y_train)
X_test_TF_IDF_bi_select_features = selective_model.transform(X_test_TFIDF_bi)
print('New shapes: ', X_train_TF_IDF_bi_select_features.shape, X_test_TF_IDF_bi_select_features.shape)

New shapes:  (16278, 102112) (4070, 102112)


In [51]:
#В TF_IDF слишком много признаков для обучения, поэтому я буду использовать ее модель без отбора признаков. Это не самый лучший вариант, но метрики изменятся не сильно

In [71]:
from sklearn.feature_extraction.text import  TfidfVectorizer
vectorizer_TFIDF = TfidfVectorizer()

X_train_TFIDF = vectorizer_TFIDF.fit_transform(X_train)
X_test_TFIDF = vectorizer_TFIDF.transform(X_test)

print(X_train_TFIDF.shape, X_test_TFIDF.shape)

(16278, 38065) (4070, 38065)


## Теперь перехожу к обучению моделей

In [53]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression

In [54]:
cv = cross_validate(LogisticRegression(max_iter=1000),X_train_BOW_bi_select_features,y_train,cv=5,scoring=['accuracy','f1','roc_auc'])

### Проверка работоспособности модели на базовой логистической регрессии

In [55]:
print(f'Accuracy: {np.mean(cv["test_accuracy"])}')
print(f'f1: {np.mean(cv["test_f1"])}')
print(f'roc_auc: {np.mean(cv["test_roc_auc"])}')

Accuracy: 0.9443421196647004
f1: 0.8999804864383044
roc_auc: 0.9785849515191487


In [56]:
LogisticRegression(max_iter=1000).get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 1000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

### Даже "из коробки" логистическая регрессия показывает достаточно высокие результаты

#### Попробую обучить модель на данных, векторизованных с помощью TF-IDF

In [57]:
cv_tf_idf = cross_validate(LogisticRegression(max_iter=1000),X_train_TF_IDF_bi_select_features,y_train,cv=5,scoring=['accuracy','f1','roc_auc'])

### Проверка работоспособности модели на базовой логистической регрессии

In [58]:
print(f'Accuracy: {np.mean(cv_tf_idf["test_accuracy"])}')
print(f'f1: {np.mean(cv_tf_idf["test_f1"])}')
print(f'roc_auc: {np.mean(cv_tf_idf["test_roc_auc"])}')

Accuracy: 0.9062543167381879
f1: 0.8176817759490026
roc_auc: 0.9764501648718398


### метрики уменьшились несильно

## Однако попробую подобрать наилучшие гиперпараметры 

In [62]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [63]:
param_grid_for_logreg = [{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l2', 'l1'], 'solver': ['liblinear']}, 
              {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l2'], 'solver': ['lbfgs', 'newton-cg']}]

In [64]:
gridsearch = GridSearchCV(LogisticRegression(max_iter=1000), 
                          param_grid=param_grid_for_logreg,
                         cv = 5,
                         )

In [65]:
gridsearch.fit(X_train_BOW_bi_select_features,y_train)

In [66]:
gridsearch.best_params_

{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}

In [67]:
best_logreg_bow_model = gridsearch.best_estimator_

In [68]:
cv_best_logreg_bow = cross_validate(best_logreg_bow_model, X_train_BOW_bi_select_features, y_train, cv=5,scoring=['accuracy','f1','roc_auc'])

In [69]:
print(f'Accuracy: {np.mean(cv_best_logreg_bow["test_accuracy"])}')
print(f'f1: {np.mean(cv_best_logreg_bow["test_f1"])}')
print(f'roc_auc: {np.mean(cv_best_logreg_bow["test_roc_auc"])}')

Accuracy: 0.9444035635971121
f1: 0.9001029250843484
roc_auc: 0.9785512323170732


### Метрики немного улучшились

### Теперь попробую подобрать для модели с векторизацией TF-IDF

In [74]:
gridsearch_tf_idf = GridSearchCV(LogisticRegression(max_iter=1000), 
                          param_grid=param_grid_for_logreg,
                         cv = 5,
                         )

In [75]:
%time gridsearch_tf_idf.fit(X_train_TFIDF,y_train)

CPU times: total: 1min 52s
Wall time: 1min 56s


In [76]:
gridsearch_tf_idf.best_params_

{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}

In [77]:
best_logreg_tf_idf_model = gridsearch_tf_idf.best_estimator_

In [78]:
cv_best_logreg_tf_idf = cross_validate(best_logreg_tf_idf_model, X_train_TFIDF, y_train, cv=5,scoring=['accuracy','f1','roc_auc'])

In [79]:
print(f'Accuracy: {np.mean(cv_best_logreg_tf_idf["test_accuracy"])}')
print(f'f1: {np.mean(cv_best_logreg_tf_idf["test_f1"])}')
print(f'roc_auc: {np.mean(cv_best_logreg_tf_idf["test_roc_auc"])}')

Accuracy: 0.9354960993670671
f1: 0.8858835189802573
roc_auc: 0.9780317045678772


### Метрики здесь немного хуже, чем у BOW

### Теперь попробую применить модель наивного байесовского классификатора

#### Тк у него нет параметров для оптимизации, сделаю просто кросс-валидацию

In [80]:
from sklearn.naive_bayes import GaussianNB

In [82]:
cv_nb_bow = cross_validate(GaussianNB(),
                       X_train_BOW_bi_select_features.toarray(), 
                       y_train,
                       cv=5,
                       scoring=['accuracy','f1','roc_auc'])

In [83]:
print(f'Accuracy: {np.mean(cv_nb_bow["test_accuracy"])}')
print(f'f1: {np.mean(cv_nb_bow["test_f1"])}')
print(f'roc_auc: {np.mean(cv_nb_bow["test_roc_auc"])}')

Accuracy: 0.911782930815189
f1: 0.837094840651335
roc_auc: 0.870308429472451


### Метрики ухудшились

In [None]:
cv_nb_tf_idf = cross_validate(GaussianNB(),
                       X_train_TFIDF.toarray(), 
                       y_train,
                       cv=5,
                       scoring=['accuracy','f1','roc_auc'])

In [86]:
print(f'Accuracy: {np.mean(cv_nb_tf_idf["test_accuracy"])}')
print(f'f1: {np.mean(cv_nb_tf_idf["test_f1"])}')
print(f'roc_auc: {np.mean(cv_nb_tf_idf["test_roc_auc"])}')

Accuracy: 0.7644059790833984
f1: 0.681719383910161
roc_auc: 0.7903776672567971


### Метрики сильно ухудшились

### Я выбрал модель логистической регрессии и наивного байесовского классификатора, поскольку в данной статье: https://habr.com/ru/articles/768470/ приведены доказательства в пользу них в качестве наилучших моделей для классификации спама

##### По итогам, наилучшая модель классификации - модель логистической регрессии с параметрами:{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}. А векторизация BOW

In [91]:
the_best_model = best_logreg_bow_model

In [92]:
cv_the_best = cross_validate(the_best_model, X_train_BOW_bi_select_features, y_train, cv=5,scoring=['accuracy','f1','roc_auc'])

In [93]:
print(f'Accuracy: {np.mean(cv_the_best["test_accuracy"])}')
print(f'f1: {np.mean(cv_the_best["test_f1"])}')
print(f'roc_auc: {np.mean(cv_the_best["test_roc_auc"])}')

Accuracy: 0.9444035635971121
f1: 0.9001029250843484
roc_auc: 0.9785512323170732


### ЛУЧШИЙ СКОРИНГ МОДЕЛИ
Accuracy: 0.9444035635971121   
f1: 0.9001029250843484   
roc_auc: 0.9785512323170732   

In [94]:
y_best_predict = the_best_model.predict(X_test_BOW_bi_select_features)

In [95]:
score_df = pd.DataFrame({'score':y_best_predict,'text':df_test['text']})

In [96]:
score_df.to_csv('score.csv')