# Загрузка данных + импорты

## Импорты

In [1]:
#импорты
#from nltk.tokenize import word_tokenize, RegexpTokenizer
#from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, cross_validate
import pandas as pd
from sklearn.naive_bayes import ComplementNB
#from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold, StratifiedShuffleSplit
from scipy.stats import ttest_ind
from scipy import stats as st
import numpy as np

In [2]:
FILESOURCE = 'SMSSpamCollection.txt' # файл источник


## Чтение файла

In [3]:
# читаем файл
with open(FILESOURCE) as f: 
    full_file = f.read()
print(full_file[:20])

ham	Go until jurong 


**Вывод:** данные удалось прочитать, уже неплохо

# Предобработка

In [4]:
# нижний регистр и разделение на строки.
full_file = full_file.lower()
lst = full_file.splitlines()

In [5]:
# Отделение таргета
lst_with_target = []
for line in lst:
    lst_tmp = []
    lst_line = line.split('\t')  
    if lst_line[0] == 'spam' or lst_line[0] == 'ham':
        lst_tmp.append('ok')
        lst_tmp.append(lst_line[0])
        lst_tmp.append(lst_line[1:])
        lst_with_target.append(lst_tmp)
    else:
        lst_tmp.append('not_ok')
        lst_tmp.append(lst_line[0])
        lst_tmp.append(lst_line[1:])
        lst_with_target.append(lst_tmp)

In [6]:
# проверка на структуру файла
for i in range(len(lst_with_target)):
    if lst_with_target[i][0] != 'ok':
        print(lst_with_target[i], 'is not ok')
    if len(lst_with_target[i]) > 3:
        print(lst_with_target[i], '>3')

**Вывод:** структура файла соблюдается

In [7]:
vectorizer = CountVectorizer(stop_words='english', token_pattern='[a-z]\w+')
X = vectorizer.fit_transform(lst)
#vectorizer.get_feature_names_out()


In [8]:
vectorizer2 = CountVectorizer(token_pattern='[0-9]\w+')
X2 = vectorizer2.fit_transform(lst)
vectorizer3 = CountVectorizer(token_pattern='[§!@#$%^&*()_+-<>:"|\?~{}]')
X3 = vectorizer3.fit_transform(lst)


In [9]:
df = pd.DataFrame(X.toarray(), columns= vectorizer.get_feature_names_out())
df2 = pd.DataFrame(X2.toarray(), columns= vectorizer2.get_feature_names_out())
df3 = pd.DataFrame(X3.toarray(),columns=vectorizer3.get_feature_names_out())

In [10]:
df['words_with_digits'] = df2.T.sum()
df['special_symbols'] = df3.T.sum()

In [11]:
df['target'] =  (df['spam'] > 0).apply(int)

df = df.drop(['spam', 'ham'], axis=1)

In [12]:
len(vectorizer.get_feature_names_out())

7680

In [13]:
# Создаем транспонированый df
df_t = df.T

## Среднее количество слов в сообщении

In [14]:
df[df['target'] == 1].T.sum().mean()

39.163319946452475

In [15]:
df[df['target'] == 0].T.sum().mean()

10.946550652579242

In [16]:
# разделение
features = df.drop('target',axis=1)
target = df['target']
features_train, features_test, target_train, target_test = train_test_split(features,target, test_size=.3, random_state=123456)
print(features_train.shape, features_test.shape, target_train.shape, target_test.shape )

(3901, 7680) (1673, 7680) (3901,) (1673,)


In [17]:
features_train['len_diff'] = abs(df_t.sum() - features_train[target_train == 1].T.sum().mean())
features_test['len_diff'] = abs(df_t.sum() - features_train[target_train == 1].T.sum().mean())
features['len_diff'] = abs(df_t.sum() - features_train[target_train == 1].T.sum().mean())

### Сбалансированный df

In [18]:
features_train_ones = features_train[target_train == 1]

In [19]:
features_train_zeros = features_train[target_train == 0].sample(n=features_train_ones.shape[0])

In [20]:
features_train_balanced = pd.concat([features_train_ones, features_train_zeros])

In [21]:
features_train_balanced = features_train_balanced.join(target_train)
features_train_balanced = shuffle(features_train_balanced)

In [22]:
target_train_balanced = features_train_balanced['target']

In [23]:
features_train_balanced = features_train_balanced.drop('target', axis=1)

# EDA

## Баланс классов

### Общий баланс классов

In [24]:
df['target'].mean()

0.1340150699677072

### Баланс классов train

In [25]:
target_train.mean()

0.12971033068443988

In [26]:
target_train_balanced.mean()

0.5

### Баланс классов test

In [27]:
target_test.mean()

0.14405260011954574

## Поиск важных слов


In [28]:
df[df['target'] == 1].sum().sort_values(ascending=False)[:25]

special_symbols      15925
words_with_digits     2540
target                 747
free                   226
txt                    166
ur                     144
mobile                 128
stop                   125
text                   125
claim                  113
reply                  104
www                     98
prize                   93
just                    79
cash                    76
won                     76
uk                      74
send                    71
new                     69
nokia                   67
win                     64
urgent                  63
week                    60
tone                    60
service                 56
dtype: int64

In [29]:
df[df['target'] == 0].sum().sort_values(ascending=False)[:25]

special_symbols      18803
words_with_digits      356
gt                     318
lt                     316
just                   297
ok                     288
ll                     266
ur                     247
know                   237
good                   235
got                    234
like                   234
come                   230
day                    217
love                   205
time                   202
going                  169
home                   165
want                   165
lor                    162
don                    159
need                   159
sorry                  157
da                     150
today                  139
dtype: int64

**Вывод:** Можно утверждать, что в spam-сообщениях часто употребляются такие слова как free и txt, а также есть много слов с цифрами

## Модели

In [30]:
cnb = ComplementNB()
cnb_bal = ComplementNB()
lgr = LogisticRegression(max_iter=2000)
cbc = CatBoostClassifier(verbose=0)

In [31]:
scoring = ['f1']
def make_cv2(model_name, model_, cv, scoring,features_train, target_train):
    scores = cross_validate(model_, features_train, target_train, cv=cv, scoring=scoring)
    print(model_name)
    print(f'mean=',scores['test_f1'].mean())
    print(scores['test_f1'])
    print(f'std=',scores['test_f1'].std())

# Вот тут важно

In [32]:
sss = StratifiedShuffleSplit(n_splits=5, test_size=.3)
sss.get_n_splits(features, target)


5

In [33]:
make_cv2('NB not balanced', cnb, 5, scoring, features_train, target_train)
make_cv2('NB balanced', cnb_bal, sss, scoring, features, target)
make_cv2('log_reg_balanced', lgr, sss, scoring, features, target)
make_cv2('catboost', cbc, sss, scoring, features, target)


NB not balanced
mean= 0.8479433295125964
[0.85185185 0.83333333 0.92018779 0.80733945 0.82700422]
std= 0.038820387690466834
NB balanced
mean= 0.8663738691242473
[0.86462882 0.86147186 0.8908686  0.86086957 0.8540305 ]
std= 0.012725742854436383
log_reg_balanced
mean= 0.9397164142698455
[0.9382151  0.91588785 0.95172414 0.94252874 0.95022624]
std= 0.01290430544935203
catboost
mean= 0.91649664443487
[0.93849658 0.92271663 0.91627907 0.89461358 0.91037736]
std= 0.01441559478556849


In [34]:
cnb.fit(features_train,target_train)
cnb_bal.fit(features_train_balanced, target_train_balanced)
lgr.fit(features_train_balanced, target_train_balanced)
cbc.fit(features_train_balanced, target_train_balanced, verbose=0)

<catboost.core.CatBoostClassifier at 0x10eab5520>

In [35]:
def print_model_rep(model_kind, model, features_train, target_train, features_test, target_test):
    print(model_kind)
    print('train')
    prediction = model.predict(features_train)
    print(classification_report(target_train, prediction))
    print('test')
    prediction = model.predict(features_test)
    print(classification_report(target_test, prediction))



In [36]:
print_model_rep('NB not balanced', cnb, features_train, target_train, features_test, target_test)

NB not balanced
train
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      3395
           1       0.84      0.93      0.88       506

    accuracy                           0.97      3901
   macro avg       0.91      0.95      0.93      3901
weighted avg       0.97      0.97      0.97      3901

test
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1432
           1       0.93      0.86      0.89       241

    accuracy                           0.97      1673
   macro avg       0.95      0.93      0.94      1673
weighted avg       0.97      0.97      0.97      1673



In [37]:
print_model_rep('NB balanced', cnb_bal, features_train_balanced, target_train_balanced, features_test, target_test)

NB balanced
train
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       506
           1       0.98      0.94      0.96       506

    accuracy                           0.96      1012
   macro avg       0.96      0.96      0.96      1012
weighted avg       0.96      0.96      0.96      1012

test
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1432
           1       0.92      0.86      0.89       241

    accuracy                           0.97      1673
   macro avg       0.95      0.92      0.94      1673
weighted avg       0.97      0.97      0.97      1673



In [38]:
print_model_rep('log_reg_balanced', lgr, features_train_balanced, target_train_balanced, features_test, target_test)


log_reg_balanced
train
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       506
           1       1.00      0.99      1.00       506

    accuracy                           1.00      1012
   macro avg       1.00      1.00      1.00      1012
weighted avg       1.00      1.00      1.00      1012

test
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1432
           1       0.95      0.90      0.93       241

    accuracy                           0.98      1673
   macro avg       0.97      0.95      0.96      1673
weighted avg       0.98      0.98      0.98      1673



In [39]:
print_model_rep('catboost', cbc, features_train_balanced, target_train_balanced, features_test, target_test)


catboost
train
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       506
           1       1.00      1.00      1.00       506

    accuracy                           1.00      1012
   macro avg       1.00      1.00      1.00      1012
weighted avg       1.00      1.00      1.00      1012

test
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1432
           1       0.88      0.93      0.91       241

    accuracy                           0.97      1673
   macro avg       0.93      0.95      0.94      1673
weighted avg       0.97      0.97      0.97      1673



## Кросс-валидация

In [40]:
kf = KFold(n_splits=5)
kf.get_n_splits()

5

In [41]:
def get_cv_values(model_kind, model, features, target, kf ):
    num_fold = 0
    for train_index, test_index in kf.split(features):
        num_fold += 1
        print(f'Fold number {num_fold}')
        print_model_rep(model_kind, model, features.loc[train_index], target[train_index], features.loc[test_index], target[test_index])

In [42]:
get_cv_values('NB not balanced', cnb, features, target, kf)

Fold number 1
NB not balanced
train
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3880
           1       0.89      0.88      0.88       579

    accuracy                           0.97      4459
   macro avg       0.93      0.93      0.93      4459
weighted avg       0.97      0.97      0.97      4459

test
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       947
           1       0.94      0.90      0.92       168

    accuracy                           0.98      1115
   macro avg       0.96      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Fold number 2
NB not balanced
train
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3853
           1       0.91      0.88      0.89       606

    accuracy                           0.97      4459
   macro avg       0.94      0.93      0.94      44

In [43]:
get_cv_values('NB balanced', cnb_bal, features, target, kf)

Fold number 1
NB balanced
train
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3880
           1       0.86      0.88      0.87       579

    accuracy                           0.97      4459
   macro avg       0.92      0.93      0.93      4459
weighted avg       0.97      0.97      0.97      4459

test
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       947
           1       0.94      0.89      0.92       168

    accuracy                           0.98      1115
   macro avg       0.96      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Fold number 2
NB balanced
train
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3853
           1       0.89      0.88      0.88       606

    accuracy                           0.97      4459
   macro avg       0.93      0.93      0.93      4459
weigh

In [44]:
get_cv_values('log_reg_balanced', lgr, features, target, kf)



Fold number 1
log_reg_balanced
train
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3880
           1       0.95      0.93      0.94       579

    accuracy                           0.98      4459
   macro avg       0.97      0.96      0.97      4459
weighted avg       0.98      0.98      0.98      4459

test
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       947
           1       0.96      0.93      0.95       168

    accuracy                           0.98      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115

Fold number 2
log_reg_balanced
train
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3853
           1       0.96      0.93      0.94       606

    accuracy                           0.98      4459
   macro avg       0.97      0.96      0.97      

In [45]:
get_cv_values('catboost', cbc, features, target, kf)

Fold number 1
catboost
train
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3880
           1       0.87      0.95      0.91       579

    accuracy                           0.98      4459
   macro avg       0.93      0.97      0.95      4459
weighted avg       0.98      0.98      0.98      4459

test
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       947
           1       0.92      0.95      0.94       168

    accuracy                           0.98      1115
   macro avg       0.96      0.97      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Fold number 2
catboost
train
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3853
           1       0.89      0.95      0.92       606

    accuracy                           0.98      4459
   macro avg       0.94      0.97      0.95      4459
weighted av

## Гипотеза о равенстве средних

In [46]:
samples = []
for _ in range(50):
    samples.append(features.sample(n=100,replace=True))

In [47]:
def get_f1(model, samples):
    f1_lst = []
    for elem in samples:
        elem_features = elem
        elem_target = target[elem_features.index]
        prediction = model.predict(elem_features)
        f1_lst.append(f1_score(elem_target, prediction))
    return f1_lst


In [95]:
def conf_int(f1_lst):
    f1_lst = pd.Series(f1_lst)
    print(f1_lst.mean())
    print(f1_lst.median())
    mean_f1 = f1_lst.mean()
    t_low = f1_lst.quantile(.025)
    t_high = f1_lst.quantile(.975)
    sem_mean = st.sem(f1_lst)
    #print(t_low)
    return mean_f1 + t_low * sem_mean, mean_f1 + t_high * sem_mean
    

In [97]:
def conf_int2(f1_lst):
    lst_mean = np.mean(f1_lst)
    print(lst_mean)
    print(np.median(f1_lst))
    lst_sem = st.sem(f1_lst)
    print(lst_sem)
    return st.t.interval(.95, len(f1_lst)-1, lst_mean, lst_sem)

# Важное2

## Доверительный интервал

In [68]:
get_f1(cnb, samples) 

[0.9166666666666666,
 0.9285714285714286,
 1.0,
 0.9333333333333333,
 0.9523809523809523,
 0.8695652173913043,
 0.9333333333333333,
 0.9302325581395349,
 1.0,
 0.8666666666666667,
 0.9411764705882353,
 0.9600000000000001,
 0.962962962962963,
 0.9714285714285714,
 0.9333333333333333,
 0.9523809523809523,
 0.7826086956521738,
 0.962962962962963,
 0.9473684210526316,
 0.64,
 0.9230769230769231,
 1.0,
 0.7272727272727273,
 0.896551724137931,
 0.888888888888889,
 0.8799999999999999,
 0.896551724137931,
 0.875,
 0.8947368421052632,
 0.9032258064516129,
 0.9142857142857143,
 1.0,
 0.9166666666666666,
 0.8,
 0.9285714285714286,
 0.9473684210526316,
 0.9655172413793104,
 0.8125,
 0.896551724137931,
 0.6666666666666666,
 0.923076923076923,
 0.8571428571428572,
 0.7692307692307693,
 1.0,
 0.7,
 0.846153846153846,
 0.888888888888889,
 0.9285714285714286,
 0.9411764705882353,
 0.9090909090909091]

In [98]:
conf_int(get_f1(cnb_bal, samples))

0.8963061676620419
0.9116883116883117


(0.9034745671806974, 0.906512795021073)

In [99]:
conf_int2(get_f1(cnb_bal, samples))

0.8963061676620419
0.9116883116883117
0.01020662735903114


(0.8757951820936729, 0.9168171532304108)

In [100]:
conf_int(get_f1(lgr, samples))

0.9445215415075287
0.9565217391304348


(0.9511194269057568, 0.9524390039854024)

In [101]:
conf_int2(get_f1(lgr, samples))

0.9445215415075287
0.9565217391304348
0.007917462477873758


(0.928610804991996, 0.9604322780230613)

In [102]:
conf_int(get_f1(cbc, samples))

0.9186219150207688
0.9333333333333333


(0.9254966455441953, 0.9277356431843934)

In [103]:
conf_int2(get_f1(cbc, samples))

0.9186219150207688
0.9333333333333333
0.009113728163624575


(0.900307192609282, 0.9369366374322556)

In [54]:
print(ttest_ind(get_f1(cnb, samples),get_f1(cnb_bal, samples),equal_var=False))

Ttest_indResult(statistic=0.08515458177944575, pvalue=0.9323158233921937)


In [55]:
print(ttest_ind(get_f1(cnb_bal, samples),get_f1(lgr, samples),equal_var=False))

Ttest_indResult(statistic=-3.7325662642246424, pvalue=0.00032746817200663965)


In [56]:
print(ttest_ind(get_f1(cnb_bal, samples),get_f1(cbc, samples),equal_var=False))

Ttest_indResult(statistic=-1.630863414432309, pvalue=0.10616972335465517)


**Вывод** нам удалось сделать модель, которая в некоторой степени умеет классифицировать сообщения на SPAM и HAM