# Загрузка данных + импорты

## Импорты

In [12]:
#импорты
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score


In [13]:
FILESOURCE = 'SMSSpamCollection.txt' # файл источник


## Чтение файла

In [14]:
# читаем файл
with open(FILESOURCE) as f: 
    full_file = f.read()
print(full_file[:20])

ham	Go until jurong 


**Вывод:** данные удалось прочитать, уже неплохо

# Предобработка

In [15]:
# нижний регистр и разделение на строки.
full_file = full_file.lower()
lst = full_file.splitlines()

In [16]:
# Отделение таргета
lst_with_target = []
for line in lst:
    lst_tmp = []
    lst_line = line.split('\t')  
    if lst_line[0] == 'spam' or lst_line[0] == 'ham':
        lst_tmp.append('ok')
        lst_tmp.append(lst_line[0])
        lst_tmp.append(lst_line[1:])
        lst_with_target.append(lst_tmp)
    else:
        lst_tmp.append('not_ok')
        lst_tmp.append(lst_line[0])
        lst_tmp.append(lst_line[1:])
        lst_with_target.append(lst_tmp)

In [17]:
# проверка на структуру файла
for i in range(len(lst_with_target)):
    if lst_with_target[i][0] != 'ok':
        print(lst_with_target[i], 'is not ok')
    if len(lst_with_target[i]) > 3:
        print(lst_with_target[i], '>3')

**Вывод:** структура файла соблюдается

In [18]:
# Составим множество всех слов в тексте кроме служебных и разобьем строки на отдельные слова
stopwords = set(stopwords.words('english'))
total_words = set()
for i in range(len(lst_with_target)):
    lst_with_target[i].append(word_tokenize(lst_with_target[i][2][0]))
    total_words.update(set(word_tokenize(lst_with_target[i][2][0])).difference(stopwords))

In [53]:
# Превратим  наше множество в list
total_words_lst = list(total_words)
print(len(total_words_lst))

9311


In [41]:
# Заполнение таргета
total_dct = {'target': []}
for i in range(len(lst_with_target)):
    if lst_with_target[i][1] == 'spam':
        total_dct['target'].append(1)
    else:
        total_dct['target'].append(0)


In [42]:
# Заполнение признаков для DF
for word in total_words_lst:    
    total_dct[word] = []

    for i in range(len(lst_with_target)):
        cnt = 0
        cnt = lst_with_target[i][3].count(word)
        total_dct[word].append(cnt)


In [43]:
# Создаем df
df = pd.DataFrame(total_dct)
df.head()

Unnamed: 0,target,you'ld,administrator,copy,joined.hope,wifes,scratches,tim,1-u,natalja,...,first-,sugababes,nuclear,obey,hahaha,accommodation,vote,09066368753,68866.,royal
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# Создаем транспонированый df
df_t = df.T

In [52]:
# разделение
features = df.drop('target',axis=1)
target = df['target']
features_train, features_test, target_train, target_test = train_test_split(features,target, test_size=.3, random_state=123456)
print(features_train.shape, features_test.shape, target_train.shape, target_test.shape )

(3901, 9311) (1673, 9311) (3901,) (1673,)


# EDA

## Баланс классов

### Общий баланс классов

In [48]:
df['target'].mean()

0.1340150699677072

### Баланс классов train

In [54]:
target_train.mean()

0.12971033068443988

### Баланс классов test

In [55]:
target_test.mean()

0.14405260011954574

## Поиск важных слов


In [61]:
df[df['target'] == 1].sum().sort_values(ascending=False)[:25]

.         924
target    747
!         545
,         378
call      346
free      219
&         178
?         175
2         174
:         163
txt       156
ur        144
u         142
mobile    123
text      121
4         120
stop      118
*         115
claim     113
reply     104
prize      92
get        84
's         72
new        69
send       69
dtype: int64

In [66]:
df[(df['call'] != 0) | (df['free'] != 0 )]['target'].mean()

0.61794500723589

**Вывод:** Можно утверждать, что в spam-сообщениях часто употребляются такие слова как call и free 

## Модель

In [67]:
cnb = ComplementNB()

In [68]:
cnb.fit(features_train,target_train)

ComplementNB()

In [82]:
prediction_train = cnb.predict(features_train)
accuracy_train = accuracy_score(target_train, prediction_train)
precision_train = precision_score(target_train, prediction_train)
recall_train = recall_score(target_train, prediction_train)
f1_train = f1_score(target_train, prediction_train)
print(f'accuracy_train = {accuracy_train} \n\
precision_train = {precision_train} \n\
recall_train = {recall_train}\n\
f1_train = {f1_train}')


accuracy_train = 0.9833376057421174 
precision_train = 0.903107861060329 
recall_train = 0.9762845849802372
f1_train = 0.9382716049382717


In [84]:
prediction_test = cnb.predict(features_test)
accuracy_test = accuracy_score(target_test, prediction_test)
precision_test = precision_score(target_test, prediction_test)
recall_test = recall_score(target_test, prediction_test)
f1_test = f1_score(target_test, prediction_test)
print(f'accuracy_test = {accuracy_test} \n\
precision_test = {precision_test} \n\
recall_test = {recall_test}\n\
f1_test = {f1_test}')

accuracy_test = 0.9521817095038853 
precision_test = 0.7747440273037542 
recall_test = 0.941908713692946
f1_test = 0.850187265917603


**Вывод** нам удалось сделать модель, которая в некоторой степени умеет классифицировать сообщения на SPAM и HAM