# Анализ текстов. Первое задание: «Спам-фильтр»

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns (class label, message body) are named explicitly.
# NOTE(review): the parser's default quote handling is left as-is; if the row
# count ever looks lower than expected, check messages that contain '"'.
ds = pd.read_table(
    "SMSSpamCollection.txt",
    sep="\t",
    header=None,
    names=["spam_or_ham", "text"],
)

In [3]:
# Preview the first rows to confirm that the label and text columns were parsed.
ds.head()

Unnamed: 0,spam_or_ham,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Convert every message to a text (unicode) string. Byte strings (what the
# original Python 2 code received) are decoded as UTF-8; values that are
# already text, as on Python 3 where `str` has no `.decode`, pass through.
text = [msg.decode("utf8") if isinstance(msg, bytes) else msg for msg in ds.text]
# Binary target: 0 for "ham", 1 for every other label (i.e. "spam").
labels = [0 if label == "ham" else 1 for label in ds.spam_or_ham]

In [5]:
# Sanity check: the first ten labels and the first three messages.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(labels[:10])
print(text[:3])

[0, 0, 1, 0, 0, 1, 0, 0, 1, 1]
[u'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', u'Ok lar... Joking wif u oni...', u"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]


In [6]:
# Build the feature matrix
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer with default settings; the fitted object is kept because it is
# reused further down to transform hand-written probe messages.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text)

In [7]:
# Classification with logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

model = LogisticRegression()
# F1 on each of 10 cross-validation folds, then the mean.
# NOTE(review): X was produced by a vectorizer fit on the whole corpus, so the
# vocabulary is learned before the folds are split -- confirm this is acceptable.
res = cross_val_score(model, X, labels, scoring="f1", cv=10)
print(np.mean(res))

0.932640298361


Результат (средняя F1-мера по 10 фолдам кросс-валидации):
0.932640298361

In [8]:
# Refit on the whole corpus so the model can label the probe messages below.
model.fit(X, labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
# Hand-written probe messages used to eyeball the classifier's behaviour.
test = [
"FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
"FreeMsg: Txt: claim your reward of 3 hours talk time",
"Have you visited the last lecture on physics?",
"Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
"Only 99$"
]
# Transform only (no refit): the vocabulary learned on the training corpus is reused.
test_features = vectorizer.transform(test)

# One predicted class per message; labels were encoded as 0 = ham, 1 = spam.
test_labels = model.predict(test_features)

In [10]:
# Predicted classes for the probe messages (0 = ham, 1 = spam).
print(test_labels)

[1 1 0 0 0]


Результаты (1 — спам, 0 — не спам):
1 1 0 0 0

In [11]:
def n_gramm_testing_score(vectorizer, model, text, labels, scoring="f1", cv=10, ndigits=2):
    """Vectorize ``text`` and return the rounded mean cross-validation score.

    Parameters
    ----------
    vectorizer : object with ``fit_transform``
        Fitted on ``text`` as a side effect of this call.
    model : estimator
        Passed unchanged to ``cross_val_score``.
    text : sequence of str
        Raw documents.
    labels : sequence
        Target value for each document.
    scoring : str, optional
        Metric name forwarded to ``cross_val_score`` (default ``"f1"``).
    cv : int or splitter, optional
        Cross-validation setting forwarded to ``cross_val_score`` (default 10).
    ndigits : int, optional
        Number of decimals kept in the returned mean (default 2).

    Returns
    -------
    float
        Mean of the per-fold scores, rounded to ``ndigits`` decimals.
    """
    # NOTE(review): the vectorizer is fit on the full corpus before the folds
    # are split, so vocabulary/IDF statistics also see the held-out documents.
    features = vectorizer.fit_transform(text)
    fold_scores = cross_val_score(model, features, labels, scoring=scoring, cv=cv)
    return round(np.mean(fold_scores), ndigits)

In [12]:
# Add n-grams to the feature set
model = LogisticRegression()

# Run the same experiment for bigrams only, trigrams only and 1- to 3-grams,
# printing each score in that order.
ngram_scores = []
for ngram_range in [(2, 2), (3, 3), (1, 3)]:
    n_gram_vectorizer = CountVectorizer(ngram_range=ngram_range)
    score = n_gramm_testing_score(n_gram_vectorizer, model, text, labels)
    print(score)
    ngram_scores.append(score)

# Keep the original per-configuration names available to any other cell.
ngram_2_2_score, ngram_3_3_score, ngram_1_3_score = ngram_scores

0.82
0.73
0.93


Результаты кросс-валидации (F1) при использовании соответственно только биграмм, только триграмм и 1–3-грамм:

0.82, 0.73, 0.93

In [13]:
# Try the same n-gram features with naive Bayes
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

# Bigrams only, trigrams only and 1- to 3-grams, printed in that order.
ngram_scores = []
for ngram_range in [(2, 2), (3, 3), (1, 3)]:
    n_gram_vectorizer = CountVectorizer(ngram_range=ngram_range)
    score = n_gramm_testing_score(n_gram_vectorizer, model, text, labels)
    print(score)
    ngram_scores.append(score)

# Keep the original per-configuration names available to any other cell.
ngram_2_2_score, ngram_3_3_score, ngram_1_3_score = ngram_scores

0.65
0.38
0.89


Наивный Байес теряет в качестве на биграммах и особенно на триграммах сильнее, чем логистическая регрессия:

0.65, 0.38, 0.89

In [14]:
# Now try TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
model = LogisticRegression()
# Default settings, i.e. unigrams
tfidf = TfidfVectorizer()
tfidf_score = n_gramm_testing_score(tfidf, model, text, labels)
print(tfidf_score)

0.85


Результат: 0.93 CountVectorizer vs 0.85 TfidfVectorizer

# Получение более высокого качества

In [6]:
# Remove stop words
from nltk.corpus import stopwords
stop_words = stopwords.words("english")
# Set copy for constant-time membership tests; `stop_words` itself stays a
# list because other cells read it.
_stop_word_set = set(stop_words)
# Messages are split on single spaces only, so punctuation stays attached to
# the words and the comparison is case-sensitive.
# NOTE(review): capitalised stop words (e.g. "The") are therefore kept --
# confirm whether that is intended.
text_without_stopwords = [
    ' '.join(word for word in sample.split(' ') if word not in _stop_word_set)
    for sample in text
]

In [8]:
# Compare the first three messages before and after stop-word removal.
print(text[:3])
print("")  # blank separator line; `print()` would print "()" on Python 2
print(text_without_stopwords[:3])

[u'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', u'Ok lar... Joking wif u oni...', u"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

[u'Go jurong point, crazy.. Available bugis n great world la e buffet... Cine got amore wat...', u'Ok lar... Joking wif u oni...', u"Free entry 2 wkly comp win FA Cup final tkts 21st May 2005. Text FA 87121 receive entry question(std txt rate)T&C's apply 08452810075over18's"]


Также с помощью collections.Counter я решил посмотреть на самые часто встречающиеся слова в текстах:
вдруг там много мусора, который можно убрать.
[(u'I', 1469),
 (u'U', 998),
 (u'', 628),
 (u'CALL', 559),
 (u'2', 457),
 (u'UR', 385),
 (u"I'M", 377),
 (u'GET', 375),
 (u'YOU', 295),
 (u'&LT;#&GT;', 276),
 (u'GO', 265),
 (u'4', 255),
 (u'.', 241),
 (u'LIKE', 236),
 (u'GOT', 235)]
В топ-15 оказался мусорный токен '&LT;#&GT;' (HTML-экранированное «<#>» в верхнем регистре).
Улучшения качества не последовало.


In [9]:
# Lemmatization
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
text_stopwords_lemmatized = []
for sample in text_without_stopwords:
    lemmas = []
    for token, tag in pos_tag(word_tokenize(sample)):
        # The lower-cased first letter of the POS tag is used as the
        # lemmatizer's part-of-speech hint when it is one of a / n / v.
        # NOTE(review): if pos_tag returns Penn Treebank tags, adjectives start
        # with "J", so the 'a' case would never match -- confirm the tagset.
        pos_hint = tag[0].lower()
        if pos_hint in ['a', 'n', 'v']:
            lemmas.append(wnl.lemmatize(token, pos_hint))
        else:
            # Any other tag: fall back to the lemmatizer's default part of speech.
            lemmas.append(wnl.lemmatize(token))
    text_stopwords_lemmatized.append(' '.join(lemmas))

In [10]:
# First three messages after stop-word removal and lemmatization.
print(text_stopwords_lemmatized[:3])

[u'Go jurong point , crazy.. Available bugis n great world la e buffet ... Cine get amore wat ...', u'Ok lar ... Joking wif u oni ...', u"Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 . Text FA 87121 receive entry question ( std txt rate ) T & C 's apply 08452810075over18 's"]


In [25]:
import nltk
# Download by name only the NLTK resources this notebook relies on, instead of
# opening the interactive downloader (which blocks "Run All" and cannot work
# on a headless machine). This cell must run before the cells that use
# stopwords, word_tokenize, pos_tag or WordNetLemmatizer.
# NOTE(review): resource names may differ between NLTK releases -- verify
# against the installed version.
for resource in ("stopwords", "punkt", "averaged_perceptron_tagger", "wordnet"):
    nltk.download(resource)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

Так как спам напрямую связан с электронной почтой и перепиской, возможно, логично использовать TweetTokenizer (он рассчитан на короткие неформальные сообщения).


In [44]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
twt = TweetTokenizer()
# Unigrams and bigrams over the preprocessed text, tokenized by TweetTokenizer.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,2), tokenizer=twt.tokenize)
X = vectorizer.fit_transform(text_stopwords_lemmatized)

# Shrink the feature space: an L1-penalised linear SVM is fit on all samples
# and SelectFromModel keeps the features that model retains.
# NOTE(review): the selector sees every label, so cross-validation scores
# computed on X_new afterwards are not based on fully held-out data.
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
print(X.shape)
lsvc = LinearSVC(C=1, penalty="l1", dual=False).fit(X, labels)
mSelect = SelectFromModel(lsvc, prefit=True)
X_new = mSelect.transform(X)
print(X_new.shape)

(5572, 48455)
(5572, 575)


In [45]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Best result found experimentally, reported for the accuracy and F1 metrics.
# The same grid of regularisation strengths C is evaluated once per metric;
# headings and values are printed in the original order.
# NOTE(review): X_new comes from a selector fit on all labels, so these
# scores are likely optimistic.
for heading, metric in (("Accuracy:", "accuracy"), ("F1 score", "f1")):
    print(heading)
    for c in [0.01, 0.1, 1, 10, 20, 50, 100]:
        res = cross_val_score(LogisticRegression(C=c, penalty='l2'), X_new, labels, scoring=metric, cv=10)
        print(np.mean(res))

Accuracy:
0.954057756864
0.982051789506
0.987794612865
0.99138657453
0.991744997469
0.99192549823
0.992105031444
F1 score
0.793590595902
0.928685860918
0.952552331298
0.966899141294
0.968291260849
0.969028533459
0.96972744316


In [46]:
# Which predictions do we get now?
test = [
"FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
"FreeMsg: Txt: claim your reward of 3 hours talk time",
"Have you visited the last lecture on physics?",
"Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
"Only 99$"
]

# Remove stop words (same single-space split as used for the training corpus)
test_sw = [
    ' '.join(word for word in sample.split(' ') if word not in stop_words)
    for sample in test
]

# Lemmatization (same tag handling as used for the training corpus)
test_sw_lemma = []
for sample in test_sw:
    lemmas = []
    for token, tag in pos_tag(word_tokenize(sample)):
        # Lower-cased first letter of the POS tag doubles as the lemmatizer hint.
        pos_hint = tag[0].lower()
        if pos_hint in ['a', 'n', 'v']:
            lemmas.append(wnl.lemmatize(token, pos_hint))
        else:
            lemmas.append(wnl.lemmatize(token))
    test_sw_lemma.append(' '.join(lemmas))

# Shrink the feature space.
# Both the vectorizer and the selector were fit on the training corpus; here
# they are only applied (transform), never refit.
test_X = vectorizer.transform(test_sw_lemma)
print(test_X.shape)
selected_test_X = mSelect.transform(test_X)
print(selected_test_X.shape)

(5, 48455)
(5, 575)


In [47]:
# Final model: logistic regression with C=50 (one of the values in the grid
# evaluated above), trained on the reduced feature matrix and applied to the
# probe messages projected onto the same reduced features.
logreg = LogisticRegression(C=50)
logreg.fit(X_new, labels)
predicted_test = logreg.predict(selected_test_X)

In [48]:
print(predicted_test)
# Nothing changed: the predictions match those of the baseline model

[1 1 0 0 0]


# Выводы:   

1. Качество может сильно меняться при смене корпуса. Об этом нужно помнить при переходе к tf-idf.
2. Статистики по биграммам и триграммам меньше, чем по униграммам, поэтому классификатор может работать хуже. Но за счёт регуляризации линейный классификатор не склонен сильно переобучаться.
3. Наивный Байес страдает от нехватки статистики на биграммах и триграммах сильнее, чем линейный классификатор.