### Загружаем данные

In [1]:
# Load the SMS corpus. Each non-empty line has the form "<label>\t<message>".
# `texts` collects the message bodies; `answers` collects the matching binary
# targets (0 for the "ham" label, 1 for any other label).
texts = []
answers = []

# The encoding is stated explicitly so that decoding does not depend on the
# platform's default locale.
with open("SMSSpamCollection", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
        line = line.rstrip("\n")

        # Skip blank lines (e.g. a trailing empty line at the end of the file).
        if not line:
            continue

        label, separator, message = line.partition("\t")

        # Validate before appending anything, so `texts` and `answers` can
        # never end up with different lengths.
        if not separator:
            raise ValueError(
                "SMSSpamCollection line {}: expected '<label>\\t<message>', got {!r}".format(
                    line_number, line
                )
            )

        answers.append(0 if label == "ham" else 1)
        texts.append(message)

In [2]:
from sklearn import feature_extraction

try:
    # scikit-learn >= 0.18
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Older releases only ship the (since removed) cross_validation module.
    from sklearn.cross_validation import cross_val_score

# Hand-written probe messages. DoClassification prints the labels each fitted
# model predicts for them.
test_messages = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use fromyour phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
]

def DoClassification(vectorizer, classifier, y=None):
    """Cross-validate a text classifier on the corpus and label the probe messages.

    Reads the module-level ``texts`` and ``test_messages`` and prints two
    lines: the mean F1 score over 10 cross-validation folds, and the labels
    predicted for ``test_messages`` after fitting on the whole corpus.

    The vectorizer and classifier are combined into a single pipeline, so
    during cross-validation the vectorizer is fitted on the training part of
    each fold only. Fitting it on the full corpus up front would let
    vocabulary and IDF statistics from the validation folds leak into
    training.

    Parameters
    ----------
    vectorizer : text feature extractor
        Object with ``fit``/``transform`` that accepts raw strings,
        e.g. ``CountVectorizer`` or ``TfidfVectorizer``.
    classifier : classifier
        Estimator with ``fit``/``predict``.
    y : sequence of int, optional
        Targets aligned with ``texts``. When omitted, the module-level
        ``answers`` is looked up at call time, so re-running the loading cell
        is picked up without re-defining this function.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Local import keeps this function usable without editing the import cell.
    from sklearn.pipeline import make_pipeline

    if y is None:
        y = answers

    model = make_pipeline(vectorizer, classifier)

    # Raw texts go in: the pipeline vectorizes inside every fold.
    scores = cross_val_score(model, texts, y, cv=10, scoring="f1")
    print("Cross_val_score result: ", "{:.2f}".format(scores.mean()))

    # Final fit on the full corpus before labelling the probe messages.
    model.fit(texts, y)
    print("Predicted vals: ", model.predict(test_messages))
    



### Логистическая регрессия

In [3]:
from sklearn.linear_model import LogisticRegression

# Baseline: default CountVectorizer features fed into logistic regression.
DoClassification(
    vectorizer=feature_extraction.text.CountVectorizer(),
    classifier=LogisticRegression(),
)

Cross_val_score result:  0.93
Predicted vals:  [1 1 0 0 0]


In [4]:
# (min_n, max_n) n-gram windows to compare.
ngram_ranges = [
    (2, 2),
    (3, 3),
    (1, 3),
]

for ngram_range in ngram_ranges:
    DoClassification(
        vectorizer=feature_extraction.text.CountVectorizer(ngram_range=ngram_range),
        classifier=LogisticRegression(),
    )

Cross_val_score result:  0.82
Predicted vals:  [1 0 0 0 0]
Cross_val_score result:  0.73
Predicted vals:  [0 0 0 0 0]
Cross_val_score result:  0.93
Predicted vals:  [1 0 0 0 0]


### Наивный Байес

In [5]:
from sklearn.naive_bayes import MultinomialNB

# Same default CountVectorizer features, now with multinomial naive Bayes.
DoClassification(
    vectorizer=feature_extraction.text.CountVectorizer(),
    classifier=MultinomialNB(),
)

Cross_val_score result:  0.93
Predicted vals:  [1 1 0 0 1]


In [6]:
# Repeat the n-gram comparison with multinomial naive Bayes.
# `ngram_ranges` is the list defined in the logistic-regression section.
for ngram_range in ngram_ranges:
    DoClassification(
        vectorizer=feature_extraction.text.CountVectorizer(ngram_range=ngram_range),
        classifier=MultinomialNB(),
    )

Cross_val_score result:  0.65
Predicted vals:  [1 1 0 0 0]
Cross_val_score result:  0.38
Predicted vals:  [1 1 0 0 0]
Cross_val_score result:  0.89
Predicted vals:  [1 1 0 0 0]


### TfIdf

In [7]:
# Swap raw counts for TF-IDF weights, keeping logistic regression as the model.
DoClassification(
    vectorizer=feature_extraction.text.TfidfVectorizer(),
    classifier=LogisticRegression(),
)

Cross_val_score result:  0.85
Predicted vals:  [1 1 0 0 0]


### Выводы

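Наши эксперименты в целом согласуются с ожиданием, что логистическая регрессия решает задачу классификации «спам / не спам» не хуже наивного байесовского классификатора: на униграммах качество одинаковое (F1 = 0.93), а на биграммах, триграммах и диапазоне (1, 3) логистическая регрессия заметно лучше (0.82 / 0.73 / 0.93 против 0.65 / 0.38 / 0.89).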


Использование взвешивания TF-IDF вместо простых счётчиков слов ухудшило качество классификации на данном датасете (F1 = 0.85 против 0.93 для логистической регрессии). Одной из возможных причин является то, что спам легко отличить по наиболее часто встречающимся словам, а TF-IDF снижает вес именно таких слов.