# Домашнее задание 1

## Павел Яковенко, гр. 295

In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import f1_score



Загрузим датасет.

In [2]:
DATASET_PATH = "smsspamcollection/SMSSpamCollection"

In [3]:
def load_dataset():
    X = []
    y = []
    with open(DATASET_PATH, 'r') as f:
        for line in f.readlines():
            label, text = line.strip().split('\t')
            y.append(1 if label == "spam" else 0)
            X.append(text)
    return X, y

In [4]:
def f1_score_custom(clf, X, y):
    predicted = clf.predict(X)
    return f1_score(y, predicted)

In [5]:
X, y = load_dataset()

Извлекаем признаки из текстов и обучаем простую логистическую регрессию.

In [6]:
vect = CountVectorizer()
X_features = vect.fit_transform(X)

In [15]:
clf = LogisticRegression()
scores = cross_val_score(clf, X_features, y, cv=10, scoring=f1_score_custom, n_jobs=-1)
print "score:", scores.mean()

score: 0.933348526858


In [20]:
clf = LogisticRegression()
clf.fit(X_features, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
test_sms = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$"
]

In [24]:
test_features = vect.transform(test_sms)
predictions_test = clf.predict(test_features)
print "Predictions on test:", ' '.join(map(str, predictions_test))

 Predictions on test: 1 1 0 0 0


Качество мы получили в 0.93 f1-меры. 

Теперь проведем сравнение различных параметров для CountVectorized.

In [None]:
def test_classifier(clf, X, y):
    return cross_val_score(clf, X, y, cv=10, scoring=f1_score_custom, n_jobs=-1).mean()

In [36]:
def ngram_size_test(X, y, clf_class, vect_class):
    vect = vect_class(ngram_range=(2,2))
    X_features = vect.fit_transform(X)
    print "{:.2%}".format(test_classifier(clf_class(), X_features, y))
    
    vect = vect_class(ngram_range=(3,3))
    X_features = vect.fit_transform(X)
    print "{:.2%}".format(test_classifier(clf_class(), X_features, y))
    
    vect = vect_class(ngram_range=(1,3))
    X_features = vect.fit_transform(X)
    print "{:.2%}".format(test_classifier(clf_class(), X_features, y))

In [38]:
ngram_size_test(X, y, LogisticRegression, CountVectorizer)

82.24%
72.50%
92.51%


In [39]:
from sklearn.naive_bayes import MultinomialNB
ngram_size_test(X, y, MultinomialNB, CountVectorizer)

64.55%
37.86%
88.79%


Как можно увидеть, наивный Байес действительно страдает от нехватки статистики по биграммам и триграммам.

Далее изменим метод получения признаков.

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
X_features = vect.fit_transform(X)
print "{:.2%}".format(test_classifier(LogisticRegression(), X_features, y))

85.29%


In [40]:
print "LogisticRegression:"
ngram_size_test(X, y, LogisticRegression, TfidfVectorizer)
print "\nMultinomialNB:"
ngram_size_test(X, y, MultinomialNB, TfidfVectorizer)

LogisticRegression:
34.46%
16.56%
64.84%

MultinomialNB:
74.76%
63.34%
72.47%


Как можно увидеть, данный метод оказался хуже, чем CountVectorizer.

Попытаемся подобрабрать метапараметры классификаторов. 

In [74]:
from sklearn.model_selection import GridSearchCV
def find_best_params(X, y, clf_class, vect_class, param_grid):
    vect = vect_class()
    X_features = vect.fit_transform(X)
    clf = GridSearchCV(clf_class(), param_grid, n_jobs=-1, cv=10, scoring=f1_score_custom)
    clf.fit(X_features, y)
    return clf

In [50]:
param_grid = {
    'penalty': ['l2'],
    'C': np.linspace(10**-5, 10**5, num=20),
    'max_iter': map(int, np.linspace(100, 10000, num=20)),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'],
}
best_log_clf = find_best_params(X, y, LogisticRegression, CountVectorizer, param_grid)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

In [54]:
clf = LogisticRegression()
clf.set_params(**best_log_clf.best_params_)
clf.fit(X_features, y)

LogisticRegression(C=63157.894740526317, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [55]:
test_classifier(clf, X_features, y)

0.93749501734722906

In [83]:
param_grid = {
    'alpha': np.linspace(10**-10, 10**10, num=100),
}
best_bayes_clf = find_best_params(X, y, MultinomialNB, CountVectorizer, param_grid)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

In [84]:
clf = MultinomialNB()
clf.set_params(**best_bayes_clf.best_params_)
clf.fit(X_features, y)
test_classifier(clf, X_features, y)

0.92391675719683897

In [85]:
param_grid = {
    'alpha': np.linspace(10**-10, 10**10, num=100),
}
best_bayes_clf = find_best_params(X, y, MultinomialNB, TfidfVectorizer, param_grid)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

In [86]:
clf = MultinomialNB()
clf.set_params(**best_bayes_clf.best_params_)
clf.fit(X_features, y)
test_classifier(clf, X_features, y)

0.92391675719683897

In [88]:
param_grid = {
    'penalty': ['l2'],
    'C': np.linspace(10**-5, 10**5, num=10),
    'max_iter': map(int, np.linspace(100, 10000, num=10)),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'],
}
best_log_clf = find_best_params(X, y, LogisticRegression, TfidfVectorizer, param_grid)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

In [90]:
clf = LogisticRegression()
clf.set_params(**best_log_clf.best_params_)
clf.fit(X_features, y)
test_classifier(clf, X_features, y)

0.94035288854322796

In [92]:
best_log_clf.best_params_

{'C': 77777.777780000004, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'sag'}

Лучшие результаты получились при логистической регрессии c параметрами из предыдущей ячейки. Для получения признаков использовался метод TfidfVectorizer. 

Лучший результат по f1-мере: 0.94