In [95]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [77]:
# Load the raw corpus: one list entry per line of the file (line endings kept).
with open('SMSSpamCollection.txt') as corpus_file:
    content = corpus_file.readlines()
# Number of lines read, shown as the cell output.
len(content)

5574

In [78]:
def out(filename, s):
    """Write the string ``s`` to ``filename``, replacing any existing content.

    Parameters
    ----------
    filename : str
        Path of the file to create or overwrite (opened in ``'w'`` mode).
    s : str
        Text to write; it is written as-is, no trailing newline is added.

    Returns
    -------
    None
    """
    # The context manager closes the handle even if write() raises,
    # which the manual open/close sequence did not guarantee.
    with open(filename, 'w') as f:
        f.write(s)

In [79]:
# Build the target vector `y` (1 = spam, 0 = ham) and the message array `data`
# from the raw lines: the first whitespace-delimited token is the label,
# everything after the first run of whitespace is the message text.
n_messages = len(content)
y = np.zeros(n_messages)
data = np.empty(n_messages, dtype='object')
for idx, raw_line in enumerate(content):
    fields = re.split(r'\s+', raw_line, 1)
    label = fields[0].strip()
    if label not in ('spam', 'ham'):
        raise ValueError('Unknown label!')
    y[idx] = 1 if label == 'spam' else 0
    data[idx] = str(fields[1])

In [80]:
# Fit a CountVectorizer with default settings on all messages and keep the
# resulting feature matrix in X. The fitted `cv` is reused further down to
# transform the hand-written test messages with the same vocabulary.
cv = CountVectorizer()
X = cv.fit_transform(data)

In [81]:
# Logistic regression on the count features: mean F1 over 10 cross-validation
# folds, rounded to one decimal and saved to task5.txt.
clf = LogisticRegression(random_state=2)
fold_scores = cross_val_score(clf, X, y, scoring='f1', cv=10)
mean_cv_score = fold_scores.mean()
res = '%.1f' % mean_cv_score
out('task5.txt', res)
res

'0.9'

In [82]:
# Train the classifier on the full data set (cross_val_score above was given
# the estimator but this notebook does not rely on it being fitted afterwards);
# the fitted `clf` is used below to label the hand-written messages.
clf.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [83]:
data_test = ['FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB',
       'FreeMsg: Txt: claim your reward of 3 hours talk time',
       'Have you visited the last lecture on physics?',
       'Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$',
       'Only 99$']

In [84]:
# Vectorise the hand-written messages with the already-fitted vocabulary,
# predict their labels and save them space-separated to task6.txt.
X_test = cv.transform(data_test)
y_pred = clf.predict(X_test)
# Each predicted label is cast to int so it is written as "1"/"0".
res = ' '.join(str(int(label)) for label in y_pred)
out('task6.txt', res)
res

'1 1 0 0 0'

In [89]:
def test_ngram_range_cv(ngram_rng, clf):
    """Score ``clf`` on n-gram count features of the notebook-level ``data``.

    Parameters
    ----------
    ngram_rng : tuple
        Passed to ``CountVectorizer`` as ``ngram_range``.
    clf : estimator
        Classifier handed to ``cross_val_score``.

    Returns
    -------
    str
        Mean F1 over 10 cross-validation folds, formatted with two decimals.

    Notes
    -----
    Reads the module-level ``data`` (messages) and ``y`` (labels).
    """
    vectorizer = CountVectorizer(ngram_range=ngram_rng)
    features = vectorizer.fit_transform(data)
    fold_scores = cross_val_score(clf, features, y, scoring='f1', cv=10)
    return '%.2f' % fold_scores.mean()

In [90]:
# Logistic regression on count features for three n-gram settings:
# bigrams only, trigrams only, and unigrams through trigrams. The three
# scores are saved space-separated to task7.txt.
clf = LogisticRegression(random_state=2)
# The helper is named test_ngram_range_cv; no `test_ngram_range` is defined in
# this notebook, so that name raises NameError on a fresh kernel.
ranges = ' '.join([test_ngram_range_cv(rng, clf) for rng in [(2, 2), (3, 3), (1, 3)]])
out('task7.txt', ranges)
ranges

'0.82 0.73 0.93'

In [91]:
# Multinomial naive Bayes on count features for the same three n-gram
# settings (bigrams only, trigrams only, unigrams through trigrams). The
# three scores are saved space-separated to task8.txt.
clf = MultinomialNB()
# The helper is named test_ngram_range_cv; no `test_ngram_range` is defined in
# this notebook, so that name raises NameError on a fresh kernel.
ranges = ' '.join([test_ngram_range_cv(rng, clf) for rng in [(2, 2), (3, 3), (1, 3)]])
out('task8.txt', ranges)
ranges

'0.65 0.38 0.89'

In [93]:
def test_ngram_range_tfidf(ngram_rng, clf):
    """Score ``clf`` on TF-IDF n-gram features of the notebook-level ``data``.

    Parameters
    ----------
    ngram_rng : tuple
        Passed to ``TfidfVectorizer`` as ``ngram_range``.
    clf : estimator
        Classifier handed to ``cross_val_score``.

    Returns
    -------
    str
        Mean F1 over 10 cross-validation folds, formatted with two decimals.

    Notes
    -----
    Reads the module-level ``data`` (messages) and ``y`` (labels).
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_rng)
    features = vectorizer.fit_transform(data)
    fold_scores = cross_val_score(clf, features, y, scoring='f1', cv=10)
    return '%.2f' % fold_scores.mean()

In [96]:
# Same classifier, unigram features only: raw counts (m1) versus TF-IDF (m2).
clf = LogisticRegression(random_state=2)
unigram_scores = (
    test_ngram_range_cv((1,1), clf),
    test_ngram_range_tfidf((1,1), clf),
)
m1, m2 = unigram_scores
unigram_scores

('0.93', '0.85')

In [97]:
# Save the task 9 answer. The value '-1' is hard-coded rather than computed
# from m1/m2 above. NOTE(review): confirm what "-1" encodes for this task.
out('task9.txt', '-1')