In [11]:
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score

In [12]:
# Index file, read line by line by the loading cell: each line is an article
# file name followed by a one-character separator and a single-digit value.
VALUES_FILE = 'DATA/valued_articles.txt'
# Directory prefix prepended to every article file name from VALUES_FILE.
TXT_ROOT = 'DATA/TXT/'
# Tokens are runs of word characters with an optional hyphen inside or at the end.
# The pattern is a raw string so that `\w` reaches the regex engine unchanged;
# in a normal string literal `\w` is an invalid escape sequence (warning today,
# slated to become an error in a future Python release).
tokenizer = nltk.tokenize.RegexpTokenizer(re.compile(r'(\w+-?\w*)+'))
# Part-of-speech tags whose words are discarded before vectorization
# (presumably Penn Treebank tags as returned by nltk.pos_tag: prepositions,
# determiners, modals, particles, pronouns, numbers, conjunctions, ...).
needless_types = {'IN','TO', 'POS', 'DT', 'MD', 'RP', 'PRP$', 'PRP', 'CD','CC'}

In [13]:
# articles[i] is the full text of the i-th listed file, values[i] its integer value.
articles, values = [], []

# Each line of the index file names one article and ends with its value.
with open(VALUES_FILE, 'r') as index_file:
    list_of_names = index_file.read().strip().split('\n')

for name in list_of_names:
    # The last character of the line is parsed as the integer value.
    values.append(int(name[-1:]))
    # The last two characters (presumably separator + value) are cut off to get
    # the file name, which is then resolved against TXT_ROOT.
    article_path = (TXT_ROOT + name[:-2]).strip()
    with open(article_path) as article_file:
        articles.append(article_file.read())
print('number of articles = ', len(articles))

# Class balance: how many articles carry value 1 and how many carry value 0.
pos = values.count(1)
neg = values.count(0)
print('number_of_positive = ', pos, ', number_of_negative = ', neg)

number of articles =  186
number_of_positive =  112 , number_of_negative =  74


In [14]:
def take_valuable_words(article):
    """Tokenize an article and keep only the words worth vectorizing.

    The text is lower-cased, split with the module-level `tokenizer`, and
    tagged with `nltk.pos_tag`. Every token whose tag is listed in
    `needless_types` is dropped.

    Parameters:
        article: the article text as a single string.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    tokens = tokenizer.tokenize(article.lower())
    return [word for word, tag in nltk.pos_tag(tokens) if tag not in needless_types]

In [15]:
# Build the TF-IDF matrix, one row per article, using the POS-filtered tokens
# from take_valuable_words instead of the vectorizer's built-in tokenization.
# NOTE(review): the vectorizer is fitted on every article, including the rows
# that later form the test split, so IDF weights see test documents - confirm
# this is acceptable for the evaluation.
tfidf = TfidfVectorizer(tokenizer=take_valuable_words)
X = tfidf.fit_transform(articles)
print(X.shape)

(186, 7686)


In [16]:
# Ordered (non-shuffled) split: the first `border` rows are used for training
# and everything after them for testing. `border` is five times the integer
# sixth of the article count.
border = 5 * (len(articles) // 6)
X_train, Y_train = X[:border], values[:border]
X_test, Y_test = X[border:], values[border:]

In [17]:
def scorer(estimator, X, Y):
    """Return the ROC AUC of `estimator` on the samples `X` with true values `Y`.

    The score passed to `roc_auc_score` is the second column of
    `estimator.predict_proba(X)` (presumably the probability of class 1 -
    verify against the estimator's class order). The (estimator, X, Y)
    signature lets the function be passed as `scoring` to GridSearchCV.
    """
    second_column_proba = estimator.predict_proba(X)[:, 1]
    return roc_auc_score(Y, second_column_proba)

In [18]:
# Grid search over elastic-net regularisation strength (`alpha`) and L1/L2 mix
# (`l1_ratio`) for an SGD-trained classifier with log loss, scored with the
# `scorer` function under 5-fold cross-validation.
# The estimator is seeded: SGD draws on a random number generator, and without
# a fixed `random_state` the scores and the selected parameters can differ
# from one run to the next.
searcher = GridSearchCV(
    estimator = SGDClassifier(loss='log', random_state=0),
    param_grid = {'penalty': ['elasticnet'],
                  'alpha': [0.001, 0.0001, 0.00001, 0.000001, 0.0000001],
                  'l1_ratio': [0.0, 0.01, 0.05, 0.10, 0.2, 0.3, 0.4, 0.5]
    },
    scoring = scorer,
    cv = 5
)

In [19]:
# Run the grid search on the training split and report the winning configuration.
searcher.fit(X_train, Y_train)
print('best score = ', searcher.best_score_)
print('best params = ', searcher.best_params_)

# Score the best estimator found by the search on the held-out split, using
# the same `scorer` function that drove the search.
best_cls = searcher.best_estimator_
test_auc = scorer(best_cls, X_test, Y_test)
print('test result = ', test_auc)

best score =  0.666792187251
best params =  {'alpha': 0.001, 'l1_ratio': 0.5, 'penalty': 'elasticnet'}
test result =  0.668181818182
