In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

In [2]:
import re
import codecs
import string
import numpy as np
import xml.etree.ElementTree as ET
import pandas as pd

from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import RussianStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
%matplotlib inline

In [3]:
def xml_to_df(xml_data):
    """Parse an XML document into a DataFrame with one row per record.

    Every direct child of the root element becomes one row. Each of that
    child's own children contributes a column named after its tag; the cell
    value is the element text with surrounding whitespace stripped and each
    newline replaced by a space.

    Args:
        xml_data: XML document in a form accepted by ``ET.XML``.

    Returns:
        pandas.DataFrame built from the list of per-record dicts.
    """
    root = ET.XML(xml_data)
    all_records = []
    for child in root:
        record = {}
        for subchild in child:
            # An empty element (e.g. ``<url/>``) has ``text`` set to None;
            # store an empty string instead of failing on ``None.strip()``.
            text = subchild.text or ''
            record[subchild.tag] = text.strip().replace('\n', ' ')
        all_records.append(record)
    return pd.DataFrame(all_records)

In [4]:
def tokenize(text):
    """Split `text` into tokens with NLTK's RegexpTokenizer.

    A token is either a run of word characters or, failing that, a run of
    non-whitespace characters.

    Args:
        text: string to tokenize.

    Returns:
        The result of ``RegexpTokenizer.tokenize(text)``.
    """
    # Raw string: ``\w`` and ``\S`` are regex escapes, not Python string
    # escapes. The non-raw form is an invalid escape sequence that newer
    # Python 3 versions warn about.
    tokenizer = RegexpTokenizer(r'\w+|\S+')
    return tokenizer.tokenize(text)

def stem_text(text):
    """Stem every item of `text` with NLTK's RussianStemmer.

    Despite its name, `text` is iterated element by element; `preprocess`
    passes the token list produced by `tokenize`.

    Returns:
        list with one stemmed value per input item, in the same order.
    """
    stemmer = RussianStemmer()
    stems = []
    for token in text:
        stems.append(stemmer.stem(token))
    return stems

def prepare_text(text):
    """Lower-case `text` and delete ASCII punctuation characters from it.

    The removed characters are exactly those listed in the character class
    below. The apostrophe (') is not among them, so it is kept. Matched
    characters are deleted outright, not replaced by spaces, so the text on
    either side of them is joined.
    """
    lowered = text.lower()
    punctuation = re.compile(r'[\!\"\#\$\%\&\\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]')
    return punctuation.sub('', lowered)

**Загрузим данные в формате XML и запишем их в DataFrame**

In [5]:
# Read both XML files as raw bytes and close the handles as soon as the data
# is in memory. Binary mode leaves decoding to the XML parser instead of the
# platform's default text encoding.
with open('train/news_eval_train.xml', 'rb') as train_file:
    xml_train_data = train_file.read()
with open('test/news_eval_test.xml', 'rb') as test_file:
    xml_test_data = test_file.read()

In [6]:
# Parse the raw train and test XML into DataFrames (one row per record),
# then preview the training frame.
df_train, df_test = map(xml_to_df, (xml_train_data, xml_test_data))
df_train.head()

Unnamed: 0,evaluation,speech,url
0,0,"Далее в своей проповеди он напомнил, что по би...",http://www.blagovest-info.ru/index.php?ss=2&am...
1,-,Меня отпустили. У Коли @nlyaskin забирают вещи...,http://asiareport.ru/index.php/news/14440-chir...
2,+-,"Мои игроки не разочаровали меня, даже слегка н...",http://www.soccer.ru/news/290704.shtml
3,0,В интервью РИА Новости уполномоченный по права...,http://www.rosbalt.ru/federal/2012/04/08/96718...
4,+-,"Психологи начнут работать с двумя девушками, Д...",http://www.rosbalt.ru/federal/2012/04/08/96718...


**В данных встречаются лишние оценки**

In [7]:
# Sorted distinct labels present in the training set.
np.unique(df_train['evaluation'])

array(['+', '+-', '-', '0'], dtype=object)

In [8]:
# Sorted distinct labels present in the test set.
np.unique(df_test['evaluation'])

array(['+', '+-', '-', '--', '-no', '0', 'n-', 'n0', 'no'], dtype=object)

**Удаляем ненужные**

In [9]:
# Drop every test row whose label is one of the unwanted values, using a
# single membership mask instead of one filter per label.
df_test = df_test[~df_test['evaluation'].isin(['+-', '--', '-no', 'n-', 'n0', 'no'])]
df_test.head()

Unnamed: 0,evaluation,speech,url
0,0,"Джазбэнд под руководством Пеппе Сервилло, кото...",http://www.inmsk.ru/news_culture/20120914/3516...
2,-,Посредством этих структур десяткам тысяч избир...,"http://www.dw.de/dw/article/0,,16237071,00.htm..."
3,-,Появилось очень много бедных избирателей. В се...,"http://www.dw.de/dw/article/0,,16237071,00.htm..."
4,-,За теленовостями - главным источником информац...,"http://www.dw.de/dw/article/0,,16237071,00.htm..."
5,-,"Такого раньше никогда не было, чтобы местные ч...","http://www.dw.de/dw/article/0,,16237071,00.htm..."


In [10]:
# Drop training rows labelled '+-'.
df_train = df_train.loc[df_train['evaluation'] != '+-']
df_train.head()

Unnamed: 0,evaluation,speech,url
0,0,"Далее в своей проповеди он напомнил, что по би...",http://www.blagovest-info.ru/index.php?ss=2&am...
1,-,Меня отпустили. У Коли @nlyaskin забирают вещи...,http://asiareport.ru/index.php/news/14440-chir...
3,0,В интервью РИА Новости уполномоченный по права...,http://www.rosbalt.ru/federal/2012/04/08/96718...
7,0,Бывший главный тренер сборной Англии Грэм Тэйл...,http://www.sports.ru/football/139754406.html
8,-,На телах жертв были обнаружены многочисленные ...,http://moldinfo.ru/narod/2939-jitel-vengerskoy...


**Выборки для обучения и тестирования**

In [11]:
# Texts (speech) are the features; evaluation labels are the targets.
X_train, y_train = df_train['speech'].values, df_train['evaluation'].values
X_test, y_test = df_test['speech'].values, df_test['evaluation'].values

**Обработаем текст**

Текст приводится к нижнему регистру, удаляются знаки пунктуации (ASCII-символы, кроме апострофа), затем текст токенизируется и выполняется стемминг

In [12]:
def preprocess(X):
    """Normalise every text in `X` and return the results as a list of strings.

    Each text goes through `prepare_text`, `tokenize` and `stem_text`; the
    stems are joined back with single spaces and the joined string is passed
    through `prepare_text` once more.
    """
    processed = []
    for raw in X:
        stems = stem_text(tokenize(prepare_text(raw)))
        processed.append(prepare_text(' '.join(stems)))
    return processed

In [13]:
# Run the same preprocessing over the training and the test texts.
X_train_prepare, X_test_prepare = map(preprocess, (X_train, X_test))

**Для каждой модели будем использовать два варианта признаков: подсчёт n-грамм и веса tf-idf**

**Логистическая регрессия**

In [14]:
# Logistic regression on unigram+bigram counts, and on unigram tf-idf weights
# with min_df=3.
model_reg = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 2), min_df=1)),
    ('log_reg', LogisticRegression()),
])
model_reg_tfidf = Pipeline(steps=[
    ('tf-idf', TfidfVectorizer(ngram_range=(1, 1), min_df=3)),
    ('log_reg', LogisticRegression()),
])

**Метод опорных векторов**

In [15]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained by SGD with hinge loss and L2 penalty, once on
# unigram counts and once on unigram tf-idf weights. Both use the same
# classifier settings and a fixed random_state.
model_svm = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 1), min_df=1)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          random_state=42, max_iter=5, tol=None)),
])
model_svm_tfidf = Pipeline(steps=[
    ('tf-idf', TfidfVectorizer(ngram_range=(1, 1), min_df=1)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          random_state=42, max_iter=5, tol=None)),
])

**Наивный байесовский классификатор с мультиномиальной моделью событий**

In [16]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on unigram counts (min_df=3) and on unigram tf-idf
# weights (min_df=9).
model_nbm = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 1), min_df=3)),
    ('nb', MultinomialNB()),
])
model_nbm_tfidf = Pipeline(steps=[
    ('tf-idf', TfidfVectorizer(ngram_range=(1, 1), min_df=9)),
    ('nb', MultinomialNB()),
])

**Наивный байесовский классификатор с Бернуллиевской моделью событий**

In [17]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes on unigram counts (min_df=1) and on unigram tf-idf
# weights (min_df=3).
model_nbb = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 1), min_df=1)),
    ('nb', BernoulliNB()),
])
model_nbb_tfidf = Pipeline(steps=[
    ('tf-idf', TfidfVectorizer(ngram_range=(1, 1), min_df=3)),
    ('nb', BernoulliNB()),
])

**Обучаем модели**

In [18]:
# Fit both logistic-regression pipelines on the preprocessed training texts.
# The second fit() call is the cell's last expression, so its return value is
# what the notebook displays.
model_reg.fit(X_train_prepare, y_train)
model_reg_tfidf.fit(X_train_prepare, y_train)

Pipeline(memory=None,
     steps=[('tf-idf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tr...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [19]:
# Fit both SGD (hinge-loss) pipelines on the preprocessed training texts.
# The second fit() call is the cell's last expression, so its return value is
# what the notebook displays.
model_svm.fit(X_train_prepare, y_train)
model_svm_tfidf.fit(X_train_prepare, y_train)

Pipeline(memory=None,
     steps=[('tf-idf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tr...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [20]:
# Fit both multinomial naive Bayes pipelines on the preprocessed training texts.
# The second fit() call is the cell's last expression, so its return value is
# what the notebook displays.
model_nbm.fit(X_train_prepare, y_train)
model_nbm_tfidf.fit(X_train_prepare, y_train)


Pipeline(memory=None,
     steps=[('tf-idf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=9,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tr...True,
        vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [21]:
# Fit both Bernoulli naive Bayes pipelines on the preprocessed training texts.
# The second fit() call is the cell's last expression, so its return value is
# what the notebook displays.
model_nbb.fit(X_train_prepare, y_train)
model_nbb_tfidf.fit(X_train_prepare, y_train)

Pipeline(memory=None,
     steps=[('tf-idf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tr...  vocabulary=None)), ('nb', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))])

**Предсказываем по тестовой выборке**

In [22]:
# Predict labels for the preprocessed test texts with each fitted pipeline.
# The `_tfidf` suffix marks the variant whose vectorizer is TfidfVectorizer.

# Logistic regression
preds_reg = model_reg.predict(X_test_prepare)
preds_reg_tfidf = model_reg_tfidf.predict(X_test_prepare)

# SGD classifier with hinge loss
preds_svm = model_svm.predict(X_test_prepare)
preds_svm_tfidf = model_svm_tfidf.predict(X_test_prepare)

# Multinomial naive Bayes
preds_nbm = model_nbm.predict(X_test_prepare)
preds_nbm_tfidf = model_nbm_tfidf.predict(X_test_prepare)

# Bernoulli naive Bayes
preds_nbb = model_nbb.predict(X_test_prepare)
preds_nbb_tfidf = model_nbb_tfidf.predict(X_test_prepare)

**В качестве оценки выводим F-меру**

In [23]:
# Print one classification report per model. Each title is concatenated with
# its report and passed to print() as a single parenthesised argument, which
# is valid and produces the same text on both Python 2 and Python 3 (the
# original print statements are a SyntaxError on Python 3).
print("LogisticRegression: \n" + classification_report(y_test, preds_reg))
print("LogisticRegression with tf-idf: \n" + classification_report(y_test, preds_reg_tfidf))

print("SGDClassifier: \n" + classification_report(y_test, preds_svm))
print("SGDClassifier with tf-idf: \n" + classification_report(y_test, preds_svm_tfidf))

print("MultinomialNB: \n" + classification_report(y_test, preds_nbm))
print("MultinomialNB with tf-idf: \n" + classification_report(y_test, preds_nbm_tfidf))

print("BernoulliNB: \n" + classification_report(y_test, preds_nbb))
print("BernoulliNB with tf-idf: \n" + classification_report(y_test, preds_nbb_tfidf))

LogisticRegression: 
             precision    recall  f1-score   support

          +       0.57      0.66      0.61      1448
          -       0.66      0.77      0.71      1890
          0       0.53      0.30      0.39      1235

avg / total       0.60      0.61      0.59      4573

LogisticRegression with tf-idf: 
             precision    recall  f1-score   support

          +       0.61      0.64      0.62      1448
          -       0.62      0.86      0.72      1890
          0       0.60      0.20      0.30      1235

avg / total       0.61      0.61      0.57      4573

SGDClassifier: 
             precision    recall  f1-score   support

          +       0.53      0.68      0.59      1448
          -       0.70      0.65      0.68      1890
          0       0.46      0.36      0.40      1235

avg / total       0.58      0.58      0.58      4573

SGDClassifier with tf-idf: 
             precision    recall  f1-score   support

          +       0.64      0.58      0.61  

**Отсортированные результаты**

| BernoulliNB tf-idf | MultinomialNB   | LogisticRegression | SGDClassifier   | LogisticRegression tf-idf | MultinomialNB tf-idf   | BernoulliNB |  SGDClassifier tf-idf  |
|------|------|------|------|------|------|------|------|
|    0.62  | 0.61 |   0.59  | 0.58|  0.57   | 0.55|   0.54  |0.53|

*Дополнительно была рассмотрена модель с Gradient Boosting*

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting on unigram counts and on unigram tf-idf weights.
# random_state is fixed (same seed as the SGD pipelines) so that repeated runs
# give the same trees; the tf-idf step is named 'tf-idf', matching the other
# tf-idf pipelines in this notebook.
model_gb = Pipeline([
    ('count', CountVectorizer(ngram_range=(1, 1))),
    ('grad_boost', GradientBoostingClassifier(max_depth=10, verbose=True,
                                              n_estimators=100, random_state=42)),
])
model_gb_tfidf = Pipeline([
    ('tf-idf', TfidfVectorizer(ngram_range=(1, 1), min_df=1)),
    ('grad_boost', GradientBoostingClassifier(max_depth=10, verbose=True,
                                              n_estimators=100, random_state=42)),
])

In [25]:
# Fit the count-based gradient boosting pipeline, then predict on the test texts.
# NOTE(review): `preds_bg` is presumably a transposition of "gb"; the reporting
# cell reads it under this name, so rename both together if at all.
model_gb.fit(X_train_prepare, y_train)
preds_bg = model_gb.predict(X_test_prepare)

      Iter       Train Loss   Remaining Time 
         1        4011.3752           37.92s
         2        3889.3819           37.98s
         3        3776.9388           37.96s
         4        3671.5761           38.40s
         5        3589.6046           38.32s
         6        3510.7869           38.76s
         7        3440.9756           38.50s
         8        3381.6132           37.81s
         9        3311.6199           37.74s
        10        3245.0917           37.53s
        20        2809.8752           31.80s
        30        2549.7547           25.99s
        40        2371.8570           21.03s
        50        2226.1708           16.80s
        60        2102.8243           13.00s
        70        1991.7791            9.47s
        80        1903.9930            6.14s
        90        1822.2759            2.99s
       100        1753.0031            0.00s


In [26]:
# Fit the tf-idf gradient boosting pipeline, then predict on the test texts.
# NOTE(review): `preds_bg_tfidf` is presumably a transposition of "gb"; the
# reporting cell reads it under this name, so rename both together if at all.
model_gb_tfidf.fit(X_train_prepare, y_train)
preds_bg_tfidf = model_gb_tfidf.predict(X_test_prepare)

      Iter       Train Loss   Remaining Time 
         1        4002.4587           49.10s
         2        3869.1866           49.54s
         3        3742.4583           49.66s
         4        3635.7862           49.49s
         5        3537.3250           49.23s
         6        3444.3293           49.18s
         7        3373.2171           48.56s
         8        3301.4498           47.92s
         9        3227.9904           48.01s
        10        3162.2816           47.40s
        20        2717.3342           40.00s
        30        2443.9638           33.48s
        40        2239.3267           27.81s
        50        2074.4934           22.62s
        60        1941.9084           17.80s
        70        1829.0827           13.16s
        80        1729.8579            8.69s
        90        1638.9007            4.31s
       100        1553.8825            0.00s


In [27]:
# Classification reports for the two gradient boosting pipelines. A single
# parenthesised argument keeps print() valid, with identical output, on both
# Python 2 and Python 3.
print("GradientBoostingClassifier: \n" + classification_report(y_test, preds_bg))
print("GradientBoostingClassifier with tf-idf: \n" + classification_report(y_test, preds_bg_tfidf))

GradientBoostingClassifier: 
             precision    recall  f1-score   support

          +       0.59      0.58      0.58      1448
          -       0.58      0.82      0.68      1890
          0       0.46      0.17      0.24      1235

avg / total       0.55      0.57      0.53      4573

GradientBoostingClassifier with tf-idf: 
             precision    recall  f1-score   support

          +       0.58      0.56      0.57      1448
          -       0.58      0.79      0.67      1890
          0       0.47      0.22      0.30      1235

avg / total       0.55      0.56      0.54      4573

