In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from itertools import product
from nltk.corpus import stopwords
%matplotlib inline

Загружаем данные

In [2]:
data = pd.read_csv("products_sentiment_train.tsv", sep='\t', header=None, names=['text', 'target'])
data.head()

Unnamed: 0,text,target
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [3]:
data.iloc[1].text

'i downloaded a trial version of computer associates ez firewall and antivirus and fell in love with a computer security system all over again .'

In [4]:
data.iloc[3].text

'i dont especially like how music files are unstructured ; basically they are just dumped into one folder with no organization , like you might have in windows explorer folders and subfolders .'

In [5]:
print "Размер выборки: %d" % data.shape[0]

Размер выборки: 2000


Посмотрим на доли классов в выборке

In [6]:
part_pos = data.target.sum()/data.shape[0]
print "Доля положительных отзывов: %lf, доля отрицательных отзывов: %lf" %(part_pos, 1 - part_pos)

Доля положительных отзывов: 0.637000, доля отрицательных отзывов: 0.363000


Доля положительных отзывов преобладает в выборке, но сильного дисбаланса классов нет.

Будем извлекать признаки из текста, попробуем использовать как CountVectorizer, так и TF-IDF. В качестве методов классификации будем рассматривать линейные методы, в частности, логистическую регрессию.

In [7]:
%%time
pipe = Pipeline([('Vectorizer', TfidfVectorizer()), ('Classifier', LogisticRegression())])
scores = cross_val_score(pipe, data.text, data.target, cv=5)

CPU times: user 328 ms, sys: 8 ms, total: 336 ms
Wall time: 344 ms


In [8]:
scores.mean()

0.7665031843949025

Создадим ряд моделей с использованием различных методов обработки признаков и обучения.

In [9]:
skf = StratifiedKFold(n_splits=5, shuffle=False)
for vectorizer, classifier in product([CountVectorizer(), TfidfVectorizer()], 
                                      [LogisticRegression(), LinearSVC(), SGDClassifier()]):
    pipe = Pipeline([('Vectorizer', vectorizer), ('Classifier', classifier)])
    
    scores = cross_val_score(pipe, data.text, data.target, cv=skf)
    print "###############################################"
    print "Vectorizer"
    print vectorizer
    print "Clasifier"
    print classifier
    print "CV score mean: %lf,std: %lf" % (scores.mean(), scores.std())
    print "###############################################"

###############################################
Vectorizer
CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
Clasifier
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
CV score mean: 0.768496,std: 0.007634
###############################################
###############################################
Vectorizer
CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content

Лучший результат показал Tf-Idf с LinearSVC, но также близким является CountVectorizer с LogisticRegression.

# TF-IDF + LinearSVC

In [10]:
skf = StratifiedKFold(n_splits=5, shuffle=True)

Baseline - решение.

In [11]:
pipe = Pipeline([('Vectorizer', TfidfVectorizer() ), ('Classifier', LogisticRegression())])
scores = cross_val_score(pipe, data.text, data.target, cv=skf)
scores.mean()

0.77196943105894411

Далее будем настраивать параметры Tf-Idf. Начнем с strip_accents. Будем использовать 'ascii' так именно в этой кодировке записан текст, 'uniode' использовать смысла нет в данном случае.

In [12]:
pipe = Pipeline([('Vectorizer', TfidfVectorizer(strip_accents='ascii') ), ('Classifier', LogisticRegression())])
scores = cross_val_score(pipe, data.text, data.target, cv=skf)
scores.mean()

0.77152316576978597

Данный параметр не влияет на качество. Рассмотрим использование фильтра стоп-слов.

In [13]:
pipe = Pipeline([('Vectorizer', TfidfVectorizer(stop_words='english') ), ('Classifier', LogisticRegression())])
scores = cross_val_score(pipe, data.text, data.target, cv=skf)
scores.mean()

0.74400188126175792

In [14]:
import nltk.

SyntaxError: invalid syntax (<ipython-input-14-1be091cf139f>, line 1)

Качество ухудшилось. Попробуем использовать nltk.corpus.stopwords.words().

In [None]:
pipe = Pipeline([('Vectorizer', TfidfVectorizer(stop_words=stopwords.words('english')) ), ('Classifier', LogisticRegression())])
scores = cross_val_score(pipe, data.text, data.target, cv=skf)
scores.mean()

Все равно качество ухудшилось.

Подберем оптимальное значение min_df (минимальная частота встречаемости слова в документах). 

In [None]:
res_list = []
for m in range(1,14):
    pipe = Pipeline([('Vectorizer', TfidfVectorizer(min_df=m) ), ('Classifier', LogisticRegression())])
    scores = cross_val_score(pipe, data.text, data.target, cv=skf)
    res_list.append(scores.mean())

In [None]:
plt.plot(range(1,14), res_list)
plt.show()

Видно, что качество ухудшается с ростом значения этого параметра.

Попробуем использовать max_df

In [None]:
res_list = []
for m in np.arange(0.3, 1.0, 0.05):
    pipe = Pipeline([('Vectorizer', TfidfVectorizer(max_df=m) ), ('Classifier', LogisticRegression())])
    scores = cross_val_score(pipe, data.text, data.target, cv=skf)
    res_list.append(scores.mean())

In [None]:
plt.plot(np.arange(0.3, 1.0, 0.05), res_list)
plt.show()

Лучше использовать.

In [None]:
res_list = []
for m in np.arange(0.3, 0.4, 0.005):
    pipe = Pipeline([('Vectorizer', TfidfVectorizer(max_df=m) ), ('Classifier', LogisticRegression())])
    scores = cross_val_score(pipe, data.text, data.target, cv=skf)
    res_list.append(scores.mean())

In [None]:
plt.plot(np.arange(0.3, 0.4, 0.005), res_list)
plt.show()
print "Max. arg.: %lf" % (np.arange(0.3, 0.4, 0.005)[np.array(res_list).argmax()])

Использование max_df = 0.345 повышает качество модели.

Попробуем добавить в модель N-граммы.

In [None]:
pipe = Pipeline([('Vectorizer', TfidfVectorizer(ngram_range=(1,2)) ), ('Classifier', LogisticRegression())])
scores = cross_val_score(pipe, data.text, data.target, cv=skf)
scores.mean()

In [None]:
pipe = Pipeline([('Vectorizer', TfidfVectorizer(ngram_range=(1,3)) ), ('Classifier', LogisticRegression())])
scores = cross_val_score(pipe, data.text, data.target, cv=skf)
scores.mean()

Качество ухудшается. Попробуем использовать в качестве analyzer 'char_wb'.

In [None]:
pipe = Pipeline([('Vectorizer', TfidfVectorizer(ngram_range=(3,5), analyzer='char_wb') ), ('Classifier', LogisticRegression())])
scores = cross_val_score(pipe, data.text, data.target, cv=skf)
scores.mean()

Качество улучшилось, если сравнивать с baseline-решением. Попробуем подобрать оптимальный параметр для ngram_range.

In [None]:
res_list = []
for n in range(5, 10):
    pipe = Pipeline([('Vectorizer', TfidfVectorizer(ngram_range=(3, n), analyzer='char_wb') ), ('Classifier', LogisticRegression())])
    scores = cross_val_score(pipe, data.text, data.target, cv=skf)
    res_list.append(scores.mean())

In [None]:
plt.plot(range(5,10), res_list)
plt.show()

Качество ухудшается с ростом n.

Подберем параметры LogisticRegression.

In [None]:
pipe = Pipeline([('Vectorizer', TfidfVectorizer(ngram_range=(3,5), analyzer='char_wb') ), ('Classifier', LogisticRegression())])
scores = cross_val_score(pipe, data.text, data.target, cv=skf)
scores.mean()

In [None]:
res_list = []
for C in np.linspace(1e-4,1e4,10):
    pipe = Pipeline([('Vectorizer', TfidfVectorizer(ngram_range=(3, 5), analyzer='char_wb')), ('Classifier', LogisticRegression(C=C))])
    scores = cross_val_score(pipe, data.text, data.target, cv=skf)
    res_list.append(scores.mean())

plt.plot(np.linspace(1e-4,1e4,10), res_list)
plt.show()

In [None]:
res_list = []
for C in np.linspace(0.1,10.1,10):
    pipe = Pipeline([('Vectorizer', TfidfVectorizer(ngram_range=(3, 5), analyzer='char_wb')), ('Classifier', LogisticRegression(C=C))])
    scores = cross_val_score(pipe, data.text, data.target, cv=skf)
    res_list.append(scores.mean())

plt.plot(np.linspace(0.1,10.1,10), res_list)
plt.show()

In [None]:
res_list = []
for C in np.linspace(2,4,30):
    pipe = Pipeline([('Vectorizer', TfidfVectorizer(ngram_range=(3,5), analyzer='char_wb')), ('Classifier', LogisticRegression(C=C))])
    scores = cross_val_score(pipe, data.text, data.target, cv=skf)
    res_list.append(scores.mean())

print np.linspace(2,4,30)[np.array(res_list).argmax()], np.array(res_list).max()

plt.plot(np.linspace(2,4,30), res_list)
plt.show()

In [None]:
pipe = Pipeline([('Vectorizer', TfidfVectorizer(ngram_range=(3, 5), analyzer='char_wb')), ('Classifier', LogisticRegression(C=3.2413793103448274))])
scores = cross_val_score(pipe, data.text, data.target, cv=skf)
scores.mean()

In [107]:
pipe = Pipeline([('Vectorizer', TfidfVectorizer(ngram_range=(3, 5), analyzer='char_wb') ), ('Classifier', LogisticRegression(C=3.2413793103448274))])
pipe.fit(data.text, data.target)

Pipeline(steps=[('Vectorizer', TfidfVectorizer(analyzer='char_wb', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(3, 5), norm=u'l2', preprocessor=None, smooth_...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [108]:
data_test = pd.read_csv("products_sentiment_test.tsv", sep='\t', index_col='Id')

In [109]:
data_test.head()

Unnamed: 0_level_0,text
Id,Unnamed: 1_level_1
0,"so , why the small digital elph , rather than ..."
1,3/4 way through the first disk we played on it...
2,better for the zen micro is outlook compatibil...
3,6 . play gameboy color games on it with goboy .
4,"likewise , i 've heard norton 2004 professiona..."


In [110]:
y_pred = pipe.predict(data_test.text)

In [111]:
data_pred = data_test.drop('text', axis=1)
data_pred['y'] = y_pred

In [113]:
data_pred.head()

Unnamed: 0_level_0,y
Id,Unnamed: 1_level_1
0,1
1,0
2,1
3,1
4,0


In [114]:
data_pred.to_csv("pred.csv")