In [1]:
%pylab inline
import matplotlib.pyplot as plt

import numpy as np
import gensim
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from nltk.corpus import stopwords

import scipy.sparse as sp

from sklearn.naive_bayes import MultinomialNB

import sklearn.metrics

import pymorphy2 as pm2

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from hyperopt.pyll import scope
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

Populating the interactive namespace from numpy and matplotlib


Using Theano backend.


#### Загрузим данные

In [2]:
# All three files share the same parsing options: comma separator,
# quoting mode 0 (csv.QUOTE_MINIMAL) and UTF-8 encoded text.
_csv_options = dict(sep=',', quoting=0, encoding='utf-8')

category_raw = pd.read_csv('data/category.csv', **_csv_options)
test_raw = pd.read_csv('data/test.csv', **_csv_options)
train_raw = pd.read_csv('data/train.csv', **_csv_options)

In [3]:
# Build working copies with lower-cased text columns; the *_raw frames are
# left untouched so the cell can be re-run safely.
test = test_raw.assign(
    title=test_raw['title'].str.lower(),
    description=test_raw['description'].str.lower(),
)

train = train_raw.assign(
    title=train_raw['title'].str.lower(),
    description=train_raw['description'].str.lower(),
)

---

#### Уберем все символы, кроме русских и английских букв и цифр. Разделим русские и английские слова.

In [4]:
# Replace every character that is not a lowercase Latin letter, a Cyrillic
# letter in the а-я range, or a digit with a single space.
#
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, which would silently
# turn this cleanup into a no-op.
#
# NOTE(review): 'ё' (U+0451) lies outside the а-я range, so it is replaced by
# a space as well — confirm this is intended.
for frame in (train, test):
    for column in ('title', 'description'):
        frame[column] = frame[column].str.replace(u'[^a-zа-я0-9]', ' ', regex=True)

In [5]:
# Derive three helper columns from the cleaned title of each frame:
#   'title eng'     - first run that starts with Latin letters and continues
#                     with Latin letters, digits or spaces (NaN when absent);
#   'title eng all' - the same run with all spaces removed;
#   'title ru'      - the title with everything except а-я letters and spaces
#                     removed.
# regex=True is explicit because pandas >= 2.0 defaults `str.replace` to
# literal matching, which would leave both replaced columns unchanged.
for frame in (test, train):
    frame['title eng'] = frame['title'].str.extract(u'([a-z]+[0-9a-z ]*)', expand=False)
    frame['title eng all'] = frame['title eng'].str.replace(u'[ ]', '', regex=True)
    frame['title ru'] = frame['title'].str.replace(u'[^а-я ]', '', regex=True)

In [6]:
# Derive three helper columns from the cleaned description of each frame:
#   'description eng'     - first run that starts with Latin letters and
#                           continues with Latin letters, digits or spaces
#                           (NaN when absent);
#   'description eng all' - the same run with all spaces removed;
#   'description ru'      - the description with everything except а-я
#                           letters and spaces removed.
# regex=True is explicit because pandas >= 2.0 defaults `str.replace` to
# literal matching, which would leave both replaced columns unchanged.
for frame in (test, train):
    frame['description eng'] = frame['description'].str.extract(u'([a-z]+[0-9a-z ]*)', expand=False)
    frame['description eng all'] = frame['description eng'].str.replace(u'[ ]', '', regex=True)
    frame['description ru'] = frame['description'].str.replace(u'[^а-я ]', '', regex=True)

---

#### Нормализуем русские слова

In [7]:
# Shared pymorphy2 analyzer instance; f_tokenizer reads it as a module-level global.
morph = pm2.MorphAnalyzer()

def f_tokenizer(string, fmt=('NOUN',), analyzer=None):
    """Lemmatise `string` and keep only words with the requested parts of speech.

    Each whitespace-separated word is parsed with the morphological analyzer,
    only the first parse variant is considered, and the word's normal form is
    kept when that variant's POS tag is listed in `fmt`.

    Parameters
    ----------
    string : str
        Text to process. Any non-string value (e.g. NaN or None coming from a
        pandas column with missing text) yields an empty result instead of
        raising.
    fmt : collection of str, optional
        POS tags to keep. Besides 'NOUN', the tags mentioned alongside the
        original implementation were 'NUMR', 'PREP', 'CONJ', 'PRCL', 'INTJ'
        and 'ADJF'.
    analyzer : object, optional
        Object exposing `parse(word)` that returns a sequence of parse
        variants with `.tag.POS` and `.normal_form`. Defaults to the
        module-level `morph` analyzer.

    Returns
    -------
    str
        Normal forms of the kept words joined by single spaces (no leading
        space); an empty string when nothing is kept.
    """
    if not isinstance(string, str):
        # Missing text shows up as float NaN / None, which has no .split().
        return ''
    if analyzer is None:
        analyzer = morph

    kept = []
    # split() without arguments drops empty tokens produced by repeated spaces.
    for wrd in string.split():
        wrd_prs = analyzer.parse(wrd)[0]
        if wrd_prs.tag.POS in fmt:
            kept.append(wrd_prs.normal_form)
    return ' '.join(kept)

In [None]:
# Titles keep nouns, full adjectives and full participles; descriptions keep
# nouns only. The POS list is forwarded to f_tokenizer as a keyword argument
# through Series.apply.
title_pos = ['NOUN', 'ADJF', 'PRTF']
description_pos = ['NOUN']

test['title ru tok'] = test['title ru'].apply(f_tokenizer, fmt=title_pos)
train['title ru tok'] = train['title ru'].apply(f_tokenizer, fmt=title_pos)

test['description ru tok'] = test['description ru'].apply(f_tokenizer, fmt=description_pos)
train['description ru tok'] = train['description ru'].apply(f_tokenizer, fmt=description_pos)

---

#### Вместо списка стоп-слов оставим только слова, которые встречаются и в обучающей, и в тестовой выгрузке

In [9]:
# Fit an independent vectorizer on each split to learn its vocabulary of
# lemmatised Russian title words.
title_vectorizer_test = CountVectorizer()
title_vector_test = title_vectorizer_test.fit_transform(test['title ru tok'])

title_vectorizer_train = CountVectorizer()
title_vector_train = title_vectorizer_train.fit_transform(train['title ru tok'])

# Keep only the words seen in BOTH splits. The fitted `vocabulary_` mapping
# (term -> column index) is read instead of `get_feature_names()`, which was
# removed in scikit-learn 1.2; `vocabulary_` is available in every version.
title_words = set(title_vectorizer_test.vocabulary_).intersection(
    title_vectorizer_train.vocabulary_)

# Vocabulary sizes: test split, train split, shared.
print(len(title_vectorizer_test.vocabulary_),
      len(title_vectorizer_train.vocabulary_),
      len(title_words))

29026 41189 19672


In [10]:
# One vectorizer restricted to the shared title vocabulary, so the train and
# test matrices have identical columns.
title_vectorizer = CountVectorizer(vocabulary=title_words)

title_vector_train, title_vector_test = (
    title_vectorizer.transform(frame['title ru tok']) for frame in (train, test)
)

In [11]:
# Fit an independent vectorizer on each split to learn its vocabulary of
# lemmatised Russian description words.
desc_vectorizer_test = CountVectorizer()
desc_vector_test = desc_vectorizer_test.fit_transform(test['description ru tok'])

desc_vectorizer_train = CountVectorizer()
desc_vector_train = desc_vectorizer_train.fit_transform(train['description ru tok'])

# Keep only the words seen in BOTH splits. The fitted `vocabulary_` mapping
# (term -> column index) is read instead of `get_feature_names()`, which was
# removed in scikit-learn 1.2; `vocabulary_` is available in every version.
desc_words = set(desc_vectorizer_test.vocabulary_).intersection(
    desc_vectorizer_train.vocabulary_)

# Vocabulary sizes: test split, train split, shared.
print(len(desc_vectorizer_test.vocabulary_),
      len(desc_vectorizer_train.vocabulary_),
      len(desc_words))

59128 84436 39742


In [12]:
# One vectorizer restricted to the shared description vocabulary, so the test
# and train matrices have identical columns.
desc_vectorizer = CountVectorizer(vocabulary=desc_words)

desc_vector_test, desc_vector_train = (
    desc_vectorizer.transform(frame['description ru tok']) for frame in (test, train)
)

In [13]:
# Fit an independent vectorizer on each split to learn its vocabulary of
# Latin-script title tokens; rows without a Latin fragment (NaN) are treated
# as empty documents.
title_vectorizer_test_eng = CountVectorizer()
title_vector_test_eng = title_vectorizer_test_eng.fit_transform(test['title eng'].fillna(''))

title_vectorizer_train_eng = CountVectorizer()
title_vector_train_eng = title_vectorizer_train_eng.fit_transform(train['title eng'].fillna(''))

# Keep only the tokens seen in BOTH splits. The fitted `vocabulary_` mapping
# (term -> column index) is read instead of `get_feature_names()`, which was
# removed in scikit-learn 1.2; `vocabulary_` is available in every version.
title_words_eng = set(title_vectorizer_test_eng.vocabulary_).intersection(
    title_vectorizer_train_eng.vocabulary_)

# Vocabulary sizes: test split, train split, shared.
print(len(title_vectorizer_test_eng.vocabulary_),
      len(title_vectorizer_train_eng.vocabulary_),
      len(title_words_eng))

41805 64942 23433


In [14]:
# One vectorizer restricted to the shared Latin-script title vocabulary;
# missing fragments (NaN) are vectorized as empty documents.
title_vectorizer_eng = CountVectorizer(vocabulary=title_words_eng)

title_vector_train_eng, title_vector_test_eng = (
    title_vectorizer_eng.transform(frame['title eng'].fillna('')) for frame in (train, test)
)

In [15]:
%%time
# Fit an independent vectorizer on each split to learn its vocabulary of
# Latin-script description tokens; rows without a Latin fragment (NaN) are
# treated as empty documents.
desc_vectorizer_test_eng = CountVectorizer()
desc_vector_test_eng = desc_vectorizer_test_eng.fit_transform(test['description eng'].fillna(''))

desc_vectorizer_train_eng = CountVectorizer()
desc_vector_train_eng = desc_vectorizer_train_eng.fit_transform(train['description eng'].fillna(''))

# Keep only the tokens seen in BOTH splits. The fitted `vocabulary_` mapping
# (term -> column index) is read instead of `get_feature_names()`, which was
# removed in scikit-learn 1.2; `vocabulary_` is available in every version.
desc_words_eng = set(desc_vectorizer_test_eng.vocabulary_).intersection(
    desc_vectorizer_train_eng.vocabulary_)

# Vocabulary sizes: test split, train split, shared.
print(len(desc_vectorizer_test_eng.vocabulary_),
      len(desc_vectorizer_train_eng.vocabulary_),
      len(desc_words_eng))

42441 66110 23747
CPU times: user 3.85 s, sys: 11 ms, total: 3.86 s
Wall time: 3.86 s


In [16]:
# One vectorizer restricted to the shared Latin-script description
# vocabulary; missing fragments (NaN) are vectorized as empty documents.
desc_vectorizer_eng = CountVectorizer(vocabulary=desc_words_eng)

desc_vector_test_eng, desc_vector_train_eng = (
    desc_vectorizer_eng.transform(frame['description eng'].fillna('')) for frame in (test, train)
)

#### Объединим нормализованные данные в единый мешок слов 

In [17]:
# Column layout of both matrices: price, Russian title words, Russian
# description words, Latin title tokens, Latin description tokens.
train_blocks = [
    np.array(train['price']).reshape(-1, 1),  # price as a single dense column
    title_vector_train,
    desc_vector_train,
    title_vector_train_eng,
    desc_vector_train_eng,
]
test_blocks = [
    np.array(test['price']).reshape(-1, 1),
    title_vector_test,
    desc_vector_test,
    title_vector_test_eng,
    desc_vector_test_eng,
]

train_hs = sp.hstack(train_blocks)
test_hs = sp.hstack(test_blocks)

In [18]:
# Display the stacked training matrix; its repr reports shape, dtype and sparse format.
train_hs

<489517x106595 sparse matrix of type '<class 'numpy.float64'>'
	with 8608287 stored elements in COOrdinate format>

---

---

#### Разделим данные на тестовую и обучающую выборки

In [19]:
# Hold out 20% of the labelled data for evaluation, stratified by category so
# both parts keep the same class proportions.
# random_state is fixed so the split - and every metric reported below - is
# reproducible after Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    train_hs, train.category_id,
    test_size=0.2, stratify=train.category_id,
    random_state=42,
)

#### Обучим наивный байесовский классификатор для сравнения

In [20]:
# Baseline: multinomial naive Bayes with default settings on the training part.
clf = MultinomialNB().fit(X=x_train, y=y_train)

In [21]:
# Predicted category labels of the baseline for the held-out part.
predictednb = clf.predict(X=x_test)

In [22]:
# Per-category precision / recall / F1 of the naive Bayes baseline.
# NOTE(review): target_names are taken in the row order of category_raw -
# presumably that order matches the sorted category ids; confirm.
print(sklearn.metrics.classification_report(
    y_true=y_test,
    y_pred=predictednb,
    target_names=category_raw['name'],
))

                                                                     precision    recall  f1-score   support

                                Бытовая электроника|Телефоны|iPhone       0.45      0.93      0.61      1772
                                       Бытовая электроника|Ноутбуки       0.90      0.85      0.88      1604
                               Бытовая электроника|Телефоны|Samsung       0.84      0.70      0.76      1977
          Бытовая электроника|Планшеты и электронные книги|Планшеты       0.88      0.66      0.75      1721
  Бытовая электроника|Игры, приставки и программы|Игровые приставки       0.80      0.88      0.84      1723
           Бытовая электроника|Аудио и видео|Телевизоры и проекторы       0.87      0.87      0.87      1648
                          Бытовая электроника|Телефоны|Другие марки       0.62      0.55      0.58      1739
                          Бытовая электроника|Настольные компьютеры       0.81      0.93      0.87      1718
 Бытовая электрони

In [23]:
# Overall accuracy of the naive Bayes baseline on the held-out part.
accuracy_score(y_true=y_test, y_pred=predictednb)

0.72807035463310998

---

#### Обучим LightGBM

In [24]:
# LightGBM datasets for the training and held-out parts.
# `max_bin` is passed through `params`: it is no longer accepted as a direct
# constructor keyword by current LightGBM releases.
dtrain = lgb.Dataset(x_train, label=y_train, params={'max_bin': 2})
# The evaluation set references the training set so that both share the same
# feature binning, as LightGBM requires for validation data.
dtest = lgb.Dataset(x_test, label=y_test, reference=dtrain)

In [25]:
# LightGBM training parameters used for both the hold-out run and the final
# model trained on all labelled data.
best_params = dict(
    # task definition
    application='multiclass',
    boosting_type='gbdt',
    metric='multi_error',
    num_class=54,
    num_threads=7,

    # tree growth
    learning_rate=0.1,
    num_leaves=211,
    min_data_in_leaf=4,

    # row / column subsampling
    bagging_fraction=1,
    feature_fraction=0.2,
)

In [None]:
# Train on the training part and predict class probabilities for the
# held-out part (one column per class).
# `verbose_eval` is dropped: it was removed from lgb.train in LightGBM 4.0 and
# had no effect here anyway because no validation sets are passed.
model = lgb.train(best_params, dtrain, num_boost_round=153)
predicted2a = model.predict(x_test)

In [38]:
# Per-category precision / recall / F1 of the LightGBM model; the predicted
# class is the column with the highest probability.
# NOTE(review): target_names are taken in the row order of category_raw -
# presumably that order matches the sorted category ids; confirm.
print(sklearn.metrics.classification_report(
    y_true=y_test,
    y_pred=np.argmax(predicted2a, axis=1),
    target_names=category_raw['name'],
))

                                                                     precision    recall  f1-score   support

                                Бытовая электроника|Телефоны|iPhone       0.94      0.94      0.94      1772
                                       Бытовая электроника|Ноутбуки       0.96      0.96      0.96      1604
                               Бытовая электроника|Телефоны|Samsung       0.91      0.91      0.91      1977
          Бытовая электроника|Планшеты и электронные книги|Планшеты       0.96      0.93      0.95      1721
  Бытовая электроника|Игры, приставки и программы|Игровые приставки       0.94      0.93      0.94      1723
           Бытовая электроника|Аудио и видео|Телевизоры и проекторы       0.95      0.96      0.96      1648
                          Бытовая электроника|Телефоны|Другие марки       0.82      0.85      0.84      1739
                          Бытовая электроника|Настольные компьютеры       0.97      0.96      0.97      1718
 Бытовая электрони

In [39]:
# Overall accuracy of the LightGBM model on the held-out part.
accuracy_score(y_true=y_test, y_pred=np.argmax(predicted2a, axis=1))

0.88292613172086942

---

#### Сделаем предсказание на тестовых данных, обучившись на всей выборке

In [None]:
# Dataset built from ALL labelled rows for the final model.
# `max_bin` is passed through `params`: it is no longer accepted as a direct
# constructor keyword by current LightGBM releases. The stacked matrix is
# converted to CSR explicitly instead of relying on an implicit conversion.
dtrain_hs = lgb.Dataset(train_hs.tocsr(), label=train.category_id, params={'max_bin': 2})

In [None]:
# Retrain with the same parameters on all labelled data and predict class
# probabilities for the unlabelled test set (one column per class).
# `verbose_eval` is dropped: it was removed from lgb.train in LightGBM 4.0 and
# had no effect here anyway because no validation sets are passed.
model = lgb.train(best_params, dtrain_hs, num_boost_round=153)
# Convert to CSR explicitly rather than relying on LightGBM's fallback
# conversion of other sparse formats.
predicted_hs = model.predict(test_hs.tocsr())

In [None]:
# Submission frame: item id plus the index of the most probable class.
# NOTE(review): this assumes the class index equals category_id - confirm.
output = test[['item_id']].assign(category_id=predicted_hs.argmax(axis=1))

output.to_csv('test_scoring.csv', index=False, encoding='utf-8')