## Описание данных:
* train.csv - данные для обучения
* test.csv - данные для подготовки самбита и проверки
* sampleSubmission.csv - пример корректного но бесполезного сабмита
* other.csv - необязательные данные для доп.статистик и прочих извращений (например обучение word2vec-а)

## Описание полей:
* id - внутренний идетификатор
* name - название вакансии
* description - текст вакансии
* target - класс заинтересованности

# Imports

In [2]:
import re

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from pymystem3 import Mystem

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [20]:
from sklearn.metrics import classification_report, make_scorer, roc_auc_score

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Data analysis

In [5]:
train = pd.read_csv('train.csv', sep='\t', index_col='id')

test = pd.read_csv('test.csv', sep='\t', index_col='id')

In [6]:
train.head()

Unnamed: 0_level_0,name,description,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Заведующий отделом/секцией в магазин YORK (Уру...,<p><strong>В НОВЫЙ МАГАЗИН YORK (хозтовары) пр...,1
1,Наладчик станков и манипуляторов с ПУ,Обязанности:работа на токарных станках с ЧПУ T...,0
2,Разработчик С++ (Криптограф),<strong>Требования:</strong> <ul> <li>Опыт про...,0
3,Фрезеровщик,<p>Условия:</p> <ul> <li>На работу вахтовым ме...,0
4,Мерчендайзер/продавец-консультант,<p><strong>Компания Палладиум Стандарт - призн...,1


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 199999
Data columns (total 3 columns):
name           200000 non-null object
description    200000 non-null object
target         200000 non-null int64
dtypes: int64(1), object(2)
memory usage: 6.1+ MB


In [8]:
train['target'].value_counts()

0    106436
1     93564
Name: target, dtype: int64

In [9]:
test.head()

Unnamed: 0_level_0,name,description
id,Unnamed: 1_level_1,Unnamed: 2_level_1
200000,Дизайнер-консультант мебели,<p><strong>Обязанности:</strong></p> <ul> <li>...
200001,Продавец-консультант (ТЦ на Пушкина),<p><strong>Обязанности</strong>:</p> <p>∙ конс...
200002,Менеджер по продажам,<p>Торговый Дом «Форт» это ведущая компания Пе...
200003,Продавец-консультант в магазин одежды (ТЦ Волн...,<p><strong>Требуются продавцы консультанты в м...
200004,Специалист по охране труда,<strong>Обязанности:</strong> <ul> <li> <p>осу...


In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170179 entries, 200000 to 370178
Data columns (total 2 columns):
name           170179 non-null object
description    170179 non-null object
dtypes: object(2)
memory usage: 3.9+ MB


# Preprocessing

In [11]:
def preprocessing(df, target_label=None):
    df['description'] = df['description'].map(lambda t: BeautifulSoup(t, 'html.parser').text)
    df['text'] = df['name'] + ' ' + df['description']
    
    df = df.drop(['description', 'name'], axis=1)
    
    if target_label is not None:
        y = df[target_label]
        df = df.drop(target_label, axis=1)
        
        return df['text'], y
    
    return df['text']

In [12]:
%%time

X_train, y_train = train.pipe(preprocessing, target_label='target')

X_test = test.pipe(preprocessing)

CPU times: user 6min 5s, sys: 1.16 s, total: 6min 6s
Wall time: 6min 7s


In [13]:
X_train.head()

id
0    Заведующий отделом/секцией в магазин YORK (Уру...
1    Наладчик станков и манипуляторов с ПУ Обязанно...
2    Разработчик С++ (Криптограф) Требования:  Опыт...
3    Фрезеровщик Условия:  На работу вахтовым метод...
4    Мерчендайзер/продавец-консультант Компания Пал...
Name: text, dtype: object

In [14]:
y_train.head()

id
0    1
1    0
2    0
3    0
4    1
Name: target, dtype: int64

In [15]:
X_test.head()

id
200000    Дизайнер-консультант мебели Обязанности:  Рабо...
200001    Продавец-консультант (ТЦ на Пушкина) Обязаннос...
200002    Менеджер по продажам Торговый Дом «Форт» это в...
200003    Продавец-консультант в магазин одежды (ТЦ Волн...
200004    Специалист по охране труда Обязанности:   осущ...
Name: text, dtype: object

# Machine learning

In [24]:
def prediction_to_csv(y_hat):
    from datetime import datetime
    
    df_predict = pd.DataFrame()
    
    df_predict['id'] = X_test.index
    df_predict['target'] = y_hat
    
    today = datetime.today().strftime('%Y-%m-%d_%H-%M')
    df_predict.to_csv(path_or_buf='./submission/submission_{today}.cvs'.format(today=today), index=False, sep=',')

In [25]:
mystem = Mystem()

def tokenizer(text):
    lemm = mystem.lemmatize(text)
    
    return [x for x in lemm if re.match(r'[а-яёa-z]{2,}', x)]

In [26]:
def print_grid_info(grid):
    print('-'  * 10, 'BEST PARAMS', '-' * 10)
    print(grid.best_params_)

    mean_scores = grid.cv_results_['mean_test_score']
    std_scores = grid.cv_results_['std_test_score']

    print('-'  * 10, 'ROC_AUC SCORES', '-' * 10)
    print(mean_scores)
    print('-'  * 10, 'STD SCORES', '-' * 10)
    print(std_scores)
    
    print('-'  * 10, 'MEAN OF ROC_AUC AND STD SCORES', '-' * 10)
    print(mean_scores.mean(), mean_scores.std())
    
    print('-'  * 10, 'BEST USING', '-' * 10)
    print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
    
    print('-'  * 10, 'CV RESULTS', '-' * 10)
    print(grid.cv_results_)

In [27]:
# cv = CountVectorizer(tokenizer=tokenizer, analyzer='word')

# %%time

# z = cv.fit_transform(X['text'])

# cv.vocabulary_

In [28]:
cv = StratifiedKFold(n_splits=5, shuffle=True)

## LogisticRegression

In [97]:
pipe_lr = Pipeline([
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer(norm=None, smooth_idf=False)),
    ('lr', LogisticRegression(class_weight='balanced', tol=1e-5))
])

In [98]:
param_grid_lr = {
#     'lr__C': [0.01, 0.1, 1, 10, 100],
    'lr__C': np.linspace(0.002, 0.003, 5),
    'lr__penalty': ['l2', 'l1'],
    'lr__solver': ['liblinear']
}

In [99]:
lr_grid = GridSearchCV(pipe_lr, param_grid=param_grid_lr, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=30)

In [100]:
%%time

lr_grid.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] lr__penalty=l2, lr__solver=liblinear ............................
[CV] lr__penalty=l2, lr__solver=liblinear ............................
[CV] lr__penalty=l2, lr__solver=liblinear ............................
[CV] lr__penalty=l2, lr__solver=liblinear ............................
[CV] lr__penalty=l2, lr__solver=liblinear ............................
[CV] lr__penalty=l1, lr__solver=liblinear ............................
[CV] lr__penalty=l1, lr__solver=liblinear ............................
[CV] lr__penalty=l1, lr__solver=liblinear ............................
[CV]  lr__penalty=l2, lr__solver=liblinear, score=0.9904725068592326, total= 1.3min


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.2min


[CV] lr__penalty=l1, lr__solver=liblinear ............................
[CV]  lr__penalty=l2, lr__solver=liblinear, score=0.9908278450672655, total= 1.4min


[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:  2.3min remaining:  9.3min


[CV]  lr__penalty=l1, lr__solver=liblinear, score=0.982486367988637, total= 1.2min


[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  2.4min remaining:  5.5min


[CV] lr__penalty=l1, lr__solver=liblinear ............................
[CV]  lr__penalty=l1, lr__solver=liblinear, score=0.9833260544838484, total= 1.2min


[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  2.5min remaining:  3.7min


[CV]  lr__penalty=l2, lr__solver=liblinear, score=0.9913886586026526, total= 1.6min


[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  2.5min remaining:  2.5min


[CV]  lr__penalty=l1, lr__solver=liblinear, score=0.9842141909380746, total= 1.2min


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  2.6min remaining:  1.7min


[CV]  lr__penalty=l2, lr__solver=liblinear, score=0.9919544276584656, total= 1.6min


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  2.6min remaining:  1.1min


[CV]  lr__penalty=l2, lr__solver=liblinear, score=0.9908637374859391, total= 1.6min


[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:  2.6min remaining:   39.3s


[CV]  lr__penalty=l1, lr__solver=liblinear, score=0.9849134716051229, total=  39.7s
[CV]  lr__penalty=l1, lr__solver=liblinear, score=0.9836803204793351, total=  35.7s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.4min finished


CPU times: user 52.6 s, sys: 10.2 s, total: 1min 2s
Wall time: 4min 13s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_a...lty='l2', random_state=None,
          solver='liblinear', tol=1e-05, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'lr__penalty': ['l2', 'l1'], 'lr__solver': ['liblinear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=30)

In [101]:
print_grid_info(lr_grid)

---------- BEST PARAMS ----------
{'lr__penalty': 'l2', 'lr__solver': 'liblinear'}
---------- ROC_AUC SCORES ----------
[0.99110143 0.98372408]
---------- STD SCORES ----------
[0.00051711 0.00081819]
---------- MEAN OF ROC_AUC AND STD SCORES ----------
0.9874127541538995 0.0036886790246584145
---------- BEST USING ----------
Best: 0.991101 using {'lr__penalty': 'l2', 'lr__solver': 'liblinear'}
---------- CV RESULTS ----------
{'mean_fit_time': array([78.36156368, 49.9887898 ]), 'std_fit_time': array([ 9.10936183, 14.85854341]), 'mean_score_time': array([12.15409956,  9.47857294]), 'std_score_time': array([2.03680191, 3.3923796 ]), 'param_lr__penalty': masked_array(data=['l2', 'l1'],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_lr__solver': masked_array(data=['liblinear', 'liblinear'],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'lr__penalty': 'l2', 'lr__solver': 'liblinear'}, {'lr__penalt

In [86]:
estimator = lr_grid.best_estimator_

In [87]:
y_hat = estimator.predict(X_test)

In [88]:
prediction_to_csv(y_hat)

{'lr__C': 0.0022500000000000003, 'lr__solver': 'liblinear'}

## RandomForestClassifier

In [133]:
pipe_rfc = Pipeline([
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer(norm=None, smooth_idf=False)),
    ('rfc', RandomForestClassifier())
])

In [134]:
linspace = np.linspace(5, 50, 5, dtype=np.int)

param_grid_rfc = {
    'rfc__max_depth': np.append(linspace, None),
    'rfc__n_estimators': [10, 100],
    'rfc__criterion': ['gini', 'entropy']
#     'rfc__bootstrap': [True, False]
}

In [135]:
rfc_grid = GridSearchCV(pipe_rfc, param_grid=param_grid_rfc, scoring='roc_auc', cv=cv, n_jobs=-1)

In [None]:
%%time

rfc_grid.fit(X_train, y_train)

In [None]:
print_grid_info(rfc_grid)

In [None]:
%%time

cvs = cross_val_score(pipe_rfc, X_train, y_train, cv=cv, n_jobs=-1)

In [121]:
cvs

array([0.9558261 , 0.95295   , 0.9523    , 0.954575  , 0.95337383])

## MultinominalNB

In [32]:
pipe_mnb = Pipeline([
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer(norm=None, smooth_idf=False)),
    ('mnb', MultinomialNB())
])

In [41]:
param_grid_mnb = {
    'mnb__alpha': [0.001, 1000]
}

In [42]:
mnb_grid = GridSearchCV(pipe_mnb, param_grid=param_grid_mnb, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=30)

In [43]:
%%time

mnb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] mnb__alpha=0.001 ................................................
[CV] mnb__alpha=0.001 ................................................
[CV] mnb__alpha=0.001 ................................................
[CV] mnb__alpha=0.001 ................................................
[CV] mnb__alpha=0.001 ................................................
[CV] mnb__alpha=1000 .................................................
[CV] mnb__alpha=1000 .................................................
[CV] mnb__alpha=1000 .................................................
[CV] ....... mnb__alpha=0.001, score=0.9540611235584053, total=  39.2s


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.4min


[CV] mnb__alpha=1000 .................................................
[CV] ........ mnb__alpha=0.001, score=0.952329240328685, total=  41.9s


[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:  1.5min remaining:  6.1min


[CV] ....... mnb__alpha=0.001, score=0.9527873084030808, total=  44.0s


[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  1.6min remaining:  3.8min


[CV] mnb__alpha=1000 .................................................
[CV] ....... mnb__alpha=0.001, score=0.9536405214923596, total=  46.3s


[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  1.7min remaining:  2.5min


[CV] ....... mnb__alpha=0.001, score=0.9545038226627829, total=  47.5s


[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  1.8min remaining:  1.8min


[CV] ........ mnb__alpha=1000, score=0.9583594100952475, total=  49.4s


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  1.8min remaining:  1.2min


[CV] ........ mnb__alpha=1000, score=0.9573750835243052, total=  50.2s


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  1.9min remaining:   48.6s


[CV] ........ mnb__alpha=1000, score=0.9584505770596845, total=  50.4s


[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:  1.9min remaining:   28.9s


[CV] ........ mnb__alpha=1000, score=0.9597586135373657, total=  34.4s
[CV] ........ mnb__alpha=1000, score=0.9592796757592891, total=  30.9s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.5min finished


CPU times: user 30.7 s, sys: 6.9 s, total: 37.6 s
Wall time: 2min 56s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_a...False,
         use_idf=True)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'mnb__alpha': [0.001, 1000]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc',
       verbose=30)

In [44]:
print_grid_info(mnb_grid)

---------- BEST PARAMS ----------
{'mnb__alpha': 1000}
---------- ROC_AUC SCORES ----------
[0.9534644  0.95864467]
---------- STD SCORES ----------
[0.00080181 0.00082158]
---------- MEAN OF ROC_AUC AND STD SCORES ----------
0.9560545342347084 0.0025901331591415278
---------- BEST USING ----------
Best: 0.958645 using {'mnb__alpha': 1000}
---------- CV RESULTS ----------
{'mean_fit_time': array([33.57744107, 34.65010719]), 'std_fit_time': array([2.91857507, 6.32432709]), 'mean_score_time': array([10.22018523,  8.41764712]), 'std_score_time': array([0.12901067, 2.2704477 ]), 'param_mnb__alpha': masked_array(data=[0.001, 1000],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'mnb__alpha': 0.001}, {'mnb__alpha': 1000}], 'split0_test_score': array([0.95406112, 0.95835941]), 'split1_test_score': array([0.95232924, 0.95737508]), 'split2_test_score': array([0.95278731, 0.95845058]), 'split3_test_score': array([0.95364052, 0.95975861]), 'split4_

In [30]:
cvs = cross_val_score(pipe_mnb, X_train, y_train, cv=cv, n_jobs=-1)

In [31]:
cvs

array([0.93977651, 0.937725  , 0.93965   , 0.939275  , 0.93907348])

## Тесты (УДАЛИТЬ)

In [None]:
estimator = grid.best_estimator_

print(estimator)

In [None]:
# grid.get_params()

In [None]:
# grid.best_params_

In [None]:
# %%time

# cv = StratifiedKFold(n_splits=5, shuffle=True)
# scores = cross_val_score(lr, X['text'], y, cv=cv, scoring=make_scorer(roc_auc_score))

In [None]:
# scores.mean(), scores.std()

In [None]:
# scores

In [None]:
# %%time

# cv = StratifiedKFold(n_splits=5, shuffle=True)
# scores = cross_val_score(crf, X['text'], y, cv=cv, scoring=make_scorer(roc_auc_score))

In [None]:
# scores.mean(), scores.std()

In [None]:
!telegram-send "Preprocessing completed!"

In [109]:
!telegram-send "LogisticRegression fit completed!"

[0m

In [None]:
!telegram-send "RandomForestClassifier fit completed!"