In [398]:
import pandas as pd
import numpy as np

In [399]:
# Read the raw train and test splits from the local data directory.
train_df, test_df = map(
    pd.read_csv,
    ('data/HeadHunter_train.csv', 'data/HeadHunter_test.csv'),
)

In [400]:
# Write copies of both splits without the raw free-text review columns.
# The index is written too, because `to_csv` is called without `index=False`.
train_df.drop(columns=['positive', 'negative']).to_csv('data/clean_train.csv')
test_df.drop(columns=['positive', 'negative']).to_csv('data/clean_test.csv')

In [476]:
# Bucket each row into a category: values found in `popular_cities` are kept as-is,
# everything else collapses into the catch-all label 'Другое'.
# NOTE(review): `popular_cities` is not defined in the visible part of the notebook —
# confirm it is created before this cell when running on a fresh kernel.
# NOTE(review): the feature is called `city_cat` but is derived from the `position`
# column — confirm this should not read a city column instead.
train_df['city_cat'] = train_df.position.apply(lambda text: text if text in popular_cities else 'Другое')
test_df['city_cat'] = test_df.position.apply(lambda text: text if text in popular_cities else 'Другое')

In [477]:
# Stack every review text into one Series so it can be normalised in a single pass.
# Order matters: train positive, train negative, test positive, test negative —
# later cells slice the normalised result back apart by position.
# `pd.concat` replaces `Series.append`, which was deprecated in pandas 1.4 and removed in 2.0.
all_sentences = pd.concat(
    [
        train_df['positive'],
        train_df['negative'],
        test_df['positive'],
        test_df['negative'],
    ]
).fillna('None')  # missing reviews become the literal string 'None'

In [None]:
# Normalise all review texts at once so train and test share the same preprocessing.
# NOTE(review): `normalize_sentences` is imported from the local `normalizer` module in a
# cell further down the notebook; on a fresh kernel that import has to run first.
sentences = normalize_sentences(all_sentences)

In [405]:
# Keep only tokens longer than three characters and glue each text back into one string.
# Caution: this rebinds `sentences`; running the cell a second time would iterate over the
# characters of the already-joined strings and drop all of them.
sentences = [
    " ".join(filter(lambda word: len(word) > 3, text))
    for text in sentences
]

In [406]:
# Slice the flat list back into the four columns it was built from:
# [train positive | train negative | test positive | test negative].
train_df['normal_positive'] = sentences[:len(train_df)]
train_df['normal_negative'] = sentences[len(train_df):2 * len(train_df)]
test_df['normal_positive'] = sentences[2 * len(train_df):2 * len(train_df) + len(test_df)]
test_df['normal_negative'] = sentences[2 * len(train_df) + len(test_df):]

In [407]:
# Turn the comma-separated `target` string of each row into a list of labels.
train_df['target'] = train_df['target'].str.split(',')

In [408]:
# Expand the label lists so every (row, label) pair becomes its own row;
# rows that carried several labels are therefore repeated.
train_df = train_df.explode('target')

In [409]:
# Quick size check of the training frame after exploding the labels.
train_df.shape

(53753, 14)

In [527]:
# Normalise the job-position strings of both splits in one pass (train rows first, then test).
# `pd.concat` replaces `DataFrame.append`, which was deprecated in pandas 1.4 and removed in 2.0;
# only the `position` columns are concatenated instead of the full frames.
all_positions = pd.concat([train_df['position'], test_df['position']])
# Missing positions become the literal string 'None' before normalisation.
clear_positions = normalize_sentences(all_positions.fillna('None'))
# Each normalised entry is joined back into a single space-separated string.
clear_positions = [" ".join(text) for text in clear_positions]

100%|████████████████████████████████| 104404/104404 [00:02<00:00, 46050.12it/s]
100%|█████████████████████████████████████| 8810/8810 [00:02<00:00, 3598.95it/s]
100%|███████████████████████████████| 104404/104404 [00:00<00:00, 662335.99it/s]


In [529]:
# `clear_positions` is ordered train-first, so the first len(train_df) entries
# belong to the training frame and the remainder to the test frame.
train_df['normal_position'] = clear_positions[:len(train_df)]
test_df['normal_position'] = clear_positions[len(train_df):]

In [576]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, accuracy_score

In [577]:
class SelectColumnsTransformer:
    """Pipeline-style step that keeps only a configured subset of columns.

    Parameters
    ----------
    columns : list-like or None
        Column labels used to index the input in `transform`.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the instance itself."""
        return self

    def transform(self, X, **transform_params):
        """Return an independent copy of ``X[self.columns]``."""
        return X[self.columns].copy()

In [578]:
# Column roles used by the preprocessing pipeline.
target = 'target'
# Columns that must never be used as features.
exception = ['review_id', target]
# Numeric feature columns, detected by dtype. The previous version inspected only the first
# row of an undefined `df`, which fails on a fresh kernel and would misclassify a text column
# whose first value happens to be NaN.
real_columns = [
    col
    for col in train_df.columns
    if col not in exception and pd.api.types.is_numeric_dtype(train_df[col])
]
cat_columns = ['city_cat']
text_columns = ['normal_positive', 'normal_negative']

In [579]:
class PandasSimpleImputer(SimpleImputer):
    """`SimpleImputer` variant whose `transform` output is a labelled data frame.

    The column labels seen during `fit` are stored on the instance and re-attached to
    the imputed values, so later steps can still select columns by name.
    """

    def fit(self, X, y=None):
        """Remember the column labels of ``X``, then fit the underlying imputer."""
        self.columns = X.columns
        return super().fit(X, y)

    def transform(self, X):
        """Impute ``X`` and wrap the result in a DataFrame with the fitted column labels."""
        imputed_values = super().transform(X)
        # No index is passed, so the resulting frame gets a default index.
        return pd.DataFrame(imputed_values, columns=self.columns)

In [580]:
# Numeric features: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot encode.
# `handle_unknown='ignore'` keeps transform from failing on categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown='ignore')),
])

def get_text_transformer(transformer):
    """Build the text branch of the preprocessing pipeline.

    Missing values are first replaced with the string 'None'. Then `normal_position`
    is vectorised with a 1-2 gram TF-IDF capped at 1000 features, while every column
    listed in the module-level `text_columns` is passed through `transformer`.

    Parameters
    ----------
    transformer
        Vectoriser applied to each column in `text_columns`; the same object is
        handed to every one of those columns.

    Returns
    -------
    Pipeline
        Unfitted two-step pipeline: 'imputer' followed by 'transformer'.
    """
    position_entry = (
        'position',
        TfidfVectorizer(max_features=1000, ngram_range=(1, 2)),
        'normal_position',
    )
    review_entries = [(f"text_{col}", transformer, col) for col in text_columns]
    per_column_vectorizer = ColumnTransformer([position_entry, *review_entries])
    fill_missing = PandasSimpleImputer(strategy="constant", fill_value='None')

    return Pipeline(
        steps=[
            ("imputer", fill_missing),
            ('transformer', per_column_vectorizer),
        ],
    )

In [581]:
def pipe_with_text_transformer(text_transformer, random_state=42):
    """Assemble the full preprocessing + random-forest pipeline.

    Parameters
    ----------
    text_transformer
        Transformer applied to ``text_columns + ['normal_position']``
        (for example the result of `get_text_transformer`).
    random_state : int or None, default 42
        Seed forwarded to `RandomForestClassifier` so that repeated fits on the same
        data produce the same model and scores. Pass ``None`` for the previous
        unseeded behaviour.

    Returns
    -------
    Pipeline
        Unfitted pipeline with the steps 'transformer' (column-wise preprocessing)
        and 'model' (the classifier).
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, real_columns),
            ("cat", categorical_transformer, cat_columns),
            ("text", text_transformer, text_columns + ['normal_position']),
        ],
    )

    pipe = Pipeline(steps=[
        ('transformer', preprocessor),
        # Seeded so the reported metrics can be reproduced run to run.
        ('model', RandomForestClassifier(random_state=random_state)),
    ])

    return pipe

In [582]:
# Hold out 30% of the training rows for validation; the split is seeded.
# NOTE(review): `train_df` was exploded by label earlier, so copies of the same review
# can land on both sides of this row-level split — confirm that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns=target),
    train_df[target],
    test_size=0.3,
    random_state=42,
)

In [583]:
# TF-IDF variant: the review text columns are vectorised with a default `TfidfVectorizer`.
tfidf_text_transformer = get_text_transformer(TfidfVectorizer())
tfidf_pipe = pipe_with_text_transformer(tfidf_text_transformer)

In [584]:
# Fit preprocessing and classifier on the training part of the split.
tfidf_pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['salary_rating',
                                                   'team_rating',
                                                   'managment_rating',
                                                   'career_rating',
                                                   'workplace_rating',
                                                   'rest_recovery_rating']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
          

In [585]:
# Inspect the vocabulary learned by the TF-IDF vectoriser of the `normal_position` column.
# Steps are looked up by name ('text' branch -> inner 'position' vectoriser) rather than
# by their position in `transformers_`.
# NOTE(review): `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn releases — confirm against the installed version.
(
    tfidf_pipe.named_steps['transformer']
    .named_transformers_['text']
    .named_steps['transformer']
    .named_transformers_['position']
    .get_feature_names()
)



['абонент',
 'автокредитование',
 'автоматизация',
 'автоматизированный',
 'автоматика',
 'автомеханик',
 'автомобиль',
 'автор',
 'агент',
 'агент по',
 'агентский',
 'адаптация',
 'административно',
 'административный',
 'администратор',
 'администратор кассир',
 'администратор магазин',
 'администратор торговый',
 'администрирование',
 'азс',
 'актив',
 'активный',
 'активный продажа',
 'анализ',
 'аналитик',
 'аналитик по',
 'английский',
 'английский язык',
 'андеррайтер',
 'андеррайтинг',
 'аппаратчик',
 'аптека',
 'аренда',
 'архитектор',
 'асессор',
 'ассистент',
 'ассистент менеджер',
 'аудит',
 'аудитор',
 'база',
 'база данные',
 'банк',
 'банковский',
 'банковский продукт',
 'барист',
 'бармен',
 'без',
 'безопасность',
 'бизнес',
 'бизнес аналитик',
 'бизнес процесс',
 'бизнес тренер',
 'бортпроводник',
 'бренд',
 'бренд менеджер',
 'бригада',
 'бригада ресторан',
 'бригадир',
 'бурение',
 'бурильщик',
 'буровой',
 'бухгалтер',
 'бухгалтер кассир',
 'бухгалтер по',
 'быть'

In [586]:
# Score the TF-IDF pipeline on the held-out part of the split.
y_pred = tfidf_pipe.predict(X_test)

print('f1_score:', f1_score(y_test, y_pred, average='weighted')) # 0.72555 (presumably the score of an earlier run — TODO confirm)
print('accuracy_score:', accuracy_score(y_test, y_pred))

f1_score: 0.7134912986518479
accuracy_score: 0.7371325809252139


In [562]:
def sumbit(pipe, output_path='submittion.csv'):
    """Predict on the module-level `test_df` and write a submission CSV.

    The sample submission file is used as the template: its `target` column is
    overwritten with the predictions and the frame is saved without the index.

    Parameters
    ----------
    pipe
        Fitted estimator exposing ``predict``; it is called on `test_df`.
    output_path : str, default 'submittion.csv'
        Destination file. Pass a distinct path per model, otherwise every call
        overwrites the previous submission.

    Returns
    -------
    None
        The result of ``DataFrame.to_csv`` when writing to a path.
    """
    sub_df = pd.read_csv('data/HeadHunter_sample_submit.csv')
    submittions = pipe.predict(test_df)
    sub_df['target'] = submittions
    return sub_df.to_csv(output_path, index=False)

In [563]:
# Predict on the test set with the TF-IDF pipeline and write the submission CSV.
sumbit(tfidf_pipe)

In [148]:
import joblib

In [429]:
# joblib.dump(tfidf_pipe, 'models/tfidf_pipe_0.725.pickle')

['models/tfidf_pipe_0.725.pickle']

In [205]:
import gensim

In [206]:
import pymorphy2

In [207]:
# Morphological analyzer instance from pymorphy2.
# NOTE(review): `morph` is not referenced by any other visible cell — possibly a leftover.
morph = pymorphy2.MorphAnalyzer()

In [362]:
from nltk.tokenize import word_tokenize

In [363]:
import urllib.request
import gensim
from gensim.models import word2vec

import nltk.data
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, RegexpTokenizer 

import pandas as pd
from tqdm import tqdm 

import pymorphy2

tqdm.pandas()

In [364]:
from sklearn.base import BaseEstimator, TransformerMixin

In [365]:
from tqdm import tqdm

In [366]:
import importlib
import normalizer
normalizer = importlib.reload(normalizer)
from normalizer import normalize_sentences

In [367]:
class HHWord2VecTransformer(BaseEstimator, TransformerMixin):
    """Embed whitespace-separated texts as the mean of their word vectors.

    Parameters
    ----------
    model
        Trained embedding model exposing ``vector_size`` and a ``wv`` mapping that
        supports ``word in model.wv`` and ``model.wv[word]``.
    """

    def __init__(self, model):
        self.model = model
        # Embedding dimensionality, cached for allocating output arrays.
        self.emb_size = model.vector_size

    def text2vec(self, text):
        """Return the average vector of the known words in `text`.

        Words missing from the model vocabulary are skipped; a text without any
        known word yields an all-zero vector.
        """
        vector = np.zeros(self.emb_size)
        count = 0

        for word in text.split():
            if word in self.model.wv:
                vector += self.model.wv[word]
                count += 1

        # Guard against division by zero for texts with no known words.
        if count != 0:
            vector /= count

        return vector

    def fit(self, X, y=None):
        """Nothing to learn (the embedding model is already trained); return self."""
        return self

    def transform(self, X, y=None):
        """Return an array of shape ``(len(X), emb_size)`` with one row per text.

        `X` may be any sized iterable of strings; the row count is taken from
        ``len(X)`` throughout, so plain lists work as well as pandas/numpy inputs.
        """
        n_texts = len(X)
        vectors = np.zeros((n_texts, self.emb_size))

        for i, text in tqdm(enumerate(X), total=n_texts):
            vectors[i, :] = self.text2vec(text)

        return vectors

In [501]:
# Load a previously trained Word2Vec model from the working directory.
# NOTE(review): where this file comes from is not shown in the notebook — only load
# model files from a trusted source.
model = gensim.models.Word2Vec.load('hh_word2vec.pickle')

In [559]:
# Word2Vec variant: the review text columns are embedded with `HHWord2VecTransformer`
# built on the loaded `model`; the rest of the pipeline matches the TF-IDF variant.
word2vec_transformer = get_text_transformer(HHWord2VecTransformer(model))
word2vec_pipe = pipe_with_text_transformer(word2vec_transformer)

In [560]:
# Fit the Word2Vec-based pipeline on the training part of the split.
word2vec_pipe.fit(X_train, y_train)

100%|███████████████████████████████████| 37627/37627 [00:04<00:00, 8634.32it/s]
100%|██████████████████████████████████| 37627/37627 [00:03<00:00, 10823.15it/s]


Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['salary_rating',
                                                   'team_rating',
                                                   'managment_rating',
                                                   'career_rating',
                                                   'workplace_rating',
                                                   'rest_recovery_rating']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
          

In [561]:
# Score the Word2Vec pipeline on the held-out part of the split.
y_pred = word2vec_pipe.predict(X_test)

print('f1_score:', f1_score(y_test, y_pred, average='weighted')) # 0.729 (presumably the score of an earlier run — TODO confirm)
print('accuracy_score:', accuracy_score(y_test, y_pred))

100%|███████████████████████████████████| 16126/16126 [00:01<00:00, 8844.67it/s]
100%|██████████████████████████████████| 16126/16126 [00:01<00:00, 11574.01it/s]


f1_score: 0.7180137753359492
accuracy_score: 0.7386828723800074


In [377]:
# Predict on the test set with the Word2Vec pipeline and write the submission CSV.
sumbit(word2vec_pipe)

100%|██████████████████████████████████| 50651/50651 [00:04<00:00, 11125.47it/s]
100%|██████████████████████████████████| 50651/50651 [00:04<00:00, 12274.51it/s]


In [430]:
# joblib.dump(word2vec_pipe, 'models/word2vec_pipe_0.729.pickle')

['models/word2vec_pipe_0.729.pickle']