In [4]:
import pandas as pd
import numpy as np

In [5]:
train_df = pd.read_csv('data/HeadHunter_train.csv', index_col=0)
test_df = pd.read_csv('data/HeadHunter_test.csv', index_col=0)

In [6]:
popular_cities = train_df.append(test_df).city.value_counts()[:50].index

In [7]:
train_df['city_cat'] = train_df.position.apply(lambda text: text if text in popular_cities else 'Другое')
test_df['city_cat'] = test_df.position.apply(lambda text: text if text in popular_cities else 'Другое')

In [9]:
all_sentences = (
    train_df['positive']
    .append(train_df['negative'])
    .append(test_df['positive'])
    .append(test_df['negative'])
    .fillna('None')
)

In [10]:
from normalizer import normalize_sentences

In [15]:
sentences = normalize_sentences(all_sentences)

100%|████████████████████████████████| 203054/203054 [00:15<00:00, 13529.43it/s]
100%|███████████████████████████████████| 92678/92678 [00:21<00:00, 4318.04it/s]
100%|███████████████████████████████| 203054/203054 [00:01<00:00, 165223.99it/s]


In [16]:
sentences = [" ".join([word for word in text if len(word) > 3]) for text in sentences]

In [17]:
train_df['normal_positive'] = sentences[:train_df.shape[0]]
train_df['normal_negative'] = sentences[train_df.shape[0]:2*train_df.shape[0]]
test_df['normal_positive'] = sentences[2*train_df.shape[0]:2*train_df.shape[0]+test_df.shape[0]]
test_df['normal_negative'] = sentences[2*train_df.shape[0]+test_df.shape[0]:]

In [18]:
train_df['target'] = train_df['target'].str.split(',')

In [19]:
train_df = train_df.explode('target')

In [20]:
train_df.shape

(53753, 14)

In [21]:
all_positions = train_df.append(test_df)['position']
clear_positions = normalize_sentences(all_positions.fillna('None'))
clear_positions = [" ".join(text) for text in clear_positions]

100%|████████████████████████████████| 104404/104404 [00:01<00:00, 55191.00it/s]
100%|███████████████████████████████████| 10096/10096 [00:02<00:00, 4139.59it/s]
100%|███████████████████████████████| 104404/104404 [00:00<00:00, 400951.25it/s]


In [22]:
train_df['normal_position'] = clear_positions[:train_df.shape[0]]
test_df['normal_position'] = clear_positions[train_df.shape[0]:]

In [23]:
train_df.drop(['positive', 'negative', 'position'], axis=1).to_csv('data/clean_train.csv', index=False)
test_df.drop(['positive', 'negative', 'position'], axis=1).to_csv('data/clean_test.csv', index=False)

In [15]:
train_df = pd.read_csv('data/clean_train.csv')
test_df = pd.read_csv('data/clean_test.csv')

In [30]:
train_df['positive_len'] = train_df.normal_positive.str.len()
train_df['negative_len'] = train_df.normal_negative.str.len()
test_df['positive_len'] = test_df.normal_positive.str.len()
test_df['negative_len'] = test_df.normal_negative.str.len()
train_df['position_len'] = train_df.normal_position.str.len()
test_df['position_len'] = test_df.normal_position.str.len()

In [58]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, accuracy_score

In [32]:
class SelectColumnsTransformer():
    def __init__(self, columns=None):
        self.columns = columns

    def transform(self, X, **transform_params):
        cpy_df = X[self.columns].copy()
        return cpy_df

    def fit(self, X, y=None, **fit_params):
        return self

In [135]:
target = 'target'
exception = ['review_id', target]
real_columns = [col for col in train_df.columns if col not in exception and not isinstance(train_df.iloc[0][col], str)]
cat_columns = ['city']
text_columns = ['normal_positive', 'normal_negative']

In [136]:
class PandasSimpleImputer(SimpleImputer):
    """A wrapper around `SimpleImputer` to return data frames with columns.
    """

    def fit(self, X, y=None):
        self.columns = X.columns
        return super().fit(X, y)

    def transform(self, X):
        return pd.DataFrame(super().transform(X), columns=self.columns)

In [137]:
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")), 
        ("poly",  PolynomialFeatures(2)),
        ("scaler", StandardScaler())
    ]
)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")), 
        ("encoder", OneHotEncoder(handle_unknown='ignore'))
    ]
)

def get_text_transformer(transformer):
    return Pipeline(
        steps=[
            ("imputer", PandasSimpleImputer(strategy="constant", fill_value='None')),
            ('transformer', ColumnTransformer([
                ('position', TfidfVectorizer(max_features=1000, ngram_range=(1, 2)), 'normal_position'),
                *[
                (f"text_{col}", transformer, col)
                for col in text_columns
            ]]))
        ],
    )

In [138]:
from sklearn.svm import SVC, NuSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier
import joblib

In [139]:
# Initializing Support Vector classifier
clf_svc = SVC(C = 50, degree = 1, gamma = "auto", kernel = "rbf", probability = True)

# Initializing Multi-layer perceptron  classifier
clf_mlp = MLPClassifier(activation = "relu", alpha = 0.1, hidden_layer_sizes = (10,10,10),
                            learning_rate = "constant", max_iter = 2000, random_state = 1000)

# Initializing Random Forest classifier
clf_rfc = RandomForestClassifier(n_estimators = 500, criterion = "gini", max_depth = 10,
                                     max_features = "auto", min_samples_leaf = 0.005,
                                     min_samples_split = 0.005, n_jobs = -1, random_state = 1000)

classifiers = [('svc', clf_svc),
               ('mlp', clf_mlp),                             
               ('rfc', clf_rfc)]

In [140]:
stacked_model = StackingClassifier(
    estimators=classifiers, 
    final_estimator=LogisticRegression(),
    stack_method='auto',
    n_jobs=-1,
    passthrough=False
)

In [141]:
def pipe_with_text_transformer(text_transformer, clf_model):
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, real_columns),
            ("cat", categorical_transformer, cat_columns),
            ("text", text_transformer, text_columns + ['normal_position']),
        ],
    )
    
    pipe = Pipeline(steps=[
        ('transformer', preprocessor),
        ('model', clf_model)
    ])
    
    return pipe

In [177]:
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(target, axis=1), 
    train_df[target], 
    test_size=0.3, 
    random_state=42
)
is_high_target = y_train.isin([0, 8, 1, 3, 6, 7, 5])
X_train = X_train[is_high_target]
y_train = y_train[is_high_target]

In [178]:
tfidf_text_transformer = get_text_transformer(TfidfVectorizer(max_features=1000))
tfidf_pipe = pipe_with_text_transformer(tfidf_text_transformer, stacked_model)

In [179]:
%%time
tfidf_pipe.fit(X_train, y_train)

CPU times: user 7.46 s, sys: 1.18 s, total: 8.64 s
Wall time: 46min 54s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('poly',
                                                                   PolynomialFeatures()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['salary_rating',
                                                   'team_rating',
                                                   'managment_rating',
                                                   'career_rating',
                                                   'workplace_rating',
                                                   'rest_

In [180]:
# get the position key words
# (
#     tfidf_pipe
#     .named_steps['transformer']
#     .transformers_[2][1]
#     .named_steps['transformer']
#     .transformers_[0][1]
#     .get_feature_names()
# )

In [181]:
def test_pipe(pipe):
    y_pred = pipe.predict(X_test)
    
    print('f1_score:', f1_score(y_test, y_pred, average='weighted'))
    print('accuracy:', accuracy_score(y_test, y_pred))

In [182]:
def sumbit(pipe):
    sub_df = pd.read_csv('data/HeadHunter_sample_submit.csv')
    submittions = pipe.predict(test_df)
    sub_df['target'] = submittions
    return sub_df.to_csv('submittion.csv', index=False)

In [183]:
test_pipe(tfidf_pipe)  # 0.7587

f1_score: 0.7611512647683535
accuracy: 0.7887262805407417


In [57]:
# sumbit(tfidf_pipe)

In [87]:
# joblib.dump(tfidf_pipe, 'models/tfidf_pipe_0.7335.pickle')

NameError: name 'joblib' is not defined

In [43]:
import gensim
from gensim.models import word2vec

In [44]:
from sklearn.base import BaseEstimator, TransformerMixin

In [45]:
from tqdm import tqdm

In [46]:
class HHWord2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model
        self.emb_size = model.vector_size
    
    def text2vec(self, text):
        vector = np.array([.0 for _ in range(self.emb_size)])
        count = 0

        for word in text.split():
            if word in self.model.wv:
                vector += self.model.wv[word]
                count += 1

        if count != 0:
            vector /= count

        return vector

    def fit(self,X, y=None):
        return self
    
    def transform(self, X, y=None): 
        vectors = np.zeros((X.shape[0], self.emb_size))
        
        for i, text in tqdm(enumerate(X), total=len(X)):
            vectors[i, :] = self.text2vec(text)
            
        return vectors

In [51]:
model = gensim.models.Word2Vec.load('hh_word2vec.pickle')

In [52]:
word2vec_transformer = get_text_transformer(HHWord2VecTransformer(model))
word2vec_pipe = pipe_with_text_transformer(word2vec_transformer)

In [53]:
%%time
word2vec_pipe.fit(X_train, y_train);

100%|███████████████████████████████████| 37627/37627 [00:04<00:00, 8445.65it/s]
100%|██████████████████████████████████| 37627/37627 [00:03<00:00, 10438.37it/s]


CPU times: user 2min 5s, sys: 1.17 s, total: 2min 6s
Wall time: 2min 6s


Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['salary_rating',
                                                   'team_rating',
                                                   'managment_rating',
                                                   'career_rating',
                                                   'workplace_rating',
                                                   'rest_recovery_rating',
                                                   'positive_len',
                                                   'negative_len',
             

In [54]:
y_pred = word2vec_pipe.predict(X_test)

print('f1_score:', f1_score(y_test, y_pred, average='weighted')) # 0.729
print('accuracy_score:', accuracy_score(y_test, y_pred))

100%|███████████████████████████████████| 16126/16126 [00:01<00:00, 8495.76it/s]
100%|██████████████████████████████████| 16126/16126 [00:01<00:00, 10637.71it/s]


f1_score: 0.7286120187241485
accuracy_score: 0.7499689941709041


In [377]:
sumbit(word2vec_pipe)

100%|██████████████████████████████████| 50651/50651 [00:04<00:00, 11125.47it/s]
100%|██████████████████████████████████| 50651/50651 [00:04<00:00, 12274.51it/s]


In [430]:
# joblib.dump(word2vec_pipe, 'models/word2vec_pipe_0.729.pickle')

['models/word2vec_pipe_0.729.pickle']