In [19]:
import json
import re
from pprint import pprint

import numpy as np
import pandas as pd
from IPython.core.display import HTML, display
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.metrics import confusion_matrix
import spacy

nlp = spacy.load('ru_core_news_lg')



  from IPython.core.display import HTML, display


In [20]:


class FullDescriptionCreator(BaseEstimator, TransformerMixin):
    patt = re.compile("[^\s\w]")

    def __init__(self, responsibilities):
        self.responsibilities = responsibilities

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X["responsibilities"] = self.responsibilities
        X["full_description"] = (
            X["name"] + " " + X["responsibilities"].fillna("")
        ).map(str.lower)
        X.loc[:, "full_description"] = X["full_description"].str.replace(
            self.patt, " ", regex=True
        )
        return X


train = pd.read_csv("./train.csv", index_col="index")
print(f"{train.shape}")
print("Типы столбцов: ")
display(train.dtypes)
print("Фрагмент данных: ")
display(train.head())


(30000, 3)
Типы столбцов: 


name           object
description    object
target          int64
dtype: object

Фрагмент данных: 


Unnamed: 0_level_0,name,description,target
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
324865089,продавец кассир,<strong>Обязанности:</strong> <ul> <li>работа ...,-1
169467135,продавец мила (шевченко 17),<p><strong>Магазин МИЛА по адресу б-р Шевченко...,-1
169939030,кассир в пиццерию г витебск,"<p><strong>Устал искать работу? Может, хочешь ...",5223
31956044,начальник строительного участка,<p>Компании &quot;Артель-Строй&quot; на постоя...,-1
36781653,продавец кассир (санкт петербург бухарестская 31),<p>Обязанности:</p><p> •Обслуживание покупа...,-1


In [21]:

train = train.query("target != -1")
print(f"{train.shape=}")

train.at[169939030, "description"]

with open(
    "./vacancy_descriptions/2_parsed.json", "r", encoding="utf8"
) as fp:
    descriptions = json.load(fp)
    

responsibilities = pd.Series({
    description["ID"]: r[0]
    if (r := description["Content"].get("Обязанности")) is not None
    else None
    for description in descriptions
}, name="responsibilities")


train["responsibilities"] = responsibilities

train["full_description"] = (
    train["name"] + " " + train["responsibilities"].fillna("")
).map(str.lower)

patt = re.compile("[^\w\s]")
train.loc[:, "full_description"] = train["full_description"].str.replace(
    patt, " ", regex=True
)

X_train_raw, y_train = train["full_description"], train["target"]

X_train_raw


docs = [nlp(text) for text in X_train_raw]
train_x_word_vectors = [x.vector for x in docs]

train_x_word_vectors[0]


train.shape=(15000, 3)


array([ 0.03560386, -0.21459208, -0.12009188,  0.15853915,  0.06140671,
        0.09399105,  0.07320838,  0.01222286,  0.19687626, -0.01116902,
       -0.03156796, -0.04528657, -0.00573984,  0.0190233 ,  0.05508011,
        0.10389701, -0.00242258, -0.0010704 , -0.04090424,  0.20696554,
       -0.06228608, -0.24492645,  0.05713023,  0.05338564,  0.06297158,
       -0.04240432,  0.06613895,  0.16513336, -0.07516245,  0.10809041,
       -0.01036515, -0.15860757,  0.53712296, -0.00108516,  0.00130198,
        0.04367797,  0.12001096,  0.17025046, -0.16244456, -0.19835089,
       -0.05467543,  0.05016365,  0.06995301, -0.0618767 , -0.05789093,
        0.03491769, -0.13473155,  0.03035766,  0.03601493,  0.03541508,
       -0.07193881, -0.06211124, -0.07721085,  0.20575714, -0.15518436,
        0.0675396 ,  0.059786  , -0.00130114,  0.06848715, -0.11823181,
       -0.00220902, -0.12369963, -0.13709602,  0.04950109, -0.13065036,
        0.20665541, -0.05367049, -0.05710496,  0.07028902, -0.09

In [22]:
clf_svm_wv = svm.SVC(kernel='linear')
clf_svm_wv.fit(train_x_word_vectors, y_train)

In [23]:
print(clf_svm_wv.predict(train_x_word_vectors[4].reshape(1,-1)))
y_train

[6121]


index
169939030    5223
169293782    5223
291073919    7212
179260831    5223
39608878     6121
             ... 
110663394    3341
30812168     3341
50002587     3341
183546781    3341
39324585     6121
Name: target, Length: 15000, dtype: int64

In [24]:
X_test_raw = pd.read_csv("./test.csv")
train = X_test_raw
print(f"{train.shape=}")
    

responsibilities = pd.Series({
    description["ID"]: r[0]
    if (r := description["Content"].get("Обязанности")) is not None
    else None
    for description in descriptions
}, name="responsibilities")


train["responsibilities"] = responsibilities

train["full_description"] = (
    train["name"] + " " + train["responsibilities"].fillna("")
).map(str.lower)

patt = re.compile("[^\w\s]")
train.loc[:, "full_description"] = train["full_description"].str.replace(
    patt, " ", regex=True
)

X_train = train["full_description"]


docs = [nlp(text) for text in X_train]
train_x_word_vectors = [x.vector for x in docs]



train.shape=(6973, 3)


In [25]:
y_pred = clf_svm_wv.predict(train_x_word_vectors)



submission = X_test_raw[["index"]].assign(target=y_pred)
display(submission.head(4))
submission.to_csv("demo_submission.csv", index=False)
submission.shape


Unnamed: 0,index,target
0,28357560,5223
1,114041896,2433
2,78645675,2433
3,34898406,9333


(6973, 2)

In [None]:

tn, fp, fn, tp = confusion_matrix(y_test, predicted_y).ravel()
precision_score = tp / (tp + fp)
recall_score = tp / (tp + fn)