# Вспомогательные функции

In [2]:
import pandas as pd

In [3]:
import pickle

def save_model(model, name):
  """Pickle ``model`` into the file located at ``name``.

  Args:
    model: Object to serialize; it must be picklable.
    name: Destination path. The file is opened in binary write mode, so
      existing content at that path is replaced.

  Returns:
    None.
  """
  with open(name, 'wb') as pickle_file:
    pickle.dump(model, pickle_file)

def load_model(name):
  """Read back an object that was pickled into the file at ``name``.

  Args:
    name: Path of the pickle file; opened in binary read mode.

  Returns:
    The unpickled object.
  """
  # Unpickling can execute code embedded in the file, so only point this
  # at files produced by a trusted source (e.g. this notebook's save_model).
  with open(name, 'rb') as pickle_file:
    return pickle.load(pickle_file)

In [4]:
def load_df(url):
  """Read a CSV file into a DataFrame, discarding the saved index column.

  A frame written by ``DataFrame.to_csv`` with its index carries that index
  as a header-less first column, which ``read_csv`` names ``'Unnamed: 0'``.
  That column is removed here when present.

  Args:
    url: Path or URL accepted by ``pd.read_csv``.

  Returns:
    The loaded DataFrame without the ``'Unnamed: 0'`` column.
  """
  df = pd.read_csv(url)
  # errors='ignore' keeps CSVs that were written without an index loadable
  # instead of failing with KeyError.
  return df.drop(columns=['Unnamed: 0'], errors='ignore')

def save_df(df, url):
  """Write ``df`` to a CSV file at ``url`` using pandas' default settings.

  With the defaults the row index is written too, as a first column with an
  empty header.

  Args:
    df: DataFrame to persist.
    url: Destination path handed to ``DataFrame.to_csv``.

  Returns:
    None.
  """
  df.to_csv(path_or_buf=url)

In [5]:
import re
def vacancy_to_text(vacancies_df):
  """Strip list-literal punctuation from every prepared description.

  Each entry of the ``'prepared_description'`` column is treated as text
  (in this notebook: the string form of nested token lists). Apostrophes,
  square brackets and commas are deleted; everything else, including the
  spaces between tokens, is kept.

  Args:
    vacancies_df: Mapping-like object (e.g. a DataFrame) whose
      ``'prepared_description'`` entry is an iterable of strings.

  Returns:
    A list with one cleaned string per description, in the original order.
  """
  # One character class removes the same four symbols: ' ] [ ,
  list_punctuation = r"[\[\]',]"
  return [
    re.sub(list_punctuation, '', description)
    for description in vacancies_df['prepared_description']
  ]

# Модели

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(vacancy_list):
  """Fit a count vectorizer on the texts and return the resulting matrix.

  The vectorizer uses unigrams and bigrams (``ngram_range=(1, 2)``) and
  ``min_df=3``. As side effects the fitted vectorizer is pickled to
  ``'../models/count_vectorizer'`` through ``save_model`` and the vocabulary
  size is printed.

  Args:
    vacancy_list: Iterable of document strings.

  Returns:
    The document-term matrix produced by ``fit_transform``.
  """
  vectorizer = CountVectorizer(min_df=3, ngram_range=(1, 2))
  term_counts = vectorizer.fit_transform(vacancy_list)
  save_model(vectorizer, '../models/count_vectorizer')
  vocabulary_size = len(vectorizer.vocabulary_)
  print(vocabulary_size)
  return term_counts

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf(vacancy_list):
  """Fit a TF-IDF vectorizer on the texts and return the weighted matrix.

  The vectorizer is created with ``sublinear_tf=True``. As side effects the
  fitted vectorizer is pickled to ``'../models/tfidf_vectorizer'`` through
  ``save_model`` and the vocabulary size is printed.

  Args:
    vacancy_list: Iterable of document strings.

  Returns:
    The document-term matrix produced by ``fit_transform``.
  """
  vectorizer = TfidfVectorizer(sublinear_tf=True)
  weights = vectorizer.fit_transform(vacancy_list)
  save_model(vectorizer, '../models/tfidf_vectorizer')
  vocabulary_size = len(vectorizer.vocabulary_)
  print(vocabulary_size)
  return weights

# main

In [12]:
# Load the table of vacancies with their preprocessed descriptions.
vacancies_df = load_df('../data/prepared_description_vacancies.csv')
# Bare expression: rendered as this cell's output.
# NOTE(review): consider vacancies_df.head() to keep the stored output small.
vacancies_df

Unnamed: 0,vacancy_id,raw_description,prepared_description
0,795348,"{'id': '795348', 'premium': False, 'billing_ty...","[['сфера', 'деятельность', 'розничный', 'торго..."
1,795347,"{'id': '795347', 'premium': False, 'billing_ty...","[['corporate', 'western', 'bank', 'initiates',..."
2,795353,"{'id': '795353', 'premium': False, 'billing_ty...","[['обязанность', 'осуществление', 'качественны..."
3,795350,"{'id': '795350', 'premium': False, 'billing_ty...","[['основной', 'обязанность', 'ведение', 'кадро..."
4,795352,"{'id': '795352', 'premium': False, 'billing_ty...","[['обязанность', 'приём', 'распределение', 'вх..."
...,...,...,...
10108,814105,"{'id': '814105', 'premium': False, 'billing_ty...","[['российский', 'компания'], ['м', 'речной', '..."
10109,814108,"{'id': '814108', 'premium': False, 'billing_ty...","[['требование', 'международный', 'группа', 'ко..."
10110,814113,"{'id': '814113', 'premium': False, 'billing_ty...","[['обязанность', 'продавец', 'консультант', 'м..."
10111,814119,"{'id': '814119', 'premium': False, 'billing_ty...","[['требование', 'профессиональный', 'знание', ..."


In [13]:
# Keep only the first 3000 rows (all columns) for the steps below.
# NOTE(review): 3000 is an unexplained constant — presumably a cap to keep the
# dense matrices built later manageable; confirm and move it to a named constant.
vacancies_df = vacancies_df.iloc[:3000, :]

In [23]:
# Turn the stringified token lists into plain text, then fit both vectorizers.
# Each of bag_of_words / tf_idf also pickles its fitted vectorizer and prints
# the vocabulary size (the two numbers in this cell's output).
# NOTE(review): `vacansy_list` is a misspelling of "vacancy"; left unchanged
# here because it is a notebook-global name.
vacansy_list = vacancy_to_text(vacancies_df)
bow = bag_of_words(vacansy_list)
tfidf = tf_idf(vacansy_list)

20554
13380


## bag_of_words

In [24]:
# Expand the count matrix to dense form, then to nested Python lists
# (one inner list per vacancy) so each row can be stored in a DataFrame cell.
# NOTE(review): this materializes rows x vocabulary values in memory.
matrix_bow = bow.todense()
matrix_bow_list = matrix_bow.tolist()

In [25]:
# Add a `count_vectorizer` column holding each vacancy's count vector as a list.
# `.values` hands assign a plain array, so rows are matched by position
# rather than by index label.
vacancies_df = vacancies_df.assign(count_vectorizer=pd.Series(matrix_bow_list).values)

In [26]:
# Display the frame to check the new `count_vectorizer` column.
vacancies_df

Unnamed: 0,vacancy_id,raw_description,prepared_description,count_vectorizer
0,795348,"{'id': '795348', 'premium': False, 'billing_ty...","[['сфера', 'деятельность', 'розничный', 'торго...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,795347,"{'id': '795347', 'premium': False, 'billing_ty...","[['corporate', 'western', 'bank', 'initiates',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,795353,"{'id': '795353', 'premium': False, 'billing_ty...","[['обязанность', 'осуществление', 'качественны...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,795350,"{'id': '795350', 'premium': False, 'billing_ty...","[['основной', 'обязанность', 'ведение', 'кадро...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,795352,"{'id': '795352', 'premium': False, 'billing_ty...","[['обязанность', 'приём', 'распределение', 'вх...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
2995,800919,"{'id': '800919', 'premium': False, 'billing_ty...","[['обязанность', 'руководство', 'проект', 'нач...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2996,800917,"{'id': '800917', 'premium': False, 'billing_ty...","[['обязанность', 'сопровождение', 'договор', '...","[2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2997,800920,"{'id': '800920', 'premium': False, 'billing_ty...","[['российский', 'косметический', 'компания', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2998,800921,"{'id': '800921', 'premium': False, 'billing_ty...","[['основной', 'обязанность', 'руководство', 'о...","[2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


## tf_idf

In [27]:
# Expand the TF-IDF matrix to dense form, then to nested Python lists
# (one inner list per vacancy) so each row can be stored in a DataFrame cell.
# NOTE(review): this materializes rows x vocabulary floats in memory.
matrix_tfidf = tfidf.todense()
matrix_tfidf_list = matrix_tfidf.tolist()

In [28]:
# Add a `tfidf_vectorizer` column holding each vacancy's TF-IDF vector as a list.
# `.values` hands assign a plain array, so rows are matched by position
# rather than by index label.
vacancies_df = vacancies_df.assign(tfidf_vectorizer=pd.Series(matrix_tfidf_list).values)

In [29]:
# Display the frame to check the new `tfidf_vectorizer` column.
vacancies_df

Unnamed: 0,vacancy_id,raw_description,prepared_description,count_vectorizer,tfidf_vectorizer
0,795348,"{'id': '795348', 'premium': False, 'billing_ty...","[['сфера', 'деятельность', 'розничный', 'торго...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,795347,"{'id': '795347', 'premium': False, 'billing_ty...","[['corporate', 'western', 'bank', 'initiates',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,795353,"{'id': '795353', 'premium': False, 'billing_ty...","[['обязанность', 'осуществление', 'качественны...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,795350,"{'id': '795350', 'premium': False, 'billing_ty...","[['основной', 'обязанность', 'ведение', 'кадро...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,795352,"{'id': '795352', 'premium': False, 'billing_ty...","[['обязанность', 'приём', 'распределение', 'вх...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
2995,800919,"{'id': '800919', 'premium': False, 'billing_ty...","[['обязанность', 'руководство', 'проект', 'нач...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2996,800917,"{'id': '800917', 'premium': False, 'billing_ty...","[['обязанность', 'сопровождение', 'договор', '...","[2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.1442695711441709, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2997,800920,"{'id': '800920', 'premium': False, 'billing_ty...","[['российский', 'косметический', 'компания', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2998,800921,"{'id': '800921', 'premium': False, 'billing_ty...","[['основной', 'обязанность', 'руководство', 'о...","[2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0.10681281732795717, 0.0, 0.0, 0.0, 0.0, 0.0,..."


## save

In [30]:
# Persist the frame together with both vector columns.
# NOTE(review): CSV stores the list-valued cells as text, so they will likely
# come back as strings when the file is re-read — confirm that downstream
# consumers parse them.
save_df(vacancies_df, '../data/prepared_description_vacancies_vectorizer.csv')

# Тестирование

In [31]:
# Round-trip check: reload the pickled count vectorizer; its vocabulary size
# should equal the number printed when it was fitted.
model = load_model('../models/count_vectorizer')
len(model.vocabulary_)

20554

In [32]:
# Round-trip check: reload the pickled TF-IDF vectorizer; its vocabulary size
# should equal the number printed when it was fitted.
model = load_model('../models/tfidf_vectorizer')
len(model.vocabulary_)

13380

In [33]:
# Round-trip check: reload the saved CSV and display it to confirm that both
# vector columns were written.
df = load_df('../data/prepared_description_vacancies_vectorizer.csv')
df

Unnamed: 0,vacancy_id,raw_description,prepared_description,count_vectorizer,tfidf_vectorizer
0,795348,"{'id': '795348', 'premium': False, 'billing_ty...","[['сфера', 'деятельность', 'розничный', 'торго...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,795347,"{'id': '795347', 'premium': False, 'billing_ty...","[['corporate', 'western', 'bank', 'initiates',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,795353,"{'id': '795353', 'premium': False, 'billing_ty...","[['обязанность', 'осуществление', 'качественны...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,795350,"{'id': '795350', 'premium': False, 'billing_ty...","[['основной', 'обязанность', 'ведение', 'кадро...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,795352,"{'id': '795352', 'premium': False, 'billing_ty...","[['обязанность', 'приём', 'распределение', 'вх...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
2995,800919,"{'id': '800919', 'premium': False, 'billing_ty...","[['обязанность', 'руководство', 'проект', 'нач...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2996,800917,"{'id': '800917', 'premium': False, 'billing_ty...","[['обязанность', 'сопровождение', 'договор', '...","[2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.1442695711441709, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2997,800920,"{'id': '800920', 'premium': False, 'billing_ty...","[['российский', 'косметический', 'компания', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2998,800921,"{'id': '800921', 'premium': False, 'billing_ty...","[['основной', 'обязанность', 'руководство', 'о...","[2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0.10681281732795717, 0.0, 0.0, 0.0, 0.0, 0.0,..."
