In [None]:
!pip install -q catboost
!pip install -q geopy
!pip install -q eli5

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm

import os

import scipy
import eli5
plt.rcParams['figure.figsize'] = [12, 6]


%matplotlib inline

In [None]:
# Mount Google Drive at /content/drive; the default dataset and model paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Обработка данных

In [None]:
def load_dataset(name, path='/content/drive/My Drive/Hacaton/Data/'):
  """Read a ';'-separated CSV located at `path + name`.

  The first CSV column is used as the DataFrame index.

  Args:
    name: file name, appended to `path` by plain string concatenation.
    path: directory prefix; it must already end with a path separator.

  Returns:
    The loaded pandas DataFrame.

  Raises:
    FileNotFoundError: if `path + name` does not exist. The message contains
      the full path that was tried.
  """
  full_path = path + name
  if not os.path.exists(full_path):
    # Put the path into the exception itself so it shows up in the traceback.
    raise FileNotFoundError("Dataset not found: " + full_path)
  return pd.read_csv(full_path, low_memory=False, sep=';', index_col=0)

In [None]:
def save_dataset(df, name, path='/content/drive/My Drive/Hacaton/Data/'):
  """Write `df` (index included) to `path + name` as a ';'-separated CSV."""
  destination = path + name
  df.to_csv(destination, sep=';')

In [None]:
def parse_date(df, name_column_date):
    """Add 'year', 'month', 'day' and 'hour' columns derived from a date column.

    The column `name_column_date` is parsed with `pd.to_datetime`, and the four
    calendar components are written to `df` in place. Nothing is returned.

    Args:
        df: DataFrame to modify in place.
        name_column_date: name of the column holding the timestamps.
    """
    parsed = pd.to_datetime(df[name_column_date])
    # The vectorised .dt accessors replace four per-row Python lambdas.
    df['year'] = parsed.dt.year
    df['month'] = parsed.dt.month
    df['day'] = parsed.dt.day
    df['hour'] = parsed.dt.hour

In [None]:
def clean_data(df):
  """Return a cleaned copy of the raw vacancies frame.

  Steps:
    1. Drop columns that are not used as features.
    2. Expand 'publication_date' into year / month / day / hour columns
       (via `parse_date`) and drop the original date column.
    3. Replace 'key_skills' with a 0/1 integer flag: 1 when a value is
       present, 0 when it is missing.

  Args:
    df: raw vacancies DataFrame; it is not modified.

  Returns:
    A new DataFrame with the transformations above applied.
  """
  # drop useless columns
  data_vacs_clean = df.drop(columns=['id', 'name', 'area.name', 'company_link', 'salary_currency',
                                     'employment.name', 'schedule.name', 'experience.name',
                                     'description', 'type'])
  parse_date(data_vacs_clean, 'publication_date')
  data_vacs_clean = data_vacs_clean.drop(columns=['publication_date'])
  # simplify key_skills format
  # Plain column assignment replaces the column, so the flag really has an
  # integer dtype. `df.loc[:, col] = ...` writes into the existing object
  # column in place on recent pandas and would keep the object dtype.
  data_vacs_clean["key_skills"] = data_vacs_clean["key_skills"].notna().astype(int)
  return data_vacs_clean

In [None]:
def spec_modif(elem):
  """Return the most frequent and second most frequent leading codes.

  `elem` is a whitespace-separated string of dotted codes such as
  "17.334 22.39". Only the integer part before the first dot of each token
  is considered.

  Returns:
    A `(first_mode, second_mode)` tuple of ints. `second_mode` is 0 when the
    string contains only one distinct leading code.
  """
  codes = [int(token.split('.')[0]) for token in elem.split()]

  most_common = max(set(codes), key=codes.count)
  if len(set(codes)) < 2:
    return most_common, 0

  # Remove every occurrence of the winner and repeat the vote on the rest.
  remaining = [code for code in codes if code != most_common]
  runner_up = max(set(remaining), key=remaining.count)
  return most_common, runner_up

In [None]:
# или выделяем все специализации
def spec_modif_all(df):
  """One-hot encode the leading specialization codes of every row, in place.

  For each row the 'specializations' string (whitespace-separated dotted
  codes) is reduced to the set of integer parts before the first dot. An
  indicator column named after each code is added: '1' ... '29' are always
  created, plus one column for any other code found in the data. The
  'specializations' column is then removed.

  Args:
    df: DataFrame with a string 'specializations' column; modified in place.

  Returns:
    The same `df` object, for convenience.
  """
  def _process_specs(elem):
    # Distinct integer prefixes of the dotted codes in one cell.
    return {int(spec.split('.')[0]) for spec in elem.split()}

  spec_sets = df["specializations"].apply(_process_specs)

  # Codes outside 1..29 get a proper 0/1 column instead of a NaN-padded one.
  extra_codes = sorted(set().union(*spec_sets).difference(range(1, 30)))
  for code in list(range(1, 30)) + extra_codes:
    # Built positionally, so a non-unique index does not matter.
    df[str(code)] = [int(code in specs) for specs in spec_sets]

  # Drop in place: rebinding a local name would leave the caller's frame untouched.
  df.drop(columns=["specializations"], inplace=True)
  return df


In [None]:
def add_specification_features(df):
  """Add 'first_spec', 'second_spec' and 'spec_split' columns to `df` in place.

  'first_spec' / 'second_spec' are the two values returned by `spec_modif`
  for the row's 'specializations' string. 'spec_split' is the same string
  with every '.' replaced by a space.

  Args:
    df: DataFrame with a 'specializations' column; modified in place.
  """
  # Compute all pairs first and assign whole columns once. Writing cell by
  # cell with .loc inside iterrows() is far slower and misbehaves when the
  # index has duplicates.
  modes = [spec_modif(specs) for specs in tqdm(df["specializations"])]
  df["first_spec"] = [first for first, _ in modes]
  df["second_spec"] = [second for _, second in modes]
  df["spec_split"] = df["specializations"].apply(lambda x: " ".join(str(x).split('.')))

In [None]:
# Alternative: extract all specializations.
# NOTE: everything below is a bare triple-quoted string, so none of it executes;
# the cell only evaluates to that string. The same steps exist as a function
# in spec_modif_all(df).
'''
indexes = data_vacs_clean.index

def spec_modif(elem):
  specs = set()
  for spec in elem.split():
    specs.add(int(spec.split('.')[0]))
  return specs

data_vacs_clean.loc[:, "specializations"] = pd.DataFrame(data_vacs_clean.loc[:, "specializations"]).applymap(spec_modif)

for i in range(1, 30):
  data_vacs_clean[str(i)] = 0
for index in indexes:
  for spec in data_vacs_clean.loc[index, "specializations"]:
    data_vacs_clean.loc[index, str(spec)] = 1

data_vacs_clean = data_vacs_clean.drop("specializations", axis=1)
'''

'\nindexes = data_vacs_clean.index\n\ndef spec_modif(elem):\n  specs = set()\n  for spec in elem.split():\n    specs.add(int(spec.split(\'.\')[0]))\n  return specs\n\ndata_vacs_clean.loc[:, "specializations"] = pd.DataFrame(data_vacs_clean.loc[:, "specializations"]).applymap(spec_modif)\n\nfor i in range(1, 30):\n  data_vacs_clean[str(i)] = 0\nfor index in indexes:\n  for spec in data_vacs_clean.loc[index, "specializations"]:\n    data_vacs_clean.loc[index, str(spec)] = 1\n\ndata_vacs_clean = data_vacs_clean.drop("specializations", axis=1)\n'

# Доп. обработка данных

In [None]:
def spec_modif_ten(df):
  """Return a copy of `df` with per-hundred specialization bucket columns.

  Each token of the 'specializations' string looks like "<group>.<code>".
  The integer `<code>` is stored in column 'spec<k>**' where
  `k = code // 100`. When several codes of one row fall into the same bucket
  the last one in the string wins. Columns 'spec0**' ... 'spec9**' are always
  created and default to 0. A code of 1000 or more creates an additional
  column that is empty for rows without such a code.

  Finally every remaining NaN in the whole frame is replaced with the string
  'unknown' (this affects all columns, not only the city).

  Original author's note: df was from vacs_train_clean_adv.csv. Load it?

  Args:
    df: DataFrame with a string 'specializations' column; it is not modified.

  Returns:
    The transformed copy.
  """
  df_new = df.copy()
  n_rows = len(df_new)

  # new columns for specializations: one plain list per bucket, filled
  # positionally and attached to the frame in a single step afterwards.
  # This avoids a .loc write per code inside iterrows().
  bucket_values = {'spec' + str(i) + '**': [0] * n_rows for i in range(10)}

  # fill these columns
  for position, spec_string in enumerate(df_new["specializations"]):
    for spec in spec_string.split():
      spec_num = int(spec.split('.')[1])
      column = 'spec' + str(spec_num // 100) + '**'
      if column not in bucket_values:
        # Bucket outside 0..9: missing for every row that has no such code.
        bucket_values[column] = [float('nan')] * n_rows
      bucket_values[column][position] = spec_num

  for column, values in bucket_values.items():
    df_new[column] = values

  # fill empty cities (and every other NaN in the frame)
  df_new = df_new.fillna('unknown')
  return df_new

# Добавление координат городов

In [None]:
def add_city_coord_feature(df, user_agent='hacaton_salary_notebook', min_delay_seconds=1.0):
  """Return a copy of `df` with 'coord_lat' / 'coord_lon' columns for each city.

  Missing cities and the placeholder 'unknown' are replaced with a default
  city name. Every distinct city is then geocoded once through Nominatim.
  Cities that cannot be resolved get a fixed fallback coordinate pair.

  Args:
    df: DataFrame with a 'city' column; it is not modified.
    user_agent: application identifier sent to Nominatim. geopy rejects the
      library default user agent, so an explicit value is passed.
    min_delay_seconds: minimum pause between geocoding requests. The public
      Nominatim service asks for at most one request per second.

  Returns:
    The augmented copy.

  Raises:
    AssertionError: if `df` is None.
  """
  # Check before touching `df`; after df.copy() this test could never fire.
  if df is None:
    raise AssertionError

  df_city = df.copy()
  # NOTE(review): the first letter of this literal looks like a Latin 'C'
  # rather than a Cyrillic one; confirm it matches the spelling in the data.
  default_city = 'Cанкт-Петербург'
  # Assign the result back instead of fillna(inplace=True) on a column selection.
  df_city['city'] = df_city['city'].fillna(default_city)
  df_city.loc[df_city['city'] == 'unknown', 'city'] = default_city

  geolocator = Nominatim(user_agent=user_agent)
  geocode = RateLimiter(geolocator.geocode, min_delay_seconds=min_delay_seconds)

  def get_adress(adr):
    # Returns (lat, lon); the fixed fallback pair is used when nothing is found.
    loc = geocode(adr)
    if loc is not None:
      return loc.latitude, loc.longitude
    return 59.9606739, 30.1586551

  # One request per distinct city, then a dictionary lookup per row.
  coords_by_city = {city: get_adress(city) for city in df_city['city'].unique()}
  df_city['coord_lat'] = df_city['city'].map(lambda city: coords_by_city[city][0])
  df_city['coord_lon'] = df_city['city'].map(lambda city: coords_by_city[city][1])
  return df_city

# Предсказание среднего


In [None]:
def model_predict(regressor, X_train, X_test, y_train, y_test):
  """Predict on `X_test` with `regressor` and print the regression metrics.

  The regressor is not fitted here (the fit call is disabled), so it has to
  be fitted by the caller. `X_train` and `y_train` are accepted but unused.
  """
  # regressor.fit(X_train, y_train)
  print_metrics(y_test, regressor.predict(X_test))
  # show_error_hist(y_test, y_pred)

In [None]:
import pickle
def save_model_pickle(model, name, path='/content/drive/My Drive/Hacaton/Models_Pickled/'):
  """Pickle `model` into the file `path + name`.

  Args:
    model: any picklable object.
    name: file name, appended to `path` by plain string concatenation.
    path: directory prefix; it must already end with a path separator.
  """
  # The context manager closes (and flushes) the file even if dump() raises.
  with open(path + name, 'wb') as handle:
    pickle.dump(model, handle)


def save_model_catboost(model, name, path='/content/drive/My Drive/Hacaton/Models_Pickled/'):
  """Call `model.save_model` on `path + name`, requesting the "cbm" format."""
  target = path + name
  model.save_model(target, format="cbm")

In [None]:
def custom_split(df):
  """Split `df` 70/30 into train and test parts (random_state=17).

  Returns:
    `(X_train, X_test, y_train, y_test)`. The X parts are DataFrames holding
    every column except 'salary_from'; the y parts are the `.values` arrays
    of the one-column 'salary_from' frames.
  """
  features = df.drop(columns=["salary_from"])
  target = df.loc[:, ["salary_from"]]
  X_train, X_test, y_train, y_test = train_test_split(
      features, target, test_size=0.3, random_state=17)
  return X_train, X_test, y_train.values, y_test.values
  

In [None]:
def print_metrics(y_test, y_pred):
  """Print MAE and MSE (truncated to int) and R2 (rounded to 3 decimals).

  Args:
    y_test: true target values.
    y_pred: predicted target values.
  """
  print("MAE =", int(mean_absolute_error(y_test, y_pred)))
  print("MSE =", int(mean_squared_error(y_test, y_pred)))
  # Built-in round() works for both NumPy scalars and plain Python floats.
  # `.round()` exists only on the former and raises AttributeError if the
  # metric returns a plain float.
  print("R2 =", round(r2_score(y_test, y_pred), 3))

In [None]:
def show_error_hist(y_test, y_pred):
  """Print both input shapes and plot a 300-bin histogram of `y_test - y_pred`.

  Both inputs are flattened to 1-D first, so any mix of shape (n,) and
  (n, 1) arrays is accepted.

  Args:
    y_test: true target values.
    y_pred: predicted target values, same number of elements as `y_test`.
  """
  y_test = np.asarray(y_test)
  y_pred = np.asarray(y_pred)
  print(y_test.shape)
  print(y_pred.shape)
  # Flatten before subtracting: an (n, 1) array minus an (n,) array would
  # broadcast to (n, n), and concatenating along axis=1 requires 2-D inputs.
  actual = y_test.ravel()
  predicted = y_pred.ravel()
  result_df_stupid = pd.DataFrame({"y_test": actual,
                                   "y_pred": predicted,
                                   "error": actual - predicted})
  plt.rcParams['figure.figsize'] = [12, 6]
  result_df_stupid.error.hist(bins=300)

In [None]:
def avarage_predict(X_train, X_test, y_train, y_test):
  """Baseline: predict the training-set mean for every test row.

  Prints the regression metrics and plots the error histogram. `X_train`
  and `X_test` are accepted for signature parity with the other predictors
  but are not used.

  Args:
    X_train, X_test: unused.
    y_train: training targets; their mean is the constant prediction.
    y_test: test targets.
  """
  y_test = np.asarray(y_test)
  # The previous body read the undefined names `df` and `y`; the baseline is
  # the mean of the training targets only.
  baseline = np.asarray(y_train).mean()
  # Same shape as y_test so downstream code sees matching arrays.
  y_pred = np.full(y_test.shape, baseline, dtype=float)
  print_metrics(y_test, y_pred)
  show_error_hist(y_test, y_pred)

# Линейная регрессия

In [None]:
def one_hot_split(df):
  """One-hot encode the features and split 70/30 (random_state=17).

  The target and the text / high-cardinality columns listed below are removed
  from the features before `pd.get_dummies` is applied.

  Returns:
    `(X_train, X_test, y_train, y_test)` as `.values` arrays; the y arrays
    come from the one-column 'salary_from' frame.
  """
  excluded = ["salary_from", "name.lemm", "description.lemm", "city", "specializations"]
  features_encoded = pd.get_dummies(df.drop(columns=excluded))
  target = df.loc[:, ["salary_from"]]
  parts = train_test_split(features_encoded, target, test_size=0.3, random_state=17)
  return tuple(part.values for part in parts)

In [None]:
def linear_predict(df=None):
  """Fit a LinearRegression on the one-hot features and report test metrics.

  Args:
    df: prepared DataFrame. When None, 'vacs_city.csv' is loaded through
      `load_dataset`.
  """
  if df is None:
    df = load_dataset('vacs_city.csv')

  X_train, X_test, y_train, y_test = one_hot_split(df)

  lin_reg = LinearRegression()
  lin_reg.fit(X_train, y_train)

  y_pred = lin_reg.predict(X_test)

  print_metrics(y_test, y_pred)
  show_error_hist(y_test, y_pred)
  # The former trailing print_feature_importances(regressor) call was removed:
  # `regressor` is not defined in this function, and LinearRegression exposes
  # `coef_`, not `feature_importances_`.


# SVM

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

def svm_predict(df=None, max_train_rows=10000):
  """Fit a scaled RBF-kernel SVR on a prefix of the training set and print metrics.

  Args:
    df: prepared DataFrame. When None, 'vacs_city.csv' is loaded through
      `load_dataset`.
    max_train_rows: only the first `max_train_rows` training rows are used
      for fitting.
  """
  if df is None:
    df = load_dataset('vacs_city.csv')

  X_train, X_test, y_train, y_test = one_hot_split(df)

  svm_reg = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1e3, gamma=0.1, cache_size=2000))

  # Train on a prefix only (the original author did not expect SVR to be
  # competitive). ravel() hands SVR the 1-D target it expects instead of the
  # (n, 1) column returned by one_hot_split.
  svm_reg.fit(X_train[:max_train_rows], y_train[:max_train_rows].ravel())

  y_pred = svm_reg.predict(X_test)

  print_metrics(y_test, y_pred)

# CatBoost

In [None]:
def print_feature_importances(clf, feature_names=None):
  """Print one "<name> : <importance>" line per feature of a fitted model.

  Args:
    clf: fitted model exposing `feature_importances_`.
    feature_names: iterable of names aligned with the importances (iterating
      a DataFrame yields its column names). When omitted, the model's own
      `feature_names_` or `feature_names_in_` attribute is used if present,
      otherwise the positional indices 0, 1, ...
  """
  importances = clf.feature_importances_
  if feature_names is None:
    # Names come from the model itself instead of a notebook-level `X`.
    feature_names = getattr(clf, "feature_names_", None)
  if feature_names is None:
    feature_names = getattr(clf, "feature_names_in_", None)
  if feature_names is None:
    feature_names = range(len(importances))
  for name, imp in zip(feature_names, importances):
    print(str(name) + " : " + str(imp))

In [None]:
def catboost_predict(X_train, X_test, y_train, y_test, iterations=1000, n_numeric_tail=4):
  """Fit a CatBoostRegressor, save it as 'catboost.cbm' and print test metrics.

  Args:
    X_train, X_test: 2-D feature matrices with identical column layout.
    y_train, y_test: target values.
    iterations: number of boosting iterations.
    n_numeric_tail: how many trailing columns are numeric. Every column
      before them is passed to CatBoost as a categorical feature.
  """
  # Column indices 0 .. (n_columns - n_numeric_tail - 1) are categorical.
  cat_features = list(range(X_train.shape[1] - n_numeric_tail))
  model = CatBoostRegressor(iterations=iterations, random_seed=17)
  model.fit(X_train, y_train, cat_features=cat_features, plot=True)
  save_model_catboost(model, 'catboost.cbm')
  print("catboost fitted")
  model_predict(model, X_train, X_test, y_train, y_test)
  

# Повышаем Score

In [None]:
def add_len_feature(df=None):
  """Return a copy of `df` with a 'description_len' column.

  The new column holds `len()` of each 'description.lemm' value. When `df`
  is None, 'vacs_city.csv' is loaded through `load_dataset` first.
  """
  source = load_dataset('vacs_city.csv') if df is None else df
  return source.assign(description_len=source["description.lemm"].map(len))

# TF-IDF

In [None]:
def tf_idf_predictions(df_train, df_test, y_train, y_test, iterations=10, model=None):
  """Predict the target from TF-IDF features of the text columns.

  One TfidfVectorizer (1- and 2-grams, lowercased) is fitted per text column
  on the training frame, pickled through `save_model_pickle`, and applied to
  the test frame. The sparse blocks are stacked horizontally and fed to a
  CatBoost regressor.

  Args:
    df_train, df_test: frames containing the text columns listed in
      `text_fields` below.
    y_train, y_test: target values for the two frames.
    iterations: boosting iterations used when a new model is trained.
    model: optional fitted regressor with a `predict` method. When None, a
      CatBoostRegressor is trained here and saved as 'Tf-IdfCatNotFull.cbm'.
      The previous body referenced a `CatBoost` variable that was never
      defined inside the function.

  Returns:
    `(predictions_on_train, predictions_on_test)`.
  """
  # (column, vocabulary cap, pickle file name) - order defines the feature layout.
  text_fields = [
      ("description.lemm", 30000, 'vectorizer_1.pkl'),
      ("name.lemm", 15000, 'vectorizer_2.pkl'),
      ("city", 15000, 'vectorizer_3.pkl'),
      ("company", 15000, 'vectorizer_4.pkl'),
      ("key_skills", 15000, 'vectorizer_5.pkl'),
      ("specializations.names", 15000, 'vectorizer_6.pkl'),
  ]

  train_blocks, test_blocks = [], []
  for column, max_features, file_name in text_fields:
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, max_features=max_features)
    train_blocks.append(vectorizer.fit_transform(df_train.loc[:, column]))
    save_model_pickle(vectorizer, file_name)
    test_blocks.append(vectorizer.transform(df_test.loc[:, column]))

  X_train_full = scipy.sparse.hstack(train_blocks).tocsr()
  X_test_full = scipy.sparse.hstack(test_blocks).tocsr()

  if model is None:
    # The disabled original also passed task_type='GPU'; it is left at the
    # library default here so the cell runs without a GPU runtime.
    model = CatBoostRegressor(iterations=iterations, random_seed=17)
    model.fit(X_train_full, y_train)
    save_model_catboost(model, 'Tf-IdfCatNotFull.cbm')

  print(X_train_full.shape)

  y_pred_tf_idf = model.predict(X_test_full)
  # NOTE(review): these are in-sample predictions; if they are later used as a
  # training feature they will look better than the test-time ones. Consider
  # out-of-fold predictions.
  y_pred_tf_idf_on_train = model.predict(X_train_full)
  print_metrics(y_test, y_pred_tf_idf)

  # To inspect the most influential terms, eli5.show_weights(estimator=model,
  # feature_names=..., top=50) can be called on the fitted model.
  return y_pred_tf_idf_on_train, y_pred_tf_idf





In [None]:
def tf_idf_as_feature(X_train, X_test, y_train, y_test, iterations=100):
  """Append the TF-IDF model's predictions as one extra trailing column.

  Returns:
    `(X_train_with_tf_idf, X_test_with_tf_idf)`: the inputs concatenated
    column-wise with the matching prediction vector reshaped to (n, 1).
  """
  train_pred, test_pred = tf_idf_predictions(X_train, X_test, y_train, y_test, iterations=iterations)  # !!! change iterations

  # Turn each prediction vector into a single column so it can be glued on.
  test_column = test_pred.reshape(len(test_pred), 1)
  train_column = train_pred.reshape(len(train_pred), 1)

  X_test_with_tf_idf = np.concatenate([X_test, test_column], axis=1)
  X_train_with_tf_idf = np.concatenate([X_train, train_column], axis=1)
  return X_train_with_tf_idf, X_test_with_tf_idf

# main

In [None]:
# Load the raw training vacancies from load_dataset's default data folder.
initial_df = load_dataset('vacs_train.csv')

In [None]:
# Drop unused columns, expand the publication date and turn key_skills into a 0/1 flag.
cleaned_df = clean_data(initial_df)

In [None]:
# Adds first_spec / second_spec / spec_split to cleaned_df in place (the call returns None).
add_specification_features(cleaned_df)

100001it [03:28, 480.34it/s]


In [None]:
# Add the spec<k>** bucket columns and replace remaining NaNs with 'unknown'.
cleaned_df = spec_modif_ten(cleaned_df)

In [None]:
# Geocode every distinct city and append coord_lat / coord_lon.
cleaned_df = add_city_coord_feature(cleaned_df)



In [None]:
# Append description_len (length of each description.lemm value).
cleaned_df = add_len_feature(cleaned_df)

In [None]:
# Preview the first two rows of the engineered frame.
cleaned_df.head(2)

Unnamed: 0_level_0,name.lemm,city,company.id,salary_from,employment,schedule,experience,key_skills,specializations,description.lemm,year,month,day,hour,first_spec,second_spec,spec0**,spec1**,spec2**,spec3**,spec4**,spec5**,spec6**,spec7**,spec8**,spec9**,coord_lat,coord_lon,description_len
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
0,менеджер туризм,Санкт-Петербург,605490,40000.0,full,fullDay,between1And3,0,17.334 17.242 17.149 22.223 22.39 22.199,обязанность работа турист физический юридическ...,2016,8,21,14,17,22,39,199,223,334,0,0,0,0,0,0,59.960674,30.158655,818
1,помощник руководитель,Cанкт-Петербург,605490,40000.0,full,fullDay,between1And3,0,4.205 4.429 6.319 6.247 2.249,вакансия открывать рамка отдел строительный от...,2016,8,21,14,4,6,0,0,249,319,429,0,0,0,0,0,59.930287,30.367073,374


In [None]:
# NOTE: the modelling frame is read from the saved 'vacs_clean_adv.csv' file,
# not taken from the cleaned_df built in the cells above.
final_df = add_len_feature(load_dataset('vacs_clean_adv.csv'))  # cleaned_df.copy()

In [None]:
# 70/30 split: df_train / df_test stay DataFrames, y_train / y_test are NumPy arrays.
df_train, df_test, y_train, y_test = custom_split(final_df)

In [None]:
# Preview one training row to check the column layout.
df_train.head(1)

Unnamed: 0_level_0,name.lemm,city,company.id,company,employment,schedule,experience,key_skills,specializations,specializations.names,description.lemm,year,month,day,hour,first_spec,second_spec,spec_split,coord_lat,coord_lon,description_len
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
37779,специалист работа акт оказание услуга документ,Cанкт-Петербург,490,"Ренессанс cтрахование, Группа",full,fullDay,noExperience,пусто,4.429 4.181 5.51 15.388 15.281 19.170,Административный персонал - Делопроизводство |...,ренессанс страхование предлагать специалист ка...,2016,8,19,17,4,15,4 429 4 181 5 51 15 388 15 281 19 170,59930,30367,1580


In [None]:
# Append the TF-IDF model's predictions as an extra trailing feature column.
X_train, X_test = tf_idf_as_feature(df_train, df_test, y_train, y_test, iterations=10)

In [None]:
# Check the shape of the training feature matrix.
X_train.shape

(70000, 22)

In [None]:
# Fit the final CatBoost regressor, save it and print the test metrics.
catboost_predict(X_train, X_test, y_train, y_test, iterations=1000)

# Предсказание тестовых