In [1]:
import numpy as np 
import pandas as pd
from joblib import dump, load

from sklearn.metrics import accuracy_score, precision_recall_curve, roc_auc_score, auc
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import RidgeClassifierCV
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC 

DATA_PATH = 'data/'

### Data

In [2]:
df = pd.read_csv(DATA_PATH + 'light_prepared_data.csv')
df.head(10)

Unnamed: 0,target,vacancy_1,vacancy_2,vacancy_3,vacancy_4,vacancy_5,vacancy_6,vacancy_7,vacancy_8,vacancy_9,...,cv_303,cv_304,cv_305,cv_306,cv_307,cv_308,cv_309,cv_310,cv_311,cv_312
0,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.01692,0.02525,-0.00852,0.0365,-0.010956,0.0354,0.01955,0.007103,0.02689,-0.03876
1,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.04938,-0.01044,-0.01802,0.0471,-0.04593,0.00759,0.01305,0.01075,0.02934,-0.01372
2,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.01761,-0.0985,-0.03284,0.0335,-0.0332,0.0331,-0.00678,0.051,0.0534,-0.0411
3,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.01761,-0.0985,-0.03284,0.0335,-0.0332,0.0331,-0.00678,0.051,0.0534,-0.0411
4,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.0379,-0.01585,-0.001335,0.03073,0.002558,0.035,0.0214,0.03204,0.02356,-0.02785
5,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.0575,0.014595,-0.04987,0.05573,0.01996,0.01102,0.02995,-0.02322,0.05704,-0.06415
6,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.03412,-0.012344,-0.01,0.02478,-0.02739,0.01962,-0.002752,-0.00439,0.0304,-0.04514
7,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.00794,0.02284,-0.005898,0.0459,-0.02171,0.04556,0.01865,0.02054,0.018,-0.0673
8,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.02246,-0.0232,-0.00638,0.0768,-0.03049,0.045,0.03674,0.04178,0.02174,-0.035
9,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.03732,-0.01396,-0.01461,0.02638,-0.01892,0.05264,0.007202,0.02704,0.02286,-0.0424


In [3]:
df.shape

(64334, 625)

In [4]:
X, y = df.drop(columns=['target']), df['target']
X_fulltrain, X_test, y_fulltrain, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_fulltrain, y_fulltrain, test_size=0.2, random_state=42, stratify=y_fulltrain)

X_train.head(10)

Unnamed: 0,vacancy_1,vacancy_2,vacancy_3,vacancy_4,vacancy_5,vacancy_6,vacancy_7,vacancy_8,vacancy_9,vacancy_10,...,cv_303,cv_304,cv_305,cv_306,cv_307,cv_308,cv_309,cv_310,cv_311,cv_312
33415,0.0625,0.0824,0.037,-0.1004,-0.009476,0.01599,-0.0442,-0.02539,-0.01949,0.0225,...,0.02246,-0.0232,-0.00638,0.0768,-0.03049,0.045,0.03674,0.04178,0.02174,-0.035
51516,0.08936,0.0847,0.02184,-0.08923,0.01407,0.004345,-0.0328,-0.01189,-0.01254,0.04996,...,0.01312,-0.000911,-0.01723,0.07556,-0.00489,0.02467,0.01254,0.008446,0.04187,-0.0349
9774,0.01804,0.05457,0.03876,-0.0759,-0.005672,0.02403,-0.02345,-0.06168,-0.003038,0.02925,...,0.04828,-0.0658,-0.0275,0.0365,-0.06396,0.0763,0.005905,0.02475,0.04065,-0.007996
23688,0.04364,0.02667,0.00464,-0.07367,-0.03903,0.06964,-0.076,-0.11597,0.009575,-0.01753,...,0.0459,-0.01019,-0.0274,0.0774,-0.00573,0.03041,-0.001741,0.02043,0.04703,-0.02534
21265,0.03778,0.02924,0.007484,-0.0698,-0.0362,0.05527,-0.0742,-0.10815,0.015076,-0.01324,...,0.04578,0.01758,-0.0258,0.0463,-0.0482,0.02083,0.001442,0.02133,0.05,-0.04852
41993,0.05576,0.0657,0.0348,-0.06494,0.01181,-0.01666,-0.04105,-0.06168,0.02844,0.03912,...,0.05038,-0.01683,-0.016,0.04324,-0.0306,0.01133,0.02504,0.006065,0.0459,-0.03882
27810,0.0563,0.03464,0.04288,-0.06976,-0.000862,0.000509,-0.05927,-0.0784,0.0238,0.01143,...,0.01036,0.01862,0.01408,0.019,0.03244,0.03156,-0.003233,0.0818,0.006237,-0.00597
46969,0.0611,0.0693,0.0308,-0.0889,0.01349,0.02454,-0.02826,-0.02048,0.00167,0.03574,...,0.01721,-0.04443,-0.000669,0.01935,-0.03513,0.03375,0.014435,0.03044,0.03992,-0.05954
41202,0.05417,0.0775,0.03078,-0.1041,0.01582,0.00777,-0.02666,-0.02255,-0.03012,0.05112,...,0.0693,-0.02632,-0.03296,0.06793,-0.02266,0.03458,0.0434,-0.009705,0.0564,-0.03287
18459,0.0964,0.05414,0.02832,-0.08655,0.0203,0.03044,-0.04922,-0.02646,-0.01268,0.0318,...,0.02168,0.02637,-0.000685,0.0548,-0.02377,0.02441,-0.01569,0.03665,0.042,-0.02518


In [5]:
X_test.iloc[:, :312]

Unnamed: 0,vacancy_1,vacancy_2,vacancy_3,vacancy_4,vacancy_5,vacancy_6,vacancy_7,vacancy_8,vacancy_9,vacancy_10,...,vacancy_303,vacancy_304,vacancy_305,vacancy_306,vacancy_307,vacancy_308,vacancy_309,vacancy_310,vacancy_311,vacancy_312
59089,0.05386,0.00595,0.038330,-0.03473,0.004707,-0.02605,-0.03330,-0.10114,-0.004660,-0.00904,...,0.049220,-0.01845,0.015990,0.027180,0.02303,-0.002884,0.008770,0.055300,-0.007053,-0.016570
22952,0.04364,0.02667,0.004640,-0.07367,-0.039030,0.06964,-0.07600,-0.11597,0.009575,-0.01753,...,0.010970,-0.00495,-0.003304,-0.001562,0.01952,-0.002447,0.045750,0.029660,-0.003035,-0.061300
24997,0.04364,0.02667,0.004640,-0.07367,-0.039030,0.06964,-0.07600,-0.11597,0.009575,-0.01753,...,0.010970,-0.00495,-0.003304,-0.001562,0.01952,-0.002447,0.045750,0.029660,-0.003035,-0.061300
31173,0.06660,0.02333,0.010530,-0.04755,-0.017850,0.03506,-0.04230,-0.04020,0.030080,-0.01146,...,-0.006740,-0.04218,-0.011510,-0.010920,0.04742,0.039950,0.044460,0.006905,-0.000990,-0.039860
10651,0.02072,0.05154,0.006813,-0.09360,-0.019170,0.05545,-0.05304,-0.10140,-0.002304,-0.01261,...,0.001428,-0.03357,0.031170,0.006275,-0.02478,0.003880,0.019990,0.060300,0.031770,-0.009140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4124,0.07640,0.09350,0.035860,-0.10645,0.024550,0.01874,-0.01997,-0.04044,-0.002531,0.02316,...,0.028020,-0.06880,-0.013640,0.017840,0.00901,0.040130,-0.005802,0.066300,0.070400,0.024570
54450,0.04920,0.02760,0.003057,-0.08310,0.003899,0.06775,-0.06430,-0.07837,0.011910,-0.02757,...,0.046840,-0.01779,-0.008760,-0.006813,-0.01646,-0.020050,-0.000132,0.011220,-0.002916,-0.097100
58225,0.05502,0.03052,0.050800,-0.08630,0.007990,0.02588,-0.01416,-0.07250,0.023650,0.05250,...,0.031460,-0.04210,-0.000628,-0.027220,-0.01041,0.023570,-0.000687,0.052670,0.073900,-0.013120
44788,0.13200,0.00549,0.032260,-0.01209,0.024630,-0.04410,-0.06128,-0.07574,-0.008060,0.04443,...,0.071300,-0.07434,-0.028380,-0.026960,-0.01111,0.052600,0.021090,0.020520,0.061800,0.003557


In [6]:
X_test.iloc[:, 312:]

Unnamed: 0,cv_1,cv_2,cv_3,cv_4,cv_5,cv_6,cv_7,cv_8,cv_9,cv_10,...,cv_303,cv_304,cv_305,cv_306,cv_307,cv_308,cv_309,cv_310,cv_311,cv_312
59089,0.113650,-0.02655,-0.02954,-0.05160,0.035370,-0.011665,0.04180,-0.06177,0.013970,0.085940,...,0.010635,0.043370,0.017760,0.04117,-0.043060,0.012800,-0.061830,0.01593,0.037380,-0.01901
22952,-0.028100,0.08374,0.00997,-0.10785,0.006145,0.029240,-0.04633,-0.06550,-0.010130,-0.055050,...,-0.006330,0.060120,0.000777,0.03082,0.050170,-0.002548,0.038540,0.08730,0.053900,-0.08890
24997,-0.000939,0.03710,0.03360,-0.09980,0.005547,-0.004856,-0.01775,0.01098,-0.058600,0.023510,...,0.030700,-0.075260,-0.002537,0.05212,-0.038120,0.060500,-0.009605,0.03394,-0.005714,0.01347
31173,0.099240,0.00367,-0.02296,-0.04425,0.043100,-0.030320,-0.04610,-0.03890,0.016300,0.041080,...,0.043640,0.006370,-0.014800,0.05002,-0.039830,0.050050,0.020970,0.02100,0.022230,-0.02560
10651,0.045750,0.02115,-0.04280,-0.09850,0.041700,0.004760,-0.03047,-0.08984,0.000144,0.001643,...,0.004963,0.019710,0.028000,0.01726,0.007940,0.009094,0.000235,0.05914,0.015090,-0.02365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4124,0.078700,-0.04320,-0.03992,-0.04156,0.024810,-0.001302,-0.04530,-0.07355,0.055400,0.000456,...,0.057500,0.014595,-0.049870,0.05573,0.019960,0.011020,0.029950,-0.02322,0.057040,-0.06415
54450,0.107240,0.01622,0.00820,-0.03796,0.048650,-0.020970,-0.01866,-0.04364,0.015015,0.045170,...,0.019710,-0.036680,-0.012856,0.01735,0.011220,0.034240,-0.000201,0.03070,0.010155,-0.03006
58225,0.050500,0.04794,0.01387,-0.06366,0.012344,0.021590,-0.03928,-0.04083,-0.003613,0.038200,...,0.022460,-0.023200,-0.006380,0.07680,-0.030490,0.045000,0.036740,0.04178,0.021740,-0.03500
44788,0.056980,0.02307,-0.01330,-0.03460,0.019910,0.003090,-0.03800,-0.06300,0.007170,0.025540,...,0.062800,0.000734,-0.000538,0.05417,-0.027330,0.029940,0.020160,0.02115,0.044280,-0.03360


### Cosine similarity

In [23]:
res = cosine_similarity(X_test.iloc[:, :312], X_test.iloc[:, 312:]).diagonal()
roc_auc_score(y_true = y_test, y_score = res)

0.6956378611307241

### SVM

In [7]:
clf_svm = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=10)
clf_svm.fit(X_train, y_train) # train
similarities = clf_svm.decision_function(X_test)

roc_auc_score(y_true = y_test, y_score = similarities)



0.9149652062512283

### Ridge regression

In [14]:
clf_ridge = RidgeClassifierCV()
clf_ridge.fit(X_train, y_train)
similarities = clf_ridge.decision_function(X_test)

roc_auc_score(y_test, , average=None)

0.8998438284770092

### kNN

In [49]:
def dot(x):
    a = np.array(x.iloc[312:])
    b = np.array(x.iloc[:312])
    return np.dot(a,b)

In [51]:
similarities = X_test.apply(dot, axis=1)
roc_auc_score(y_test, similarities)

0.6956630465639353

### SVM hyperparameters tuning

In [24]:
cv = StratifiedKFold(n_splits=3)

# defining parameter range 
param_grid = {'C': [0.1, 0.5, 1, 5, 10, 100],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['linear', 'rbf']}  
  
grid = GridSearchCV(SVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6,), 
                    param_grid, refit = True, verbose = 3, n_jobs=-1, cv=cv, scoring='roc_auc') 
  
# fitting the model for grid search 
grid.fit(X_fulltrain, y_fulltrain) 

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV 2/3] END .....................C=0.1, gamma=1, kernel=rbf; total time=10.3min
[CV 1/3] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=31.8min




[CV 1/3] END ................C=0.1, gamma=0.1, kernel=linear; total time= 9.3min
[CV 1/3] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=31.7min
[CV 3/3] END ..............C=0.1, gamma=0.001, kernel=linear; total time=12.5min
[CV 3/3] END .............C=0.1, gamma=0.0001, kernel=linear; total time=12.6min
[CV 2/3] END .....................C=0.5, gamma=1, kernel=rbf; total time=15.5min
[CV 1/3] END ...............C=0.5, gamma=0.01, kernel=linear; total time=11.8min
[CV 1/3] END ..............C=0.5, gamma=0.001, kernel=linear; total time=11.9min
[CV 2/3] END .............C=0.5, gamma=0.0001, kernel=linear; total time=41.5min
[CV 1/3] END .......................C=1, gamma=1, kernel=rbf; total time=56.3min
[CV 3/3] END .....................C=1, gamma=0.1, kernel=rbf; total time=12.9min
[CV 1/3] END ...................C=1, gamma=0.001, kernel=rbf; total time=17.5min
[CV 3/3] END ..................C=1, gamma=0.0001, kernel=rbf; total time=17.8min
[CV 3/3] END ...............



[CV 2/3] END ................C=0.1, gamma=0.1, kernel=linear; total time= 9.4min
[CV 3/3] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=31.6min
[CV 1/3] END .................C=0.1, gamma=0.001, kernel=rbf; total time=15.7min
[CV 3/3] END ................C=0.1, gamma=0.0001, kernel=rbf; total time=16.2min
[CV 3/3] END ................C=0.5, gamma=0.1, kernel=linear; total time=12.4min
[CV 2/3] END ..................C=0.5, gamma=0.01, kernel=rbf; total time=15.7min
[CV 1/3] END .............C=0.5, gamma=0.0001, kernel=linear; total time=26.5min
[CV 2/3] END ....................C=1, gamma=1, kernel=linear; total time=69.9min
[CV 3/3] END ..................C=1, gamma=0.1, kernel=linear; total time= 9.4min
[CV 1/3] END ....................C=1, gamma=0.01, kernel=rbf; total time=15.0min
[CV 3/3] END ...................C=1, gamma=0.001, kernel=rbf; total time=18.0min
[CV 3/3] END ....................C=5, gamma=1, kernel=linear; total time=13.1min
[CV 2/3] END ...............



[CV 1/3] END .....................C=0.1, gamma=1, kernel=rbf; total time=10.2min
[CV 3/3] END ...............C=0.1, gamma=0.01, kernel=linear; total time=30.4min
[CV 1/3] END ..............C=0.1, gamma=0.001, kernel=linear; total time=12.3min
[CV 1/3] END .............C=0.1, gamma=0.0001, kernel=linear; total time=12.6min
[CV 3/3] END ..................C=0.5, gamma=1, kernel=linear; total time=12.3min
[CV 2/3] END ...................C=0.5, gamma=0.1, kernel=rbf; total time=15.4min
[CV 3/3] END ..................C=0.5, gamma=0.01, kernel=rbf; total time=15.9min
[CV 1/3] END ................C=0.5, gamma=0.0001, kernel=rbf; total time=86.8min
[CV 3/3] END .......................C=1, gamma=1, kernel=rbf; total time=12.0min
[CV 2/3] END ....................C=1, gamma=0.01, kernel=rbf; total time=15.2min
[CV 1/3] END ...............C=1, gamma=0.0001, kernel=linear; total time=13.5min
[CV 2/3] END ....................C=5, gamma=1, kernel=linear; total time=13.3min
[CV 2/3] END ...............



[CV 3/3] END .................C=0.1, gamma=0.001, kernel=rbf; total time=18.2min
[CV 2/3] END ..................C=0.5, gamma=1, kernel=linear; total time=15.6min
[CV 1/3] END ...................C=0.5, gamma=0.1, kernel=rbf; total time=18.8min
[CV 2/3] END ..............C=0.5, gamma=0.001, kernel=linear; total time=15.0min
[CV 2/3] END ................C=0.5, gamma=0.0001, kernel=rbf; total time=89.4min
[CV 2/3] END .....................C=1, gamma=0.1, kernel=rbf; total time=14.8min
[CV 1/3] END ................C=1, gamma=0.001, kernel=linear; total time=16.4min
[CV 2/3] END ..................C=1, gamma=0.0001, kernel=rbf; total time=21.7min
[CV 1/3] END .....................C=5, gamma=0.1, kernel=rbf; total time=73.6min
[CV 2/3] END ................C=5, gamma=0.001, kernel=linear; total time=10.8min
[CV 1/3] END ..................C=5, gamma=0.0001, kernel=rbf; total time=15.8min
[CV 1/3] END .................C=10, gamma=0.1, kernel=linear; total time=14.3min
[CV 1/3] END ...............



GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             estimator=SVC(class_weight='balanced', max_iter=10000, tol=1e-06),
             n_jobs=-1,
             param_grid={'C': [0.1, 0.5, 1, 5, 10, 100],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'rbf']},
             scoring='roc_auc', verbose=3)

In [28]:
print(grid.best_params_) 
print(grid.best_estimator_) 
print(grid.best_score_) 

{'C': 10, 'gamma': 1, 'kernel': 'rbf'}
SVC(C=10, class_weight='balanced', gamma=1, max_iter=10000, tol=1e-06)
0.9662165773120673


In [68]:
clf = grid.best_estimator_
similarities = clf.decision_function(X_test)

roc_auc_score(y_true = y_test, y_score = similarities)

0.969138417976285

In [29]:
dump(clf, 'models/svm_v1.joblib') 

['models/svm_v1.joblib']

### Score distribution

In [40]:
df_similarities = pd.DataFrame(y_test, columns=['target', 'similarity'])
df_similarities['similarity'] = similarities
df_similarities['quantile_gr1'] = pd.qcut(df_similarities['similarity'], q=10)
df_similarities['quantile_gr2'] = pd.cut(df_similarities['similarity'], bins=15)

In [58]:
df_similarities.groupby(['quantile_gr1']).agg({'target': ['mean', 'count']}) * 100

Unnamed: 0_level_0,target,target
Unnamed: 0_level_1,mean,count
quantile_gr1,Unnamed: 1_level_2,Unnamed: 2_level_2
"(-4.008, -2.105]",0.4662,128700
"(-2.105, -1.808]",0.3885,128700
"(-1.808, -1.588]",0.311042,128600
"(-1.588, -1.404]",0.0777,128700
"(-1.404, -1.233]",0.621601,128700
"(-1.233, -1.069]",0.466563,128600
"(-1.069, -0.883]",1.165501,128700
"(-0.883, -0.627]",1.710731,128600
"(-0.627, -0.0546]",7.226107,128700
"(-0.0546, 2.518]",78.865579,128700


In [61]:
df_similarities.groupby(['quantile_gr2']).agg({'target': ['mean', 'count']}) * 100

Unnamed: 0_level_0,target,target
Unnamed: 0_level_1,mean,count
quantile_gr2,Unnamed: 1_level_2,Unnamed: 2_level_2
"(-4.014, -3.572]",0.0,500
"(-3.572, -3.137]",0.0,3400
"(-3.137, -2.702]",1.204819,16600
"(-2.702, -2.267]",0.644122,62100
"(-2.267, -1.832]",0.30525,163800
"(-1.832, -1.397]",0.255195,274300
"(-1.397, -0.962]",0.615764,324800
"(-0.962, -0.527]",1.751235,222700
"(-0.527, -0.0922]",8.710801,86100
"(-0.0922, 0.343]",42.021277,37600


### Final metrics

In [86]:
def logistic(x):
    return 1 / (1 + exp(-x))

def predict_proba(model, data):
    return logistic(model.decision_function(data))

In [80]:
pred_test = clf.predict(X_test)
similarities = clf.decision_function(X_test)
probs_test = logistic(similarities)

print(f'Test SVM accuracy:  {accuracy_score(y_test, pred_test):.5f}')
print(f'Test SVM AUC-ROC: {roc_auc_score(y_test, probs_test):.5f}')

precision, recall, _ = precision_recall_curve(y_test, probs_test)
print(f'Test SVM AUC-PR: {auc(recall, precision):.5f}')

Test SVM accuracy:  0.96938
Test SVM AUC-ROC: 0.96914
Test SVM AUC-PR: 0.89268
