In [22]:
import numpy as np 
import pandas as pd

from sklearn.metrics import accuracy_score, precision_recall_curve, roc_auc_score, auc
from xgboost.sklearn import XGBClassifier
from functools import partial
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

DATA_PATH = '../data/'

In [25]:
df = pd.read_csv(DATA_PATH + 'light_prepared_data.csv')
df.head(10)

Unnamed: 0,target,vacancy_1,vacancy_2,vacancy_3,vacancy_4,vacancy_5,vacancy_6,vacancy_7,vacancy_8,vacancy_9,...,cv_303,cv_304,cv_305,cv_306,cv_307,cv_308,cv_309,cv_310,cv_311,cv_312
0,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.01692,0.02525,-0.00852,0.0365,-0.010956,0.0354,0.01955,0.007103,0.02689,-0.03876
1,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.04938,-0.01044,-0.01802,0.0471,-0.04593,0.00759,0.01305,0.01075,0.02934,-0.01372
2,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.01761,-0.0985,-0.03284,0.0335,-0.0332,0.0331,-0.00678,0.051,0.0534,-0.0411
3,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.01761,-0.0985,-0.03284,0.0335,-0.0332,0.0331,-0.00678,0.051,0.0534,-0.0411
4,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.0379,-0.01585,-0.001335,0.03073,0.002558,0.035,0.0214,0.03204,0.02356,-0.02785
5,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.0575,0.014595,-0.04987,0.05573,0.01996,0.01102,0.02995,-0.02322,0.05704,-0.06415
6,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.03412,-0.012344,-0.01,0.02478,-0.02739,0.01962,-0.002752,-0.00439,0.0304,-0.04514
7,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.00794,0.02284,-0.005898,0.0459,-0.02171,0.04556,0.01865,0.02054,0.018,-0.0673
8,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.02246,-0.0232,-0.00638,0.0768,-0.03049,0.045,0.03674,0.04178,0.02174,-0.035
9,0.0,0.07,0.003983,0.02249,-0.0743,0.00631,0.01775,-0.02373,-0.06018,0.0415,...,0.03732,-0.01396,-0.01461,0.02638,-0.01892,0.05264,0.007202,0.02704,0.02286,-0.0424


In [26]:
from sklearn.model_selection import train_test_split

X, y = df.drop(columns=['target']), df['target']
X_fulltrain, X_test, y_fulltrain, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_fulltrain, y_fulltrain, test_size=0.2, random_state=42, stratify=y_fulltrain)

X_train.head(10)

Unnamed: 0,vacancy_1,vacancy_2,vacancy_3,vacancy_4,vacancy_5,vacancy_6,vacancy_7,vacancy_8,vacancy_9,vacancy_10,...,cv_303,cv_304,cv_305,cv_306,cv_307,cv_308,cv_309,cv_310,cv_311,cv_312
33415,0.0625,0.0824,0.037,-0.1004,-0.009476,0.01599,-0.0442,-0.02539,-0.01949,0.0225,...,0.02246,-0.0232,-0.00638,0.0768,-0.03049,0.045,0.03674,0.04178,0.02174,-0.035
51516,0.08936,0.0847,0.02184,-0.08923,0.01407,0.004345,-0.0328,-0.01189,-0.01254,0.04996,...,0.01312,-0.000911,-0.01723,0.07556,-0.00489,0.02467,0.01254,0.008446,0.04187,-0.0349
9774,0.01804,0.05457,0.03876,-0.0759,-0.005672,0.02403,-0.02345,-0.06168,-0.003038,0.02925,...,0.04828,-0.0658,-0.0275,0.0365,-0.06396,0.0763,0.005905,0.02475,0.04065,-0.007996
23688,0.04364,0.02667,0.00464,-0.07367,-0.03903,0.06964,-0.076,-0.11597,0.009575,-0.01753,...,0.0459,-0.01019,-0.0274,0.0774,-0.00573,0.03041,-0.001741,0.02043,0.04703,-0.02534
21265,0.03778,0.02924,0.007484,-0.0698,-0.0362,0.05527,-0.0742,-0.10815,0.015076,-0.01324,...,0.04578,0.01758,-0.0258,0.0463,-0.0482,0.02083,0.001442,0.02133,0.05,-0.04852
41993,0.05576,0.0657,0.0348,-0.06494,0.01181,-0.01666,-0.04105,-0.06168,0.02844,0.03912,...,0.05038,-0.01683,-0.016,0.04324,-0.0306,0.01133,0.02504,0.006065,0.0459,-0.03882
27810,0.0563,0.03464,0.04288,-0.06976,-0.000862,0.000509,-0.05927,-0.0784,0.0238,0.01143,...,0.01036,0.01862,0.01408,0.019,0.03244,0.03156,-0.003233,0.0818,0.006237,-0.00597
46969,0.0611,0.0693,0.0308,-0.0889,0.01349,0.02454,-0.02826,-0.02048,0.00167,0.03574,...,0.01721,-0.04443,-0.000669,0.01935,-0.03513,0.03375,0.014435,0.03044,0.03992,-0.05954
41202,0.05417,0.0775,0.03078,-0.1041,0.01582,0.00777,-0.02666,-0.02255,-0.03012,0.05112,...,0.0693,-0.02632,-0.03296,0.06793,-0.02266,0.03458,0.0434,-0.009705,0.0564,-0.03287
18459,0.0964,0.05414,0.02832,-0.08655,0.0203,0.03044,-0.04922,-0.02646,-0.01268,0.0318,...,0.02168,0.02637,-0.000685,0.0548,-0.02377,0.02441,-0.01569,0.03665,0.042,-0.02518


In [27]:
y_train.value_counts(), y_val.value_counts(), y_test.value_counts()

(0.0    37415
 1.0     3758
 Name: target, dtype: int64,
 0.0    9354
 1.0     940
 Name: target, dtype: int64,
 0.0    11692
 1.0     1175
 Name: target, dtype: int64)

In [28]:
model = XGBClassifier(
    n_estimators=400,
    max_depth=3,
    learning_rate=0.1,
    # subsample=best['subsample'],
    # colsample_bytree=best['colsample_bytree'],
    eval_metric='auc',
    tree_method='hist',
    n_jobs=-1,
    use_label_encoder=False
)

model.fit(X_fulltrain, y_fulltrain, verbose=False)
pred_fulltrain = model.predict(X_fulltrain)
pred_test = model.predict(X_test)

print(f'Train XGBoost accuracy: {accuracy_score(y_fulltrain, pred_fulltrain):.5f}')
print(f'Test XGBoost accuracy:  {accuracy_score(y_test, pred_test):.5f}')

print(f'Train XGBoost AUC-ROC: {roc_auc_score(y_fulltrain, pred_fulltrain):.5f}')
print(f'Test XGBoost AUC-ROC: {roc_auc_score(y_test, pred_test):.5f}')

precision_train, recall_train, _ = precision_recall_curve(y_fulltrain, pred_fulltrain)
precision, recall, _ = precision_recall_curve(y_test, pred_test)
print(f'Train XGBoost AUC-PR: {auc(recall_train, precision_train):.5f}')
print(f'Test XGBoost AUC-PR: {auc(recall, precision):.5f}')

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Train XGBoost accuracy: 0.98304
Test XGBoost accuracy:  0.97109
Train XGBoost AUC-ROC: 0.91302
Test XGBoost AUC-ROC: 0.85357
Train XGBoost AUC-PR: 0.91372
Test XGBoost AUC-PR: 0.85023
