In [28]:
# Imports + настройки
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Experiment configuration ------------------------------------------------
TARGET = "salary_avg"   # name of the regression target column
RANDOM_STATE = 42       # seed shared by the train/test split and the CV shuffling
TEST_SIZE = 0.20        # fraction of rows held out as the test split
CV_SPLITS = 5           # number of folds used for K-fold cross-validation

# --- pandas display options (the frame is wide, so allow many columns) -------
pd.set_option("display.width", 140)
pd.set_option("display.max_columns", 200)

In [29]:
# Read the processed dataset; the path is relative to the working directory.
df = pd.read_csv('processed_emed_careers_eu_v2.csv')

# Report the dataset size and the names of the leading columns.
print("Размер датасета:", df.shape)
leading_columns = list(df.columns[:15])
print("Первые 15 колонок:", leading_columns, "...")

# Rich previews: the first rows, then per-column summary statistics
# (transposed so each column becomes a row), limited to the first 12 rows.
display(df.head())
column_summary = df.describe(include="all").T
display(column_summary.head(12))

Размер датасета: (13807, 149)
Первые 15 колонок: ['post_year', 'post_month', 'post_day', 'post_dayofweek', 'desc_len', 'desc_word_count', 'kw_remote', 'kw_bonus', 'kw_senior', 'kw_junior', 'kw_manager', 'kw_english', 'kw_german', 'kw_french', 'kw_visa'] ...


Unnamed: 0,post_year,post_month,post_day,post_dayofweek,desc_len,desc_word_count,kw_remote,kw_bonus,kw_senior,kw_junior,kw_manager,kw_english,kw_german,kw_french,kw_visa,kw_python,kw_sql,kw_ml,kw_phd,title_len,title_word_count,title_has_senior,title_has_junior,title_has_manager,salary_provided,salary_is_competitive,salary_numeric_found,salary_avg,category_Data Management and Statistics,category_France,category_Germany,category_Italy,category_Manufacturing & Operations,category_Medical Affairs / Pharmaceutical Physician,category_Medical Information and Pharmacovigilance,category_Pharmaceutical Marketing,"category_Pharmaceutical, Healthcare and Medical Sales",category_Pharmacy,category_Quality-assurance,category_Regulatory Affairs,category_Science,category_Spain,category_Switzerland,category_UK,category_science,job_type_Contract/Interim,job_type_Contract/Temp,job_type_Part-Time,job_type_Permanent,job_type_Temporary/Seasonal,company_name_40 RECRUITMENT LIMITED,company_name_ADVANCE RECRUITMENT,company_name_ADVANCED CLINICAL RECRUITMENT LIMITED,company_name_AL SOLUTIONS,company_name_ANDY FISH,company_name_APODI,company_name_ASHFIELD,company_name_AUSTIN FRASER,company_name_AXESS LTD,company_name_BARD LIMITED,company_name_BARRINGTON JAMES LTD,company_name_BCF RECRUITMENT LTD,company_name_BLACKFIELD ASSOCIATES,company_name_BLUE PELICAN LIMITED,company_name_BMS PERFORMANCE,company_name_CHASE SEARCH SELECTION LIMITED,company_name_CHASE SEARCH AMP SELECTION LIMITED,company_name_CHEMISTREE SOLUTIONS LTD,company_name_CK CLINICAL,company_name_CLINICAL PROFESSIONALS,company_name_COVANCE,company_name_CROS NT LIMITED,company_name_CSG,company_name_CW RECRUITMENT SPECIALISTS LTD,company_name_DISCOVER PEOPLE INTERNATIONAL LIMITED,company_name_DOCS INTERNATIONAL UK LIMITED,company_name_EF MEDICAL,company_name_EVOLVE SELECTION LIMITED,company_name_FRESH CONNECT RECRUITMENT CONSULTANTS LIMITED,company_name_G2 CLINICAL PROFESSIONAL RESOURCING,company_name_G2 CLINICAL AMP PROFESSIONAL 
RESOURCING,company_name_HARRIS LORD RECRUITMENT LIMITED,company_name_HAYS LIFE SCIENCES,company_name_HELIX RECRUITMENT LTD,company_name_HYPER RECRUITMENT SOLUTIONS LTD,company_name_ID SEARCH AND SELECTION LTD,company_name_IPHARM CONSULTING LTD,company_name_IQVIA LTD,company_name_IQVIATALENT MANAGEMENT CENTRE,company_name_KEY PEOPLE LIMITED,company_name_KIRKHAM YOUNG LTD,company_name_KLEBOE JARDINE LTD,company_name_MICHAEL BAILEY ASSOCIATES LIMITED,company_name_MSI GROUP LIMITED,company_name_NONSTOP RECRUITMENT,company_name_NORTH51 LTD,company_name_NOVELLA CLINICAL RESOURCING,company_name_OPTIMUS LIFE SCIENCES,company_name_OTHER,company_name_PENTLAND HOUSE,company_name_PLANET PHARMA STAFFING LIMITED,company_name_POPSCIENCE LIMITED,company_name_PPD GLOBAL LTD,company_name_PREMIER RESEARCH GROUP LIMITED,company_name_PROCLINICAL LTD,company_name_PROJECTUS LTD,company_name_QUOTIENT SCIENCES,company_name_RBW CONSULTING SOLUTIONS LTD,company_name_REAL RESOURCING,company_name_REMTEC SEARCH AND SELECTION,company_name_S E C RECRUITMENT LIMITED,company_name_SAPLING RECRUITMENT LTD,company_name_SELTEK CONSULTANTS LTD,company_name_SEVEN LIFE SCIENCES,company_name_SILCHESTER ASSOCIATES LTD,company_name_SKILLS ALLIANCE PHARMA LIMITED,company_name_STAR,company_name_SUGARMAN HEALTH WELLBEING,company_name_SYNEOS HEALTH,company_name_SYNEXUS LIMITED,company_name_THE VACANCY MANAGEMENT COMPANY,company_name_TRS CONSULTING,company_name_UK,company_name_UMBILICAL LIFE,company_name_VIFOR INTERNATIONAL AG,company_name_WARMAN OBRIEN LTD,company_name_X4 GROUP LTD,company_name_XCELLIN LTD,company_name_ZENOPA LTD,company_name_ZEST BUSINESS GROUP LIMITED,location_BIRMINGHAM,location_CAMBRIDGE,location_EUROPE,location_FRANCE,location_GERMANY,location_ITALY,location_LONDON,location_M4 CORRIDOR,location_MANCHESTER,location_NORTH WEST,location_OXFORD,location_PARIS,location_PORTUGAL,location_SCOTLAND,location_SOUTH EAST,location_SPAIN,location_SWITZERLAND,location_UK,location_UNKNOWN
0,2018.0,4.0,16.0,0.0,1905,291,1,0,1,1,1,1,0,0,0,0,0,0,0,22,3,0,0,1,0,0,0,51500.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2018.0,4.0,16.0,0.0,2536,374,0,0,0,0,0,0,0,0,0,0,0,0,0,39,5,0,0,0,0,0,0,27500.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,2018.0,4.0,13.0,4.0,1481,210,0,0,0,0,0,0,0,0,0,0,0,0,0,24,3,0,0,0,0,0,0,45000.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2018.0,4.0,10.0,1.0,2618,336,0,1,1,0,0,0,0,0,0,0,0,0,0,33,4,1,0,0,0,0,0,32500.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,2018.0,4.0,12.0,3.0,3446,543,0,1,1,0,1,0,0,0,1,0,0,0,0,26,4,0,0,1,1,0,0,48750.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
post_year,13807.0,2018.0,0.0,2018.0,2018.0,2018.0,2018.0,2018.0
post_month,13807.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0
post_day,13807.0,9.962483,3.532004,3.0,7.0,11.0,13.0,17.0
post_dayofweek,13807.0,2.726298,1.596676,0.0,1.0,3.0,4.0,6.0
desc_len,13807.0,2452.615485,1024.330339,316.0,1769.0,2262.0,2924.0,9168.0
desc_word_count,13807.0,359.606794,145.286588,42.0,262.0,334.0,425.0,1342.0
kw_remote,13807.0,0.051568,0.221161,0.0,0.0,0.0,0.0,1.0
kw_bonus,13807.0,0.296299,0.456641,0.0,0.0,0.0,1.0,1.0
kw_senior,13807.0,0.553922,0.497102,0.0,0.0,1.0,1.0,1.0
kw_junior,13807.0,0.162092,0.368548,0.0,0.0,0.0,0.0,1.0


In [30]:
# Build the feature matrix X and the target vector y, with a few checks.
if TARGET not in df.columns:
    raise ValueError(f"Целевая колонка '{TARGET}' не найдена в данных!")

# Drop rows whose target is missing; .copy() gives an independent frame.
df = df.dropna(subset=[TARGET]).copy()

# Target as float; every remaining column is used as a feature.
# NOTE(review): the features include salary_provided / salary_is_competitive /
# salary_numeric_found — confirm these flags do not leak target information.
y = df[TARGET].astype(float)
X = df.drop(columns=[TARGET])

print("X shape:", X.shape)
print("y shape:", y.shape)

# Report feature columns that are not numeric (reported only, not enforced).
non_numeric_cols = list(X.select_dtypes(exclude=[np.number]).columns)
print("Нечисловые колонки в X:", non_numeric_cols)

# Quick sanity check of the target's scale.
y_lowest, y_highest = float(y.min()), float(y.max())
print("y mean:", float(y.mean()))
print("y min/max:", y_lowest, y_highest)

X shape: (13807, 148)
y shape: (13807,)
Нечисловые колонки в X: []
y mean: 43039.60836894329
y min/max: 10000.0 162500.0


In [31]:
# Hold-out split: TEST_SIZE of the rows become the test part; the seed keeps
# the split reproducible.
split_options = {"test_size": TEST_SIZE, "random_state": RANDOM_STATE}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train:", X_train.shape, y_train.shape)
print("Test :", X_test.shape, y_test.shape)

# Compare the mean target of the two parts.
train_mean_salary = float(y_train.mean())
test_mean_salary = float(y_test.mean())
print("Средняя зарплата (train):", train_mean_salary)
print("Средняя зарплата (test) :", test_mean_salary)

Train: (11045, 148) (11045,)
Test : (2762, 148) (2762,)
Средняя зарплата (train): 43065.69341330919
Средняя зарплата (test) : 42935.296524257785


In [32]:
# Демонстрация регуляризации

def evaluate_pipeline(pipe: Pipeline, X_tr, y_tr, X_te, y_te):
    """Fit ``pipe`` on one data set and score its predictions on another.

    ``pipe`` is fitted in place, so the object passed by the caller is a
    fitted estimator after this function returns.

    Parameters
    ----------
    pipe : Pipeline
        Estimator to fit and evaluate.
    X_tr, y_tr
        Features and target passed to ``pipe.fit``.
    X_te, y_te
        Features passed to ``pipe.predict`` and the true target values the
        predictions are compared against.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, square root of the mean
        squared error, and the R^2 score.
    """
    pipe.fit(X_tr, y_tr)
    y_hat = pipe.predict(X_te)

    # RMSE is derived from the MSE by taking the square root.
    mse = mean_squared_error(y_te, y_hat)
    return (
        mean_absolute_error(y_te, y_hat),
        np.sqrt(mse),
        r2_score(y_te, y_hat),
    )

def _scaled_ridge(alpha):
    """Return an unfitted StandardScaler -> Ridge(alpha=alpha) pipeline."""
    return Pipeline([
        ("scaler", StandardScaler()),
        ("model", Ridge(alpha=alpha)),
    ])


# Two pipelines that differ only in the Ridge regularization strength.
ridge_alpha_100 = _scaled_ridge(100)
ridge_alpha_1 = _scaled_ridge(1)

# Each pipeline is fitted on the train split and scored on the test split.
demo_split = (X_train, y_train, X_test, y_test)
mae_100, rmse_100, r2_100 = evaluate_pipeline(ridge_alpha_100, *demo_split)
mae_1, rmse_1, r2_1 = evaluate_pipeline(ridge_alpha_1, *demo_split)

print(f"Ridge alpha=100 | MAE: {mae_100:.2f} | RMSE: {rmse_100:.2f} | R2: {r2_100:.4f}")
print(f"Ridge alpha=1   | MAE: {mae_1:.2f} | RMSE: {rmse_1:.2f} | R2: {r2_1:.4f}")

Ridge alpha=100 | MAE: 8694.94 | RMSE: 13350.85 | R2: 0.5705
Ridge alpha=1   | MAE: 8692.71 | RMSE: 13269.47 | R2: 0.5757


In [33]:
# Model selection with GridSearchCV: each candidate is scored by
# CV_SPLITS-fold cross-validation on the TRAIN split only.

cv = KFold(n_splits=CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# The "model" step is just a placeholder; the grid below substitutes
# Ridge / Lasso / ElasticNet for it.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", Ridge()),
])

# One sub-grid per estimator family.
ridge_grid = {
    "model": [Ridge()],
    "model__alpha": [0.01, 0.1, 1, 10, 100, 300, 1000],
}
lasso_grid = {
    "model": [Lasso(max_iter=20000)],
    "model__alpha": [0.001, 0.01, 0.1, 1, 10, 100],
}
elastic_net_grid = {
    "model": [ElasticNet(max_iter=20000)],
    "model__alpha": [0.001, 0.01, 0.1, 1, 10, 100],
    "model__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
}
param_grid = [ridge_grid, lasso_grid, elastic_net_grid]

# Candidates are ranked by (negated) mean absolute error.
search_options = dict(
    scoring="neg_mean_absolute_error",
    cv=cv,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, **search_options)

grid.fit(X_train, y_train)

best_model, best_params = grid.best_estimator_, grid.best_params_
# The score is a negated MAE, so flip the sign to report a plain MAE.
best_cv_mae = -grid.best_score_

print("\n====================")
print("BEST MODEL:", best_model)
print("BEST PARAMS:", best_params)
print("BEST CV MAE:", best_cv_mae)
print("====================")

Fitting 5 folds for each of 43 candidates, totalling 215 fits

BEST MODEL: Pipeline(steps=[('scaler', StandardScaler()),
                ('model', Lasso(alpha=100, max_iter=20000))])
BEST PARAMS: {'model': Lasso(max_iter=20000), 'model__alpha': 100}
BEST CV MAE: 8648.77100319652


In [34]:
# Final evaluation of the selected model on the held-out test split.

y_pred = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
test_mae, test_rmse, test_r2 = (
    mean_absolute_error(y_test, y_pred),
    np.sqrt(test_mse),
    r2_score(y_test, y_pred),
)

print("TEST METRICS")
print("MAE :", round(test_mae, 2))
print("RMSE:", round(test_rmse, 2))
print("R2  :", round(test_r2, 4))

# Put the error in context: how large is the MAE relative to the mean salary?
mean_test_salary = float(y_test.mean())
print("\nСправка:")
print("Средняя зарплата (test):", round(mean_test_salary, 2))
print("MAE как доля от средней:", round(float(test_mae / mean_test_salary), 4))

TEST METRICS
MAE : 8666.02
RMSE: 13461.11
R2  : 0.5634

Справка:
Средняя зарплата (test): 42935.3
MAE как доля от средней: 0.2018


In [35]:
# Cross-validate the selected model on the train split with several metrics,
# reusing the same `cv` splitter that the grid search was given.

scoring = dict(
    MAE="neg_mean_absolute_error",
    RMSE="neg_root_mean_squared_error",
    R2="r2",
)

cv_scores = cross_validate(
    best_model, X_train, y_train,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)

cv_df = pd.DataFrame(cv_scores)

# The error metrics come back negated ("neg_*" scorers); flip their sign in
# one vectorized step. The R2 columns are left untouched.
error_columns = ["test_MAE", "train_MAE", "test_RMSE", "train_RMSE"]
cv_df[error_columns] = -cv_df[error_columns]

# Per-fold table, followed by the column means across folds.
display(cv_df)
fold_means = cv_df.mean(numeric_only=True)
display(fold_means)

Unnamed: 0,fit_time,score_time,test_MAE,train_MAE,test_RMSE,train_RMSE,test_R2,train_R2
0,0.04538,0.003359,8381.953031,8637.010388,12686.980228,13046.739899,0.573118,0.562584
1,0.040615,0.002862,8706.842236,8519.161594,13273.171987,12912.278936,0.543038,0.569217
2,0.046528,0.0032,8617.820819,8568.389924,12572.296834,13073.637385,0.584813,0.559762
3,0.067892,0.001768,8734.069033,8462.25885,13669.770996,12808.477742,0.52625,0.573669
4,0.046349,0.002284,8803.169897,8527.877832,13303.233593,12890.755672,0.552917,0.567771


fit_time          0.049353
score_time        0.002695
test_MAE       8648.771003
train_MAE      8542.939718
test_RMSE     13101.090728
train_RMSE    12946.377927
test_R2           0.556027
train_R2          0.566601
dtype: float64