In [None]:
!pip install catboost



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor
from catboost import Pool
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
import time
import xgboost as xgb
from matplotlib import pylab as plt

In [None]:

# Read the salaries dataset from the working directory and preview it.
df = pd.read_csv(filepath_or_buffer="ds_salaries.csv")
df.head(n=5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [None]:
# Target is the salary converted to USD; everything else starts as a feature.
y = df["salary_in_usd"]
X = df.drop(columns="salary_in_usd")

# 80% goes to train; the remaining 20% is halved into validation and test.
X_train, X1, y_train, y1 = train_test_split(X, y, random_state=20, test_size=0.2)
X_val, X_test, y_val, y_test = train_test_split(X1, y1, random_state=20, test_size=0.5)

# Remove the raw `salary` column from every feature split
# (presumably because, together with the currency, it would leak the target).
X_train, X_val, X_test = (
    split.drop(columns="salary") for split in (X_train, X_val, X_test)
)




In [None]:
# Sanity-check the validation features after the split.
X_val.head(n=5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_currency,employee_residence,remote_ratio,company_location,company_size
193,2023,SE,FT,Data Scientist,USD,US,0,US,M
2399,2022,SE,FT,Data Scientist,USD,US,0,US,M
45,2023,SE,FT,Data Engineer,USD,US,0,US,M
255,2023,SE,FT,Research Engineer,USD,US,0,US,M
3320,2022,SE,FT,Data Scientist,USD,US,100,US,M


In [None]:
# One-hot encode every feature column. The encoder is fitted on the training
# split only; categories unseen during fit are encoded as all zeros.
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
enc.fit(X_train)
X_train_encoded = enc.transform(X_train)
X_val_encoded = enc.transform(X_val)
X_test_encoded = enc.transform(X_test)

# Baseline: ordinary least squares on the one-hot features.
# NOTE(review): the recorded MAPE/RMSE for this model are astronomically
# large; presumably the unregularised fit on collinear one-hot columns
# produces exploding coefficients — TODO confirm and consider regularisation.
model = LinearRegression()
model.fit(X_train_encoded, y_train)

# Validation metrics.
y_val_pred = model.predict(X_val_encoded)
mape_val = mean_absolute_percentage_error(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print('MAPE на валидационной выборке: ', mape_val)
print('RMSE на валидационной выборке: ', rmse_val)

# Test metrics.
y_test_pred = model.predict(X_test_encoded)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('MAPE на тестовой выборке: ', mape_test)
print('RMSE на тестовой выборке: ', rmse_test)




MAPE на валидационной выборке:  7357146135.4822035
RMSE на валидационной выборке:  5193727275414459.0
MAPE на тестовой выборке:  77447896815.8573
RMSE на тестовой выборке:  9257911648289612.0


In [None]:
# Hyper-parameter grid explored for XGBoost.
params = {
    'max_depth': [1, 3],
    'learning_rate': [0.01, 0.05],
    'n_estimators': [50, 100],
    'gamma': [0, 0.1],
    'min_child_weight': [1, 3],
}

# No warm-up fit is needed: GridSearchCV clones the estimator before fitting.
# (Passing eval_metric="auc" to fit() is also inappropriate here: AUC is a
# classification metric, and recent XGBoost releases no longer accept
# eval_metric as a fit() argument.)
model_xgb = XGBRegressor()
gsearch = GridSearchCV(
    estimator=model_xgb,
    param_grid=params,
    scoring="neg_mean_absolute_percentage_error",
    n_jobs=-1,
    cv=5,
)

# NOTE(review): the search is cross-validated on the validation split only,
# while the final model below is trained on the training split.
gsearch.fit(X_val_encoded, y_val)

best_params = gsearch.best_params_
print(best_params)

# Apply the tuned values programmatically instead of copying them by hand
# from the printed output, so the cell stays correct if the grid changes.
model_xgb.set_params(**best_params)

# perf_counter() is a monotonic clock intended for measuring durations.
start_time = time.perf_counter()
model_xgb.fit(X_train_encoded, y_train)
end_time = time.perf_counter()
time_XGB = end_time - start_time

# Evaluate on the held-out test split.
y_test_pred = model_xgb.predict(X_test_encoded)
mape_XGB = mean_absolute_percentage_error(y_test, y_test_pred)
rmse_XGB = np.sqrt(mean_squared_error(y_test, y_test_pred))

print('MAPE на тестовой выборке: ', mape_XGB)
print('RMSE на тестовой выборке: ', rmse_XGB)
print('Время обучения: ', time_XGB)








{'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100}
MAPE на тестовой выборке:  0.4937659190521507
RMSE на тестовой выборке:  46295.56985030311
Время обучения:  5.00376558303833


In [None]:
# Hyper-parameter grid explored for CatBoost on the one-hot encoded features.
params = {
    'depth': [1, 3],
    'learning_rate': [0.01, 0.05],
    'iterations': [100, 500],
    'l2_leaf_reg': [1, 3],
}

# No warm-up fit is needed: GridSearchCV clones the estimator before fitting.
cat = CatBoostRegressor(verbose=0)
gsearch = GridSearchCV(
    estimator=cat,
    param_grid=params,
    cv=3,
    scoring='neg_mean_absolute_percentage_error',
    n_jobs=-1,
)
# NOTE(review): the search is cross-validated on the validation split only,
# while the final model below is trained on the training split.
gsearch.fit(X_val_encoded, y_val)
print(gsearch.best_params_)

# CatBoost keeps its hyper-parameters in the constructor/init params, so plain
# attribute assignment (e.g. `cat.depth = 3`) does NOT change what fit() uses
# and the model would silently train with defaults. set_params() updates the
# real parameters.
cat.set_params(**gsearch.best_params_)

# perf_counter() is a monotonic clock intended for measuring durations.
start_time = time.perf_counter()
cat.fit(X_train_encoded, y_train)
end_time = time.perf_counter()
time_cat = end_time - start_time

# Evaluate on the held-out test split.
y_test_pred = cat.predict(X_test_encoded)
mape_cat = mean_absolute_percentage_error(y_test, y_test_pred)
rmse_cat = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('MAPE на тестовой выборке: ', mape_cat)
print('RMSE на тестовой выборке: ', rmse_cat)
print('Время обучения: ', time_cat)


{'depth': 3, 'iterations': 500, 'l2_leaf_reg': 3, 'learning_rate': 0.05}
MAPE на тестовой выборке:  0.44209555705022563
RMSE на тестовой выборке:  45884.89580492139
Время обучения:  4.6000306606292725


In [None]:
# Columns CatBoost should treat as categorical natively (no one-hot encoding).
# Derived from the frame that is actually passed to the model, so the list can
# never contain a column that was dropped from the features.
cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

# The grid is defined in this cell so the result does not depend on whichever
# cell last assigned the global `params`.
params = {
    'depth': [1, 3],
    'learning_rate': [0.01, 0.05],
    'iterations': [100, 500],
    'l2_leaf_reg': [1, 3],
}

cat1 = CatBoostRegressor(verbose=0, cat_features=cat_features)
gsearch = GridSearchCV(
    estimator=cat1,
    param_grid=params,
    cv=3,
    scoring='neg_mean_absolute_percentage_error',
    n_jobs=-1,
)
gsearch.fit(X_train, y_train)
print(gsearch.best_params_)

# CatBoost keeps its hyper-parameters in the constructor/init params, so plain
# attribute assignment (e.g. `cat1.depth = 3`) does NOT change what fit() uses.
# set_params() updates the real parameters.
cat1.set_params(**gsearch.best_params_)

# perf_counter() is a monotonic clock intended for measuring durations.
start_time = time.perf_counter()
cat1.fit(train_pool)
end_time = time.perf_counter()
time_cat_with_pool = end_time - start_time

# Evaluate on the held-out test split.
y_pred = cat1.predict(test_pool)
mape_cat_with_pool = mean_absolute_percentage_error(y_test, y_pred)
rmse_cat_with_pool = np.sqrt(mean_squared_error(y_test, y_pred))
print('MAPE на тестовой выборке: ', mape_cat_with_pool)
print('RMSE на тестовой выборке: ', rmse_cat_with_pool)
print('Время обучения: ', time_cat_with_pool)


{'depth': 3, 'iterations': 500, 'l2_leaf_reg': 1, 'learning_rate': 0.05}
MAPE на тестовой выборке:  0.5240777526555472
RMSE на тестовой выборке:  48523.6010061081
Время обучения:  8.31374979019165


In [None]:
# Hyper-parameter grid explored for LightGBM.
params = {
    'max_depth': [-1, 3, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'num_leaves': [10, 50, 100],
}
model = LGBMRegressor()

# NOTE(review): this search optimises MSE, whereas the other searches in this
# notebook optimise MAPE — confirm the difference is intentional.
# NOTE(review): the search is cross-validated on the validation split only,
# while the final model below is trained on the training split.
grid_search = GridSearchCV(model, params, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_val_encoded, y_val)

best_params = grid_search.best_params_
print(best_params)

# Apply the tuned values programmatically instead of copying them by hand
# from the printed output, so the cell stays correct if the grid changes.
model.set_params(**best_params)

# perf_counter() is a monotonic clock intended for measuring durations.
start_time = time.perf_counter()
model.fit(X_train_encoded, y_train)
end_time = time.perf_counter()
time_LGBM = end_time - start_time

# Evaluate on the held-out test split.
y_pred = model.predict(X_test_encoded)

mape_LGBM = mean_absolute_percentage_error(y_test, y_pred)
rmse_LGBM = np.sqrt(mean_squared_error(y_test, y_pred))
print('MAPE на тестовой выборке: ', mape_LGBM)
print('RMSE на тестовой выборке: ', rmse_LGBM)
print('Время обучения: ', time_LGBM)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000097 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38
[LightGBM] [Info] Number of data points in the train set: 375, number of used features: 19
[LightGBM] [Info] Start training from score 136899.056000
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'num_leaves': 10}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001010 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 3004, number of used features: 47
[LightGBM] [Info] Start training from score 138675.276298
MAPE на тестовой выборке:  0.48477761347357445
RMSE на тестовой выборке:  46557.06879451289
Время обучения

**Вывод.**


*   Из всех четырех моделей лучше всего по характеристике MAPE показала себя модель catboost, на втором месте с небольшим разрывом по точности оказалась модель lightGBM. Хуже всего себя показала модель catboost с использованием pool для передачи данных в модель.
*   По характеристике RMSE результат оказался схожим, но не идентичным. Лучший результат показала модель catboost (≈45885), далее с небольшим отставанием идут XGBoost (≈46296) и lightGBM (≈46557). Хуже всего себя также показала catboost с использованием pool (≈48524).

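Быстрее всего обучалась модель lightGBM, причем с достаточно большим отрывом. Медленнее всего обучалась модель catboost с использованием pool (≈8.3 с); XGBoost (≈5.0 с) и catboost на one-hot признаках (≈4.6 с) обучались сопоставимое время.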

На основе этого можно сказать, что catboost предлагает наилучшую точность, но требует больше времени на обучение. Это подходит для задач, где высокая точность является приоритетом, а время обучения не критично. В свою очередь lightGBM обеспечивает быструю и достаточно точную модель. Это делает его предпочтительным выбором в случаях, когда важно быстрое обучение при не самом лучшем, но все еще высоком уровне точности.



In [None]:
from sklearn.cluster import KMeans

In [None]:
# Download the sample ratings matrix and preview it (judging by the preview:
# one row per user, one column per artist name, plus a `user` id column).
ratings = pd.read_excel(
    "https://github.com/evgpat/edu_stepik_rec_sys/blob/main/datasets/sample_matrix.xlsx?raw=true",
    engine='openpyxl',
)
ratings.head(n=5)



Unnamed: 0,user,the beatles,radiohead,deathcab for cutie,coldplay,modest mouse,sufjan stevens,dylan. bob,red hot clili peppers,pink fluid,...,municipal waste,townes van zandt,curtis mayfield,jewel,lamb,michal w. smith,群星,agalloch,meshuggah,yellowcard
0,0,,0.020417,,,,,,0.030496,,...,,,,,,,,,,
1,1,,0.184962,0.024561,,,0.136341,,,,...,,,,,,,,,,
2,2,,,0.028635,,,,0.024559,,,...,,,,,,,,,,
3,3,,,,,,,,,,...,,,,,,,,,,
4,4,0.043529,0.086281,0.03459,0.016712,0.015935,,,,,...,,,,,,,,,,


In [None]:
# Transpose so that the former columns become rows, then keep every row
# except the one that came from the `user` column.
ratings_T = ratings.transpose()
ratings_end = ratings_T.loc[ratings_T.index != "user"]
ratings_end

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
the beatles,,,,,0.043529,,,,0.093398,0.017621,...,,,0.121169,0.038168,0.007939,0.017884,,0.076923,,
radiohead,0.020417,0.184962,,,0.086281,0.006322,,,,0.019156,...,0.017735,,,,0.011187,,,,,
deathcab for cutie,,0.024561,0.028635,,0.034590,,,,,0.013349,...,0.121344,,,,,,,,,0.027893
coldplay,,,,,0.016712,,,,,,...,0.217175,,,,,,,,,
modest mouse,,,,,0.015935,,,,,0.030437,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
michal w. smith,,,,,,,,,,,...,,,,,,,,,,
群星,,,,,,,,,,,...,,,,,,,,,,
agalloch,,,,,,,,,,,...,,,,,,,,,,
meshuggah,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Replace missing values with 0 and show one random row as a spot check.
ratings_end_end = ratings_end.fillna(value=0)
ratings_end_end.sample(n=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
flo rida,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Cluster the rows of the filled matrix into 5 groups (seed fixed for
# reproducibility) and count how many rows landed in each cluster.
kmeans = KMeans(random_state=0, n_clusters=5)
kmeans.fit(ratings_end_end)
labels = kmeans.labels_

unique, counts = np.unique(labels, return_counts=True)
cluster_sizes = {cluster: size for cluster, size in zip(unique, counts)}

print(cluster_sizes)



Размеры кластеров: {0: 996, 1: 1, 2: 1, 3: 1, 4: 1}


Разбиение на кластеры получилось практически бесполезным: почти все объекты (996 из 1000) попали в один кластер, а в каждом из четырёх остальных кластеров оказалось по одному объекту. Возможно, это произошло из-за дисбаланса классов или шума в данных. Может быть, разбиения на 5 кластеров недостаточно и нужно больше кластеров.

Макаров Иван 9 группа