In [25]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")

In [5]:
pip install lightgbm -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
df = pd.read_csv('base_dt.csv')
df.head(3)

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,riders,target_reg,target_class,total_car_age,servis_car_age,...,max_car_speed,count_car_rides,upquart_car_speed,start_car_devnorm,driver_range,active_ride,accident_rate,worker_experience,work_meanduration_car,month_top_carfix
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,76163,108.53,another_bug,8,2,...,180.855726,169.0,105.0,0.0,1097.0,0.729018,0.007975,275.685714,26.657143,4.0
1,O41613818T,VW Polo VI,economy,petrol,3.9,78218,35.2,electro_bug,8,2,...,187.862734,174.0,109.660112,-7.057,1187.5,0.694342,0.006158,264.628571,24.942857,1.0
2,d-2109686j,Renault Sandero,standart,petrol,6.3,23340,38.62,gear_stick,11,6,...,102.382857,165.0,74.0,0.0,1206.0,0.75817,0.007472,268.314286,26.142857,3.0


In [5]:
df.drop(columns=['car_id'], inplace=True, errors = 'ignore') # Удаляю признак car_id, т.к. он не несет смысловой нагрузки
df['month_top_carfix'] = df['month_top_carfix'].astype(str) # Представляет собой нумерацию месяцев от 1 до 12, что является категориальным признаком
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2337 entries, 0 to 2336
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   model                  2337 non-null   object 
 1   car_type               2337 non-null   object 
 2   fuel_type              2337 non-null   object 
 3   car_rating             2337 non-null   float64
 4   riders                 2337 non-null   int64  
 5   target_reg             2337 non-null   float64
 6   target_class           2337 non-null   object 
 7   total_car_age          2337 non-null   int64  
 8   servis_car_age         2337 non-null   int64  
 9   shift_car_devnorm      2337 non-null   float64
 10  min_car_rating         2337 non-null   float64
 11  mean_car_rating        2337 non-null   float64
 12  sum_car_distance       2337 non-null   float64
 13  max_car_speed          2337 non-null   float64
 14  count_car_rides        2337 non-null   float64
 15  upqu

In [6]:
df['total_car_age'] = df['total_car_age'].apply(lambda x: 0 if x<9 else 1)

drop_cols = ['target_class', 'target_reg']
cat_cols = ['car_type', 'fuel_type', 'model', 'month_top_carfix']
df[cat_cols] = df[cat_cols].astype('category')

In [7]:
# Переведем категориальные признаки в one-hot
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder()
encoded_features = one_hot_encoder.fit_transform(df[cat_cols]).toarray()

encoded_df = pd.DataFrame(encoded_features, columns=one_hot_encoder.get_feature_names_out(cat_cols))
df_enc = df.drop(cat_cols, axis=1).join(encoded_df)

In [8]:
X = df_enc.drop(drop_cols, axis=1, errors = 'ignore')
y = df_enc['target_class'].fillna(0)

In [9]:
from sklearn.preprocessing import LabelEncoder

num_classes = len(set(y))
le = LabelEncoder()
y_dum = le.fit_transform(y)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y_dum, test_size=0.2,
                                                  random_state=43, stratify=y_dum)

### RandomForest

In [11]:
rf = RandomForestClassifier(n_estimators=3, random_state=42, n_jobs=-1)

rf.fit(X_train, y_train)

In [12]:
y_rf = rf.predict(X_test)
prob_rf = rf.predict_proba(X_test) # Предсказание для Hard voting

accuracy = rf.score(X_test, y_test)
print(f"Accuracy: {accuracy:.5f}")

Accuracy: 0.92735


### Catboost

In [13]:
cb = CatBoostClassifier(random_state=42, thread_count=-1)
cb.fit(X_train, y_train, 
          eval_set=(X_test, y_test),
          verbose=100, plot=False, 
          early_stopping_rounds=100)

Learning rate set to 0.109335
0:	learn: 1.6541843	test: 1.6650642	best: 1.6650642 (0)	total: 65.1ms	remaining: 1m 5s
100:	learn: 0.0803422	test: 0.1062886	best: 0.1062886 (100)	total: 1.43s	remaining: 12.7s
200:	learn: 0.0497370	test: 0.0935299	best: 0.0935299 (200)	total: 2.78s	remaining: 11.1s
300:	learn: 0.0331827	test: 0.0904235	best: 0.0904212 (299)	total: 4.13s	remaining: 9.59s
400:	learn: 0.0239081	test: 0.0861753	best: 0.0861753 (400)	total: 5.63s	remaining: 8.41s
500:	learn: 0.0189486	test: 0.0854528	best: 0.0852715 (489)	total: 6.95s	remaining: 6.92s
600:	learn: 0.0153044	test: 0.0852271	best: 0.0846015 (547)	total: 8.28s	remaining: 5.5s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.08460148289
bestIteration = 547

Shrink model to first 548 iterations.


<catboost.core.CatBoostClassifier at 0x7f275384abc0>

In [14]:
y_cb = cb.predict(X_test)
prob_cb = cb.predict_proba(X_test) # Предсказание для Hard voting

accuracy = cb.score(X_test, y_test)
print(f"Accuracy: {accuracy:.5f}")

Accuracy: 0.97222


### LightGBM

In [15]:
lg = LGBMClassifier(random_state=43,
        objective="multiclass",
        boosting_type="goss",
        n_estimators=200,
        n_jobs=-1,
        verbose=-1,
)

lg.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[45]	valid_0's multi_logloss: 0.0936281


In [16]:
y_lg = lg.predict(X_test)
prob_lg = lg.predict_proba(X_test) # Предсказание для Hard voting

accuracy = lg.score(X_test, y_test)
print(f"Accuracy: {accuracy:.5f}")

Accuracy: 0.96581


### XGBClassifier

In [17]:
xb = xgb.XGBClassifier(
    random_state=43,
    objective="multi:softmax",  # Используем multi:softmax для мультиклассовой классификации
    booster="dart",  # Используем метод dart
    n_estimators=200,
    n_jobs=-1,
    verbosity=1,
    num_class=num_classes
)

xb.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=50,
    verbose=True,
)

[0]	validation_0-mlogloss:1.18179
[1]	validation_0-mlogloss:0.84759
[2]	validation_0-mlogloss:0.64229
[3]	validation_0-mlogloss:0.50025
[4]	validation_0-mlogloss:0.39818
[5]	validation_0-mlogloss:0.32110


[6]	validation_0-mlogloss:0.26542
[7]	validation_0-mlogloss:0.22289
[8]	validation_0-mlogloss:0.18969
[9]	validation_0-mlogloss:0.16507
[10]	validation_0-mlogloss:0.14696
[11]	validation_0-mlogloss:0.13180
[12]	validation_0-mlogloss:0.12032
[13]	validation_0-mlogloss:0.11230
[14]	validation_0-mlogloss:0.10514
[15]	validation_0-mlogloss:0.10165
[16]	validation_0-mlogloss:0.09785
[17]	validation_0-mlogloss:0.09631
[18]	validation_0-mlogloss:0.09537
[19]	validation_0-mlogloss:0.09326
[20]	validation_0-mlogloss:0.09267
[21]	validation_0-mlogloss:0.09245
[22]	validation_0-mlogloss:0.09201
[23]	validation_0-mlogloss:0.09179
[24]	validation_0-mlogloss:0.09159
[25]	validation_0-mlogloss:0.09096
[26]	validation_0-mlogloss:0.09054
[27]	validation_0-mlogloss:0.09020
[28]	validation_0-mlogloss:0.09013
[29]	validation_0-mlogloss:0.08966
[30]	validation_0-mlogloss:0.08980
[31]	validation_0-mlogloss:0.09046
[32]	validation_0-mlogloss:0.09045
[33]	validation_0-mlogloss:0.09036
[34]	validation_0-mloglo

In [18]:
y_xb = xb.predict(X_test)
prob_xb = xb.predict_proba(X_test) # Предсказание для Hard voting

accuracy = xb.score(X_test, y_test)
print(f"Accuracy: {accuracy:.5f}")

Accuracy: 0.97222


#### xb=97222, cb=97222, lg=96581, rf=92735 - почти в ногу, лишь rf отстает

### Soft voting

In [22]:
xbc = xgb.XGBClassifier(random_state=43, objective="multi:softmax", booster="dart", n_estimators=200, n_jobs=-1, num_class=num_classes)
lgb = LGBMClassifier(random_state=43, objective="multiclass", boosting_type="goss", n_estimators=45, n_jobs=-1, verbose=-1)
cbc = CatBoostClassifier(random_state=42, learning_rate=0.109335, thread_count=-1)
rfc = RandomForestClassifier(n_estimators=3, random_state=42, n_jobs=-1)

weights = [0.15, 0.25, 0.3, 0.3] # акцентирование на лучшей моделе показал лучший скор, чем использование пропорций (_share)
models = [('rf', rfc), ('lgbm', lgb), ('cb', cbc), ('xb', xbc)]

voting = VotingClassifier(estimators=models, weights=weights, voting='soft')

voting.fit(X_train, y_train)

y_preds = voting.predict(X_test)

print(accuracy_score(y_test, y_preds))

0:	learn: 1.6541843	total: 19.1ms	remaining: 19.1s
1:	learn: 1.3986818	total: 33.3ms	remaining: 16.6s
2:	learn: 1.2175685	total: 47.1ms	remaining: 15.6s
3:	learn: 1.0588690	total: 61.4ms	remaining: 15.3s
4:	learn: 0.9268687	total: 75.5ms	remaining: 15s
5:	learn: 0.8236112	total: 89.2ms	remaining: 14.8s
6:	learn: 0.7505952	total: 103ms	remaining: 14.6s
7:	learn: 0.6803395	total: 116ms	remaining: 14.4s
8:	learn: 0.6214425	total: 130ms	remaining: 14.3s
9:	learn: 0.5717934	total: 143ms	remaining: 14.2s
10:	learn: 0.5250349	total: 157ms	remaining: 14.1s
11:	learn: 0.4876329	total: 171ms	remaining: 14.1s
12:	learn: 0.4519976	total: 184ms	remaining: 14s
13:	learn: 0.4224098	total: 197ms	remaining: 13.9s
14:	learn: 0.3939456	total: 211ms	remaining: 13.9s
15:	learn: 0.3716349	total: 226ms	remaining: 13.9s
16:	learn: 0.3471706	total: 240ms	remaining: 13.9s
17:	learn: 0.3291114	total: 254ms	remaining: 13.8s
18:	learn: 0.3104016	total: 267ms	remaining: 13.8s
19:	learn: 0.2930066	total: 281ms	remai

#### Скор немного вырос. В соло лучший скор составил 0.97222, а ансамбль soft voting (ручная настройка весов)- 0.97863
### Далее Hard voting

In [18]:
import optuna

optuna.logging.set_verbosity(optuna.logging.ERROR)
def objective(trial): # Функция для расчета оптимального коэффициента

    a = trial.suggest_float('a', 0, 1)
    b = trial.suggest_float('b', 0, 1 - a)
    c = trial.suggest_float('c', 0, 1 - a - b)
    d = 1 - a - b - c
    
    voting_predictions_proba = a * prob_rf + b * prob_cb + c * prob_lg + d * prob_xb
    voting_predictions = np.argmax(voting_predictions_proba, axis=1)
    
    score = accuracy_score(y_test, voting_predictions)
    
    return score

In [19]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_weights = study.best_params
best_weights

{'a': 0.2568955724748905, 'b': 0.18386626073960094, 'c': 0.15641856988177824}

In [22]:
best_a = best_weights['a']
best_b = best_weights['b']
best_c = best_weights['c']
best_d = 1 - best_a - best_b - best_c

voting_predictions_proba = best_a * prob_rf + best_b * prob_cb + best_c * prob_lg + best_d * prob_xb # Получаем взвешенное предсказание вероятностей
voting_predictions = np.argmax(voting_predictions_proba, axis=1) # Извлекаем наиболее вероятное предсказание
final_score = accuracy_score(y_test, voting_predictions)

print(f'Точность окончательной модели: {final_score}')

Точность окончательной модели: 0.9764957264957265


#### Soft voting с ручной настройкой весов показал лучший скор 0.97863, а Hard voting - 0.97649

In [38]:
from scipy.stats import mode

y_cb = y_cb.ravel() # Преобразование в 1D массив, catboost выдает предсказание в виде 2 мерного массива
predictions = np.vstack((y_rf, y_cb, y_lg, y_xb))
hard_voting_predictions, _ = mode(predictions, axis=0) # Предсказание методом выбора самого частого

accuracy_score(y_test, hard_voting_predictions)

0.9700854700854701

#### Бленд решением большинства показал скор меньший 0.97008 по сравнению с предыдущими экспериментами
#### Далее эксперимент: использую веса, расчитанные оптуной, для soft voting

In [23]:
xbc = xgb.XGBClassifier(random_state=43, objective="multi:softmax", booster="dart", n_estimators=200, n_jobs=-1, num_class=num_classes)
lgb = LGBMClassifier(random_state=43, objective="multiclass", boosting_type="goss", n_estimators=45, n_jobs=-1, verbose=-1)
cbc = CatBoostClassifier(random_state=42, learning_rate=0.109335, thread_count=-1)
rfc = RandomForestClassifier(n_estimators=3, random_state=42, n_jobs=-1)

weights = [best_a, best_c, best_b, best_d]
models = [('rf', rfc), ('lgbm', lgb), ('cb', cbc), ('xb', xbc)]

voting = VotingClassifier(estimators=models, weights=weights, voting='soft')

voting.fit(X_train, y_train)

y_preds = voting.predict(X_test)

print(accuracy_score(y_test, y_preds))

0:	learn: 1.6541843	total: 22.6ms	remaining: 22.5s
1:	learn: 1.3986818	total: 38.7ms	remaining: 19.3s
2:	learn: 1.2175685	total: 54.1ms	remaining: 18s
3:	learn: 1.0588690	total: 69.6ms	remaining: 17.3s
4:	learn: 0.9268687	total: 83.1ms	remaining: 16.5s
5:	learn: 0.8236112	total: 97ms	remaining: 16.1s
6:	learn: 0.7505952	total: 111ms	remaining: 15.7s
7:	learn: 0.6803395	total: 124ms	remaining: 15.4s
8:	learn: 0.6214425	total: 138ms	remaining: 15.1s
9:	learn: 0.5717934	total: 151ms	remaining: 14.9s
10:	learn: 0.5250349	total: 164ms	remaining: 14.7s
11:	learn: 0.4876329	total: 177ms	remaining: 14.6s
12:	learn: 0.4519976	total: 191ms	remaining: 14.5s
13:	learn: 0.4224098	total: 204ms	remaining: 14.4s
14:	learn: 0.3939456	total: 217ms	remaining: 14.2s
15:	learn: 0.3716349	total: 231ms	remaining: 14.2s
16:	learn: 0.3471706	total: 246ms	remaining: 14.2s
17:	learn: 0.3291114	total: 259ms	remaining: 14.1s
18:	learn: 0.3104016	total: 272ms	remaining: 14s
19:	learn: 0.2930066	total: 285ms	remaini

#### Эксперимент применения луших весов, расчитаных optuna, в ансамбле soft voting дал скор 0.97649, что меньше, чем soft voting (ручная настройка весов)- 0.97863