In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from catboost import CatBoostClassifier

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Подключаем датасет

In [5]:
df = pd.read_csv('/content/drive/MyDrive/cars.csv')
print(df.head(5))

  manufacturer_name model_name transmission   color  odometer_value  \
0            Subaru    Outback    automatic  silver          190000   
1            Subaru    Outback    automatic    blue          290000   
2            Subaru   Forester    automatic     red          402000   
3            Subaru    Impreza   mechanical    blue           10000   
4            Subaru     Legacy    automatic   black          280000   

   year_produced engine_fuel  engine_has_gas engine_type  engine_capacity  \
0           2010    gasoline           False    gasoline              2.5   
1           2002    gasoline           False    gasoline              3.0   
2           2001    gasoline           False    gasoline              2.5   
3           1999    gasoline           False    gasoline              3.0   
4           2001    gasoline           False    gasoline              2.5   

   ... feature_1  feature_2 feature_3 feature_4  feature_5  feature_6  \
0  ...      True       True      True

### Удаление ненужных и пустых столбцов

In [6]:
columns_to_drop = ['engine_has_gas','body_type','state','drivetrain','color','model_name','engine_fuel','manufacturer_name','engine_capacity','location_region','transmission','engine_type','feature_0','feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'duration_listed']
df = df.drop(columns=columns_to_drop, axis=1)
df.dropna(inplace=True)

print(df.head(5))

   odometer_value  year_produced  has_warranty  price_usd  is_exchangeable  \
0          190000           2010         False   10900.00            False   
1          290000           2002         False    5000.00             True   
2          402000           2001         False    2800.00             True   
3           10000           1999         False    9999.00             True   
4          280000           2001         False    2134.11             True   

   number_of_photos  up_counter  
0                 9          13  
1                12          54  
2                 4          72  
3                 9          42  
4                14           7  


### Кодирование текстовых значений

In [7]:
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

print(df.head(5))

   odometer_value  year_produced  has_warranty  price_usd  is_exchangeable  \
0          190000           2010         False   10900.00            False   
1          290000           2002         False    5000.00             True   
2          402000           2001         False    2800.00             True   
3           10000           1999         False    9999.00             True   
4          280000           2001         False    2134.11             True   

   number_of_photos  up_counter  
0                 9          13  
1                12          54  
2                 4          72  
3                 9          42  
4                14           7  


Предполагаем, что столбец is_exchangeable - это метки классов

In [8]:
X = df.drop(columns=['is_exchangeable'])
y = df['is_exchangeable']

Разбиваем данные на обучающий и тестовый наборы

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Стандартизация данных

In [10]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Функция для выполнения GridSearchCV и вывода метрик

In [11]:
def evaluate_model_with_gridsearch(model, param_grid, X_train, y_train, X_test, y_test):
    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    print(f"Лучшие параметры для {model.__class__.__name__}: {grid_search.best_params_}")

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    print(f'Сравнение метрик для {model.__class__.__name__}: \n{classification_report(y_test, y_pred)}\n')
    print(f'Матрица ошибок для {model.__class__.__name__}: \n{confusion_matrix(y_test, y_pred)}\n')


Определение гиперпараметров для CatBoostClassifier


In [12]:
param_grid_catboost = {
    'iterations': [100, 200, 300],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1]
}

Оценка CatBoostClassifier

In [13]:
evaluate_model_with_gridsearch(CatBoostClassifier(silent=True), param_grid_catboost, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Лучшие параметры для CatBoostClassifier: {'depth': 6, 'iterations': 100, 'learning_rate': 0.1}
Сравнение метрик для CatBoostClassifier: 
              precision    recall  f1-score   support

       False       0.70      0.90      0.79      5067
        True       0.59      0.28      0.38      2640

    accuracy                           0.69      7707
   macro avg       0.65      0.59      0.58      7707
weighted avg       0.66      0.69      0.65      7707


Матрица ошибок для CatBoostClassifier: 
[[4548  519]
 [1906  734]]

