In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error
from sklearn.linear_model import Ridge, LogisticRegression

In [2]:
# Colab-only step: mount Google Drive at /content/drive so that files stored in
# the Drive can be read through ordinary file paths under that directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Подключаем датасет

In [3]:
# Read the cars dataset from the mounted Drive and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/cars.csv')
print(df.head(n=5))

  manufacturer_name model_name transmission   color  odometer_value  \
0            Subaru    Outback    automatic  silver          190000   
1            Subaru    Outback    automatic    blue          290000   
2            Subaru   Forester    automatic     red          402000   
3            Subaru    Impreza   mechanical    blue           10000   
4            Subaru     Legacy    automatic   black          280000   

   year_produced engine_fuel  engine_has_gas engine_type  engine_capacity  \
0           2010    gasoline           False    gasoline              2.5   
1           2002    gasoline           False    gasoline              3.0   
2           2001    gasoline           False    gasoline              2.5   
3           1999    gasoline           False    gasoline              3.0   
4           2001    gasoline           False    gasoline              2.5   

   ... feature_1  feature_2 feature_3 feature_4  feature_5  feature_6  \
0  ...      True       True      True

### Удаление ненужных столбцов и строк с пропусками

In [4]:
# Columns that are not used as model inputs in this notebook.
columns_to_drop = [
    'engine_has_gas', 'body_type', 'state', 'drivetrain', 'color',
    'model_name', 'engine_fuel', 'manufacturer_name', 'engine_capacity',
    'location_region', 'transmission', 'engine_type',
    'feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
    'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
    'duration_listed',
]
# `df` is reassigned here, so on a second execution the listed columns are
# already gone; errors='ignore' keeps the cell re-runnable instead of raising
# KeyError. dropna() then removes every row that still has a missing value.
df = df.drop(columns=columns_to_drop, errors='ignore').dropna()

print(df.head(5))

   odometer_value  year_produced  has_warranty  price_usd  is_exchangeable  \
0          190000           2010         False   10900.00            False   
1          290000           2002         False    5000.00             True   
2          402000           2001         False    2800.00             True   
3           10000           1999         False    9999.00             True   
4          280000           2001         False    2134.11             True   

   number_of_photos  up_counter  
0                 9          13  
1                12          54  
2                 4          72  
3                 9          42  
4                14           7  


### Кодирование текстовых значений

In [5]:
# Replace every object-dtype column with integer codes, keeping one fitted
# LabelEncoder per column so the codes can be mapped back to labels later.
# NOTE(review): the preview printed below is identical to the one before this
# cell, which suggests no object columns are left at this point - confirm.
text_columns = df.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in text_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

print(df.head(5))

   odometer_value  year_produced  has_warranty  price_usd  is_exchangeable  \
0          190000           2010         False   10900.00            False   
1          290000           2002         False    5000.00             True   
2          402000           2001         False    2800.00             True   
3           10000           1999         False    9999.00             True   
4          280000           2001         False    2134.11             True   

   number_of_photos  up_counter  
0                 9          13  
1                12          54  
2                 4          72  
3                 9          42  
4                14           7  


Предполагаем, что столбец `is_exchangeable` — это метки классов.

In [6]:
# Target: the is_exchangeable flag. Predictors: every other remaining column.
y = df['is_exchangeable']
X = df.drop('is_exchangeable', axis=1)

Разбиваем данные на обучающий и тестовый наборы

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Стандартизация данных

In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### Параметры для GridSearchCV

In [9]:
# Candidate regularisation strengths for Ridge (larger alpha = stronger penalty).
param_grid_ridge = dict(alpha=[0.1, 1.0, 10.0, 100.0, 1000.0])

# Candidate settings for LogisticRegression: inverse regularisation strength C
# and the solver iteration cap.
param_grid_logistic = dict(
    C=[0.01, 0.1, 1, 10, 100],
    max_iter=[100, 200, 300],
)

### GridSearchCV для Ridge

In [10]:
# Exhaustive 5-fold search over the Ridge alpha grid, using all CPU cores.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method.
# NOTE(review): Ridge is a regression estimator, while y_train is taken from the
# is_exchangeable flag - confirm that regressing on a binary target is intended.
ridge = Ridge()
grid_search_ridge = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid_ridge,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_ridge.fit(X_train, y_train)
print("Лучшие параметры для Ridge: ", grid_search_ridge.best_params_)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Лучшие параметры для Ridge:  {'alpha': 100.0}


### Предсказания и метрики для Ridge

In [11]:
# Score the best Ridge model found by the search on the held-out split.
best_ridge = grid_search_ridge.best_estimator_
ridge_pred = best_ridge.predict(X_test)

# Both error metrics take (y_true, y_pred); RMSE is derived from the MSE.
mae, mse = (
    metric(y_test, ridge_pred)
    for metric in (mean_absolute_error, mean_squared_error)
)
rmse = np.sqrt(mse)

print(f'Ridge MAE: {mae:.5f}')
print(f'Ridge MSE: {mse:.5f}')
print(f'Ridge RMSE: {rmse:.5f}')


Ridge MAE: 0.43236
Ridge MSE: 0.21285
Ridge RMSE: 0.46136


### GridSearchCV для LogisticRegression


In [12]:
# Exhaustive 5-fold search over the C / max_iter grid for logistic regression,
# using all CPU cores. No `scoring` is passed, so candidates are ranked by the
# estimator's own score method.
logistic = LogisticRegression()
grid_search_logistic = GridSearchCV(
    estimator=logistic,
    param_grid=param_grid_logistic,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_logistic.fit(X_train, y_train)
print("Лучшие параметры для LogisticRegression: ", grid_search_logistic.best_params_)


Fitting 5 folds for each of 15 candidates, totalling 75 fits
Лучшие параметры для LogisticRegression:  {'C': 0.01, 'max_iter': 100}


### Предсказания и метрики для LogisticRegression

In [13]:
# Score the best logistic-regression model on the held-out split.
best_logistic = grid_search_logistic.best_estimator_
logistic_pred = best_logistic.predict(X_test)

# Per-class precision/recall/F1 table and the raw confusion matrix.
report_text = classification_report(y_test, logistic_pred)
error_matrix = confusion_matrix(y_test, logistic_pred)

print(f'Сравнение метрик для LogisticRegression: \n{report_text}\n')
print(f'Матрица ошибок для LogisticRegression: \n{error_matrix}\n')

Сравнение метрик для LogisticRegression: 
              precision    recall  f1-score   support

       False       0.67      0.97      0.79      5067
        True       0.60      0.10      0.17      2640

    accuracy                           0.67      7707
   macro avg       0.64      0.53      0.48      7707
weighted avg       0.65      0.67      0.58      7707


Матрица ошибок для LogisticRegression: 
[[4890  177]
 [2375  265]]

