In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import time

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Подключаем датасет

In [3]:
df = pd.read_csv('/content/drive/MyDrive/cars.csv')
print(df.head(5))

  manufacturer_name model_name transmission   color  odometer_value  \
0            Subaru    Outback    automatic  silver          190000   
1            Subaru    Outback    automatic    blue          290000   
2            Subaru   Forester    automatic     red          402000   
3            Subaru    Impreza   mechanical    blue           10000   
4            Subaru     Legacy    automatic   black          280000   

   year_produced engine_fuel  engine_has_gas engine_type  engine_capacity  \
0           2010    gasoline           False    gasoline              2.5   
1           2002    gasoline           False    gasoline              3.0   
2           2001    gasoline           False    gasoline              2.5   
3           1999    gasoline           False    gasoline              3.0   
4           2001    gasoline           False    gasoline              2.5   

   ... feature_1  feature_2 feature_3 feature_4  feature_5  feature_6  \
0  ...      True       True      True

### Удаление ненужных и пустых столбцов

In [4]:
columns_to_drop = ['engine_has_gas','body_type','state','drivetrain','color','model_name','engine_fuel','manufacturer_name','engine_capacity','location_region','transmission','engine_type','feature_0','feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'duration_listed']
df = df.drop(columns=columns_to_drop, axis=1)
df.dropna(inplace=True)

print(df.head(5))

   odometer_value  year_produced  has_warranty  price_usd  is_exchangeable  \
0          190000           2010         False   10900.00            False   
1          290000           2002         False    5000.00             True   
2          402000           2001         False    2800.00             True   
3           10000           1999         False    9999.00             True   
4          280000           2001         False    2134.11             True   

   number_of_photos  up_counter  
0                 9          13  
1                12          54  
2                 4          72  
3                 9          42  
4                14           7  


### Кодирование текстовых значений

In [5]:
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

print(df.head(5))

   odometer_value  year_produced  has_warranty  price_usd  is_exchangeable  \
0          190000           2010         False   10900.00            False   
1          290000           2002         False    5000.00             True   
2          402000           2001         False    2800.00             True   
3           10000           1999         False    9999.00             True   
4          280000           2001         False    2134.11             True   

   number_of_photos  up_counter  
0                 9          13  
1                12          54  
2                 4          72  
3                 9          42  
4                14           7  


Предполагаем, что столбец is_exchangeable - это метки классов

In [6]:
X = df.drop(columns=['is_exchangeable'])
y = df['is_exchangeable']

Разбиваем данные на обучающий и тестовый наборы

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Стандартизация данных

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Обучение модели SGDClassifier

In [9]:
svm_sgdc = SGDClassifier()
svm_sgdc.fit(X_train, y_train)
svm_sgdc_pred = svm_sgdc.predict(X_test)

In [10]:
print(f'Сравнение метрик для SGDClassifier: \n{classification_report(y_test, svm_sgdc_pred)}\n')
print(f'Матрица ошибок для SGDClassifier: \n{confusion_matrix(y_test, svm_sgdc_pred)}\n')

Сравнение метрик для SGDClassifier: 
              precision    recall  f1-score   support

       False       0.67      0.99      0.80      5067
        True       0.73      0.05      0.09      2640

    accuracy                           0.67      7707
   macro avg       0.70      0.52      0.44      7707
weighted avg       0.69      0.67      0.56      7707


Матрица ошибок для SGDClassifier: 
[[5019   48]
 [2509  131]]



### Обучение модели LinearSVC с подбором гиперпараметров

In [11]:
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'max_iter': [1000, 2000, 3000]
}

grid_search = GridSearchCV(LinearSVC(), param_grid, cv=5, n_jobs=-1, verbose=2)
start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()

print(f"GridSearchCV took {end_time - start_time:.2f} seconds")
print("Best parameters found:", grid_search.best_params_)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
GridSearchCV took 236.97 seconds
Best parameters found: {'C': 100, 'max_iter': 3000}




### Предсказываем результат на тестовой выборке с лучшими параметрами

In [12]:
best_svc = grid_search.best_estimator_
svc_pred = best_svc.predict(X_test)

In [13]:
print(f'Сравнение метрик для LinearSVC: \n{classification_report(y_test, svc_pred)}\n')
print(f'Матрица ошибок для LinearSVC: \n{confusion_matrix(y_test, svc_pred)}\n')

Сравнение метрик для LinearSVC: 
              precision    recall  f1-score   support

       False       0.68      0.95      0.79      5067
        True       0.61      0.14      0.22      2640

    accuracy                           0.67      7707
   macro avg       0.64      0.54      0.51      7707
weighted avg       0.65      0.67      0.60      7707


Матрица ошибок для LinearSVC: 
[[4835  232]
 [2282  358]]

