In [111]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [62]:
data = pd.read_csv("update_auto_complectations.csv")
print('количество классов: ', data[data.columns[0]].unique().size)

количество классов:  37


## Разбиваем на тренировочную (70% данных) и тестовую выборку (30% данных)

In [63]:
data_train, data_test = train_test_split(data, random_state=1337, test_size=0.3)

X_data_train = data_train.drop(['type_car'], axis=1)
y_data_train = data_train['type_car']

X_data_test = data_test.drop(['type_car'], axis=1)
y_data_test = data_test['type_car']

### Обучение модели KNeighborsClassifier

In [64]:
# Инициализируем модель
model = KNeighborsClassifier(n_neighbors=1)

# Обучаем модель
model.fit(X_data_train, y_data_train)

# Предикт и проверка
pred = model.predict(X_data_test)
accuracy_score(pred, y_data_test)

0.9373776908023483

### Обучение модели DecisionTreeClassifier

In [65]:
# Инициализируем модель
model = DecisionTreeClassifier(random_state=42,
                               criterion='gini',
                               max_depth=250,
                               min_samples_split=2,
                               min_samples_leaf=1,
                               class_weight=None,
                              )

# Обучаем модель
model.fit(X_data_train, y_data_train)

#Предикт и проверка
pred = model.predict(X_data_test)
accuracy_score(pred, y_data_test)

0.9843444227005871

### Обучение модели LinearDiscriminantAnalysis

In [66]:
model = LinearDiscriminantAnalysis(solver='svd',
                                   shrinkage=None,
                                   store_covariance=0,
                                   tol=10e-100
                                  )
# обучаем модель
model.fit(X_data_train, y_data_train)

#Предикт и проверка
pred = model.predict(X_data_test)
accuracy_score(pred, y_data_test)

0.9784735812133072

### Обучение модели SVM

In [67]:
model = make_pipeline(StandardScaler(), SVC(gamma='auto'))
model.fit(X_data_train, y_data_train)

#Предикт и проверка
pred = model.predict(X_data_test)
accuracy_score(pred, y_data_test)

0.9843444227005871

## Попытаемся улучшить модель. Вспомним, что пустые значения заполняли нулями. Заполним их средним значением по столбцу.

In [68]:
data.loc[data['разгон_до_сотни'] == 0, 'Разгон до сотни'] = data['разгон_до_сотни'].mean()

### Обучение модели KNeighborsClassifier

In [72]:
# Инициализируем модель
model = KNeighborsClassifier(n_neighbors=1)

# Обучаем модель
model.fit(X_data_train, y_data_train)

# Предикт и проверка
pred = model.predict(X_data_test)
accuracy_score(pred, y_data_test)

0.9373776908023483

### Обучение модели DecisionTreeClassifier

In [75]:
# Инициализируем модель
model = DecisionTreeClassifier(random_state=42,
                               criterion='gini',
                               max_depth=500,
                               min_samples_split=2,
                               min_samples_leaf=1,
                               class_weight=None,
                              )

# Обучаем модель
model.fit(X_data_train, y_data_train)

#Предикт и проверка
pred = model.predict(X_data_test)
accuracy_score(pred, y_data_test)

0.9843444227005871

### Обучение модели LinearDiscriminantAnalysis

In [70]:
model = LinearDiscriminantAnalysis(solver='svd',
                                   shrinkage=None,
                                   store_covariance=0,
                                   tol=10e-100
                                  )
# обучаем модель
model.fit(X_data_train, y_data_train)

#Предикт и проверка
pred = model.predict(X_data_test)
accuracy_score(pred, y_data_test)

0.9784735812133072

### Обучение модели SVM

In [71]:
model = make_pipeline(StandardScaler(), SVC(gamma='auto'))
model.fit(X_data_train, y_data_train)

#Предикт и проверка
pred = model.predict(X_data_test)
accuracy_score(pred, y_data_test)

0.9843444227005871

Наибольший результат accuracy = 0.9843444227005871. Это очень хороший результат, учитывая, что было 37 классов.
Были попытки улучшить результат через более правдоподобное значение признаков. Результат не изменился

## Посмотрим на важность признаков

In [88]:
variables_importance = pd.DataFrame({'feature': X_data_train.columns, 'importance': model.feature_importances_}).sort_values('importance', ascending=False)
variables_importance.head(30)

Unnamed: 0,feature,importance
15,высота_машины,0.26926
13,длина_машины,0.262935
16,колёсная_база,0.120682
17,объем_багажника,0.071365
19,объём_топливного_бака,0.06839
1,doors,0.040516
294,brand Toyota,0.022944
14,ширина_машины,0.021421
6,количество_ступеней,0.01831
18,объём_багажника_максимальный,0.017479


## Нормализуем данные, посмотрим, что получиться

In [94]:
df_data_train = preprocessing.normalize(X_data_train, axis=0)
df_data_test = preprocessing.normalize(X_data_test, axis=0)

scaled_df = pd.DataFrame(df_data_train, columns=X_data_train.columns)
scaled_df.head()

Unnamed: 0,seats,doors,seast,объем_двигателя,мощность_двигателя,крутящий_момент_двигателя,количество_ступеней,разгон_до_сотни,максимальная_скорость,расход_топлива_город,...,brand Volkswagen,brand Volvo,тип_двигателя бензиновый,тип_двигателя гибридный,тип_двигателя дизельный,тип_двигателя электрический,тип_коробки_передач автоматическая,тип_коробки_передач вариатор,тип_коробки_передач механическая,тип_коробки_передач роботизированная
0,0.028192,0.028972,0.028192,0.024035,0.018314,0.028661,0.037942,0.036064,0.027165,0.0,...,0.0,0.0,0.0,0.0,0.05726,0.0,0.036202,0.0,0.0,0.0
1,0.028192,0.028972,0.028192,0.02389,0.018192,0.015537,0.0,0.03542,0.026333,0.025575,...,0.0,0.0,0.033923,0.0,0.0,0.0,0.0,0.09759,0.0,0.0
2,0.028192,0.028972,0.028192,0.02401,0.030401,0.02753,0.033726,0.022862,0.030075,0.028383,...,0.0,0.0,0.033923,0.0,0.0,0.0,0.036202,0.0,0.0,0.0
3,0.028192,0.040561,0.028192,0.016712,0.015262,0.016593,0.02951,0.033166,0.025363,0.020897,...,0.094072,0.0,0.033923,0.0,0.0,0.0,0.036202,0.0,0.0,0.0
4,0.028192,0.028972,0.028192,0.030034,0.021366,0.017725,0.0,0.032844,0.027442,0.03119,...,0.0,0.0,0.033923,0.0,0.0,0.0,0.0,0.09759,0.0,0.0


### Обучение модели KNeighborsClassifier

In [102]:
# Инициализируем модель
model = KNeighborsClassifier(n_neighbors=1)

# Обучаем модель
model.fit(df_data_train, y_data_train)

# Предикт и проверка
pred = model.predict(df_data_test)
accuracy_score(pred, y_data_test)

0.9726027397260274

### Обучение модели DecisionTreeClassifier

In [104]:
# Инициализируем модель
model = DecisionTreeClassifier(random_state=42,
                               criterion='gini',
                               max_depth=500,
                               min_samples_split=2,
                               min_samples_leaf=1,
                               class_weight=None,
                              )

# Обучаем модель
model.fit(df_data_train, y_data_train)

#Предикт и проверка
pred = model.predict(df_data_test)
accuracy_score(pred, y_data_test)

0.06653620352250489

### Обучение модели LinearDiscriminantAnalysis

In [98]:
model = LinearDiscriminantAnalysis(solver='svd',
                                   shrinkage=None,
                                   store_covariance=0,
                                   tol=10e-100
                                  )
# обучаем модель
model.fit(df_data_train, y_data_train)

#Предикт и проверка
pred = model.predict(df_data_test)
accuracy_score(pred, y_data_test)

0.9275929549902152

### Обучение модели SVM

In [108]:
# Инициализируем модель
model = make_pipeline(StandardScaler(), SVC(gamma='auto'))

# Обучаем модель
model.fit(df_data_train, y_data_train)

#Предикт и проверка
pred = model.predict(df_data_test)
accuracy_score(pred, y_data_test)

0.12133072407045009

Как показывают результаты, нормализация не улучшает score