In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge ,SGDRegressor, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Подключаем датасет

In [16]:
df = pd.read_csv('/content/drive/MyDrive/cars.csv')
print(df.head(5))

  manufacturer_name model_name transmission   color  odometer_value  \
0            Subaru    Outback    automatic  silver          190000   
1            Subaru    Outback    automatic    blue          290000   
2            Subaru   Forester    automatic     red          402000   
3            Subaru    Impreza   mechanical    blue           10000   
4            Subaru     Legacy    automatic   black          280000   

   year_produced engine_fuel  engine_has_gas engine_type  engine_capacity  \
0           2010    gasoline           False    gasoline              2.5   
1           2002    gasoline           False    gasoline              3.0   
2           2001    gasoline           False    gasoline              2.5   
3           1999    gasoline           False    gasoline              3.0   
4           2001    gasoline           False    gasoline              2.5   

   ... feature_1  feature_2 feature_3 feature_4  feature_5  feature_6  \
0  ...      True       True      True

### Удаление ненужных и пустых столбцов

In [17]:
columns_to_drop = ['engine_has_gas','body_type','state','drivetrain','color','model_name','engine_fuel','manufacturer_name','engine_capacity','location_region','transmission','engine_type','feature_0','feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'duration_listed']
df = df.drop(columns=columns_to_drop, axis=1)
df.dropna(inplace=True)

print(df.head(5))

   odometer_value  year_produced  has_warranty  price_usd  is_exchangeable  \
0          190000           2010         False   10900.00            False   
1          290000           2002         False    5000.00             True   
2          402000           2001         False    2800.00             True   
3           10000           1999         False    9999.00             True   
4          280000           2001         False    2134.11             True   

   number_of_photos  up_counter  
0                 9          13  
1                12          54  
2                 4          72  
3                 9          42  
4                14           7  


### Кодирование текстовых значений

In [18]:
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

print(df.head(5))

   odometer_value  year_produced  has_warranty  price_usd  is_exchangeable  \
0          190000           2010         False   10900.00            False   
1          290000           2002         False    5000.00             True   
2          402000           2001         False    2800.00             True   
3           10000           1999         False    9999.00             True   
4          280000           2001         False    2134.11             True   

   number_of_photos  up_counter  
0                 9          13  
1                12          54  
2                 4          72  
3                 9          42  
4                14           7  


Предполагаем, что столбец is_exchangeable - это метки классов

In [19]:
X = df.drop(columns=['is_exchangeable'])
y = df['is_exchangeable']

Разбиваем данные на обучающий и тестовый наборы

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#### Проверка обычной регрессии

Создание и обучение модели

In [21]:

regression_linear = LinearRegression()

regression_linear.fit(X_train, y_train)

Вывод результата

In [23]:
regression_linear_pred = regression_linear.predict(X_test)

mae = mean_absolute_error(y_test, regression_linear_pred)
mse = mean_squared_error(y_test, regression_linear_pred)

print(f'MAE: {mae:.5f}')
print(f'MSE: {mse:.5f}')
print(f'RMSE: {np.sqrt(mse):.5f}')

MAE: 0.43228
MSE: 0.21284
RMSE: 0.46135


Создание и обучение модели

In [24]:
regression_logistic = LogisticRegression()

regression_logistic.fit(X_train, y_train)

Вывод результата

In [25]:
regression_logistic_pred = regression_logistic.predict(X_test)

print(f'Сравнение метрик: \n{classification_report(y_test, regression_logistic_pred)}\n')
print(f'Матрица ошибок: \n{confusion_matrix(y_test, regression_logistic_pred)}\n')

Сравнение метрик: 
              precision    recall  f1-score   support

       False       0.66      0.98      0.79      5067
        True       0.53      0.04      0.07      2640

    accuracy                           0.66      7707
   macro avg       0.60      0.51      0.43      7707
weighted avg       0.62      0.66      0.54      7707


Матрица ошибок: 
[[4978   89]
 [2538  102]]

