# Построим baseline

## Импорт библиотек

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from preprocess import CustomPreprocessor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import Lasso,LinearRegression,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (mean_squared_error, mean_absolute_error,r2_score)
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder

## Считывание данных

In [5]:
df = pd.read_csv('vehicles.csv')

In [6]:
df_train,df_test = train_test_split(df,test_size=0.2,random_state=42)

## Удаляем из тренировочных данных выбросы и строки с большим количеством пропусков

In [7]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    # Удаляем строки с большим количеством пропусков
    df = df.dropna(thresh=int(df.shape[1]*0.7))
    
    # Жёсткая фильтрация по ценам и пробегу
    df = df[(df['price'] >= 100) & (df['price'] <= 500_000) & (df['odometer'] <= 600_000)]
    
    # Удаление выбросов по IQR для числовых колонок
    num_cols = df.select_dtypes(exclude='object').columns.tolist()
    for col in num_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        if IQR > 0:
            low = Q1 - 1.5*IQR
            high = Q3 + 1.5*IQR
            df = df[(df[col] >= low) & (df[col] <= high)]
    # удалим колонки с большим кол-во уник знач
    df.drop(columns=['id', 'url', 'image_url', 'region_url','VIN','county','description','posting_date','model'],inplace=True)
    return df


In [8]:
df_train_clean = clean_data(df_train)
df_test_clean = df_test.copy()  


num_cols = df_train_clean.select_dtypes(exclude='object').columns.tolist()
cat_cols = df_train_clean.select_dtypes(include='object').columns.tolist()

for col in num_cols:
    if df_train_clean[col].isna().all():
        df_train_clean[col] = 0
        df_test_clean[col] = 0
    else:
        median_value = df_train_clean[col].median()
        df_train_clean[col].fillna(median_value, inplace=True)
        df_test_clean[col].fillna(median_value, inplace=True)

for col in cat_cols:
    if df_train_clean[col].isna().all():
        df_train_clean[col] = "missing"
        df_test_clean[col] = "missing"
    else:
        mode_value = df_train_clean[col].mode()[0]
        df_train_clean[col].fillna(mode_value, inplace=True)
        df_test_clean[col].fillna(mode_value, inplace=True)


## Сравним несколько моделей и выберем наилучшую 

### Подготовим данные для обучения,сделав разбиение на train и test 

In [9]:
TARGET_COL = "price"
X = df_train_clean.drop(columns=[TARGET_COL])
y = df_train_clean[TARGET_COL]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

## Для начало рассмотрим стандартную Линейную регрессию 

### Для начала закодируем наши категориальные признаки и отмаштабируем их(необходимо для работы с линейными моделями)

In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

num_features = X_train.select_dtypes(exclude=["object"]).columns
cat_features = X_train.select_dtypes(include=["object"]).columns

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
])

X_train_ready = preprocessor.fit_transform(X_train)
X_valid_ready = preprocessor.transform(X_valid)

In [11]:
model_regres = LinearRegression()
model_regres.fit(X_train_ready,y_train)
y_pred = model_regres.predict(X_valid_ready)

print('Метрики LinearRegression:')
print('MAE: ',mean_absolute_error(y_valid,y_pred))
print('RMSE: ',np.sqrt(mean_squared_error(y_valid,y_pred)))
print('r2: ',r2_score(y_valid,y_pred))


Метрики LinearRegression:
MAE:  5183.464605435763
RMSE:  7231.671278009319
r2:  0.677568695500973


## Далее рассмотрим DissisonTree и RandomForest

In [12]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

In [13]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 224111 entries, 249171 to 303383
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   region        224111 non-null  object 
 1   year          224111 non-null  float64
 2   manufacturer  224111 non-null  object 
 3   condition     224111 non-null  object 
 4   cylinders     224111 non-null  object 
 5   fuel          224111 non-null  object 
 6   odometer      224111 non-null  float64
 7   title_status  224111 non-null  object 
 8   transmission  224111 non-null  object 
 9   drive         224111 non-null  object 
 10  size          224111 non-null  object 
 11  type          224111 non-null  object 
 12  paint_color   224111 non-null  object 
 13  state         224111 non-null  object 
 14  lat           224111 non-null  float64
 15  long          224111 non-null  float64
dtypes: float64(4), object(12)
memory usage: 29.1+ MB


### Закодируем категориальные признаки

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

X_train_tree = X_train.copy()
X_valid_tree = X_valid.copy()

cat_features = X_train_tree.select_dtypes(include=["object", "category"]).columns

label_encoders = {}

for col in cat_features:
    le = LabelEncoder()
    
    combined = pd.concat([X_train_tree[col], X_valid_tree[col]], axis=0).astype(str)
    le.fit(combined)
    
    X_train_tree[col] = le.transform(X_train_tree[col].astype(str))
    X_valid_tree[col] = le.transform(X_valid_tree[col].astype(str))
    
    label_encoders[col] = le

print(f"Label Encoding завершён. Закодировано признаков: {len(cat_features)}")

✅ Label Encoding завершён. Закодировано признаков: 12
Готово для обучения деревьев (DecisionTree, RandomForest, XGBoost, и т.д.)


In [17]:
X_train_tree.shape

(224111, 16)

In [18]:
# DecisionTree
model_tree = DecisionTreeRegressor()
model_tree.fit(X_train_tree, y_train)
y_pred_tree = model_tree.predict(X_valid_tree)

print('Метрики DecisionTreeRegressor:')
print('MAE: ', mean_absolute_error(y_valid, y_pred_tree))
print('RMSE: ', np.sqrt(mean_squared_error(y_valid, y_pred_tree)))
print('R2: ', r2_score(y_valid, y_pred_tree))

# RandomForest

model_rf = RandomForestRegressor(
    n_estimators=100,       # количество деревьев, можно 50 для быстрого теста
    max_depth=16,           # ограничение глубины, чтобы избежать переобучения
    min_samples_leaf=3,     # минимальное число объектов в листе
    max_features='sqrt',    # sqrt(num_features) на каждом разбиении
    n_jobs=-1,              # использовать все ядра CPU
    random_state=42,
    verbose=1               # вывод прогресса
)

# Обучение
model_rf.fit(X_train_tree, y_train)

# Предсказание
y_pred_rf = model_rf.predict(X_valid_tree)

# Метрикчя
print('Метрики RandomForestRegressor:')
print('MAE: ', mean_absolute_error(y_valid, y_pred_rf))
print('RMSE: ', np.sqrt(mean_squared_error(y_valid, y_pred_rf)))
print('R2: ', r2_score(y_valid, y_pred_rf))

Метрики DecisionTreeRegressor:
MAE:  2748.484554252319
RMSE:  5875.773138653037
R2:  0.7871420447179057


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s


Метрики RandomForestRegressor:
MAE:  3544.428494267032
RMSE:  5350.774348463149
R2:  0.8234803247460685


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.2s finished


### Рассмотрим градиентные бустинги

### Для начала обучим catboost

In [19]:
from catboost import CatBoostRegressor

cat_features = X_train.select_dtypes(include=["object"]).columns.tolist()

cat_boost_model = CatBoostRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='MAE',
    eval_metric='MAE',
    random_seed=42,
    verbose=200,
    early_stopping_rounds=100
    )

cat_boost_model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_valid, y_valid),
    use_best_model=True,
)

y_pred = cat_boost_model.predict(X_valid)

mae = mean_absolute_error(y_valid, y_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)

print("Метрики CatBoost:")
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")

0:	learn: 10093.0128210	test: 10056.5625518	best: 10056.5625518 (0)	total: 185ms	remaining: 3m 5s
200:	learn: 4114.4936024	test: 4087.1286707	best: 4087.1286707 (200)	total: 37.3s	remaining: 2m 28s
400:	learn: 3845.7082580	test: 3831.8859216	best: 3831.8859216 (400)	total: 1m 13s	remaining: 1m 50s
600:	learn: 3710.3115486	test: 3702.3866811	best: 3702.3866811 (600)	total: 1m 52s	remaining: 1m 14s
800:	learn: 3615.0904920	test: 3613.1718276	best: 3613.1718276 (800)	total: 2m 28s	remaining: 36.8s
999:	learn: 3543.3328589	test: 3544.9652863	best: 3544.9652863 (999)	total: 3m 3s	remaining: 0us

bestTest = 3544.965286
bestIteration = 999

Метрики CatBoost:
MAE: 3544.965287281182
RMSE: 5744.813048300605
R2: 0.79652472351504


### Обучим ligthgbm

In [20]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Категориальные признаки как category
X_train_lgb = X_train.copy()
X_valid_lgb = X_valid.copy()
cat_features = X_train.select_dtypes(include=["object"]).columns.tolist()

for col in cat_features:
    X_train_lgb[col] = X_train_lgb[col].astype('category')
    X_valid_lgb[col] = X_valid_lgb[col].astype('category')

lgb_model = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=-1, 
    random_state=42,
    verbose=-1,
    early_stopping_rounds=50  
)

lgb_model.fit(
    X_train_lgb,
    y_train,
    eval_set=[(X_valid_lgb, y_valid)],
    eval_metric='mae',
    categorical_feature=cat_features 
)

y_pred = lgb_model.predict(X_valid_lgb)

mae = mean_absolute_error(y_valid, y_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)

print("Метрики LightGBM:")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R2: {r2:.4f}")

Метрики LightGBM:
MAE: 3268.4804
RMSE: 4875.1961
R2: 0.8535


### Далее рассмотрим xgboost

In [21]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Для XGBoost нужно закодировать категориальные признаки
X_train_xgb = X_train.copy()
X_valid_xgb = X_valid.copy()

# Кодируем категориальные признаки
label_encoders = {}
cat_features = X_train.select_dtypes(include=["object"]).columns.tolist()

for col in cat_features:
    le = LabelEncoder()
    # Объединяем train и valid для кодирования, чтобы избежать unseen labels
    combined = pd.concat([X_train[col], X_valid[col]], axis=0)
    le.fit(combined)
    X_train_xgb[col] = le.transform(X_train_xgb[col])
    X_valid_xgb[col] = le.transform(X_valid_xgb[col])
    label_encoders[col] = le

xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,  # В XGBoost обычно указывают конкретную глубину
    random_state=42,
    verbosity=0,  # Аналог verbose в XGBoost
    early_stopping_rounds=50,
    eval_metric='mae'
)

xgb_model.fit(
    X_train_xgb,
    y_train,
    eval_set=[(X_valid_xgb, y_valid)],
    verbose=False  # Отключаем вывод обучения
)

y_pred_xgb = xgb_model.predict(X_valid_xgb)

mae_xgb = mean_absolute_error(y_valid, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_valid, y_pred_xgb))
r2_xgb = r2_score(y_valid, y_pred_xgb)

print("Метрики XGBoost:")
print(f"MAE: {mae_xgb:.4f}")
print(f"RMSE: {rmse_xgb:.4f}")
print(f"R2: {r2_xgb:.4f}")

Метрики XGBoost:
MAE: 3260.7682
RMSE: 4880.1907
R2: 0.8532


### По итогу тестирования моделей наиболее высокие результаты показал RandomForestRegressor,а также LGBMRegressor и XGBRegressor