In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_regression

# Загрузка данных

In [3]:
# Read the Ford price spreadsheet from the project's data folder into a DataFrame.
data = pd.read_excel(io='data/data_ford_price.xlsx')

# Preview the first five rows to check that the file was parsed as expected.
data.head(n=5)

Unnamed: 0,price,year,condition,cylinders,odometer,title_status,transmission,drive,size,lat,long,weather
0,43900,2016,4,6,43500,clean,automatic,4wd,full-size,36.4715,-82.4834,59.0
1,15490,2009,2,8,98131,clean,automatic,4wd,full-size,40.468826,-74.281734,52.0
2,2495,2002,2,8,201803,clean,automatic,4wd,full-size,42.477134,-82.949564,45.0
3,1300,2000,1,8,170305,rebuilt,automatic,4wd,full-size,40.764373,-82.349503,49.0
4,13865,2010,3,8,166062,clean,automatic,4wd,,49.210949,-123.11472,


### Предобработка данных

In [None]:
# Keep the target (`price`) plus the columns later used as predictors, then
# drop every row that has a missing value in any of them.
# `.dropna()` is chained instead of calling `dropna(inplace=True)` on the
# sliced frame: it returns a new DataFrame, so nothing is mutated in place on
# a column subset (which can raise pandas' SettingWithCopyWarning) and the
# cell gives the same result when it is re-run.
data = data[['price', 'year', 'cylinders', 'odometer', 'lat', 'long', 'weather']].dropna()

## Разделение данных на обучающую и тестовую выборки

In [None]:
# Predictors are every column except the target; the target is `price`.
x = data.drop('price', axis=1)
y = data.loc[:, 'price']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=40,
)

## Метод рекурсивного исключения признаков

In [None]:
# Recursive feature elimination around a plain linear regression: one feature
# is removed per iteration (step=1) until three remain.
# NOTE(review): RFE ranks features through the estimator's coefficients, which
# depend on feature scale; the inputs are not standardized here — confirm that
# this is intended.
estimator = LinearRegression()
selector = RFE(estimator=estimator, n_features_to_select=3, step=1).fit(X_train, y_train)

# Report the columns that survived the elimination.
print(f'Main features: {selector.get_feature_names_out()}')

# Predict on the hold-out rows through the fitted selector.
y_predicted = selector.predict(X_test)

# Score the predictions with mean absolute error.
mae = mean_absolute_error(y_true=y_test, y_pred=y_predicted)
print(f'MAE with RFE method: {round(mae, 2)}')

Main features: ['year' 'cylinders' 'lat']
MAE with RFE method: 5096.57


### Метод выбора k лучших переменных: SelectKBest

In [None]:
# Univariate selection: score each feature against the target with
# f_regression and keep the three best-scoring ones.
selector = SelectKBest(score_func=f_regression, k=3).fit(X_train, y_train)

# Names of the retained columns, reused below to subset both splits.
main_features = selector.get_feature_names_out()
print(f'Main features: {main_features}')

# Fit a linear regression on the selected columns only.
model = LinearRegression().fit(X_train[main_features], y_train)

# Predict on the hold-out rows, restricted to the same columns.
y_predicted = model.predict(X_test[main_features])

# Score the predictions with mean absolute error.
mae = mean_absolute_error(y_true=y_test, y_pred=y_predicted)
print(f'MAE with SelectKBest method: {round(mae, 2)}')

Main features: ['year' 'cylinders' 'odometer']
MAE with SelectKBest method: 4708.95


### Вывод:


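Метод выбора k лучших переменных SelectKBest показал лучший результат по сравнению с методом рекурсивного исключения признаков (RFE): MAE на тестовой выборке составила 4708.95 против 5096.57. SelectKBest отобрал признаки `year`, `cylinders` и `odometer`, тогда как RFE вместо `odometer` выбрал `lat`. Таким образом, SelectKBest выбрал признаки, которые оказались наиболее полезны для предсказательной способности модели.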