In [91]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, r_regression

In [92]:
%%capture
!wget https://www.dropbox.com/s/64ol9q9ssggz6f1/data_ford_price.xlsx

In [93]:
data = pd.read_excel('data_ford_price.xlsx') 

In [94]:
data.head()

Unnamed: 0,price,year,condition,cylinders,odometer,title_status,transmission,drive,size,lat,long,weather
0,43900,2016,4,6,43500,clean,automatic,4wd,full-size,36.4715,-82.4834,59.0
1,15490,2009,2,8,98131,clean,automatic,4wd,full-size,40.468826,-74.281734,52.0
2,2495,2002,2,8,201803,clean,automatic,4wd,full-size,42.477134,-82.949564,45.0
3,1300,2000,1,8,170305,rebuilt,automatic,4wd,full-size,40.764373,-82.349503,49.0
4,13865,2010,3,8,166062,clean,automatic,4wd,,49.210949,-123.11472,


In [95]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7017 entries, 0 to 7016
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         7017 non-null   int64  
 1   year          7017 non-null   int64  
 2   condition     7017 non-null   int64  
 3   cylinders     7017 non-null   int64  
 4   odometer      7017 non-null   int64  
 5   title_status  7017 non-null   object 
 6   transmission  7017 non-null   object 
 7   drive         6626 non-null   object 
 8   size          5453 non-null   object 
 9   lat           7017 non-null   float64
 10  long          7017 non-null   float64
 11  weather       6837 non-null   float64
dtypes: float64(3), int64(5), object(4)
memory usage: 658.0+ KB


In [96]:
new_data = data[['price','year', 'cylinders', 'odometer', 'lat', 'long', 'weather']]


In [97]:
new_data = new_data.dropna()

In [98]:
y = new_data['price']
X = new_data.drop(columns='price')


In [99]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=42, test_size=0.2)

In [100]:
model = LinearRegression()

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
#Рассчитываем коэффициент детерминации
print('R2 score: {:.3f}'.format(metrics.r2_score(y_test, y_test_pred)))

#Рассчитываем MAE
print('MAE score: {:.3f}  $'.format(metrics.mean_absolute_error(y_test, y_test_pred)))

R2 score: 0.605
MAE score: 4863.525  $


# RFE

In [101]:
# Выделим три наиболее значимых признака
estimator = LinearRegression()
selector = RFE(estimator, n_features_to_select=3, step=1)
selector = selector.fit(X_train, y_train)
 
selector.get_feature_names_out()

array(['year', 'cylinders', 'lat'], dtype=object)

In [102]:
# Оставим только данные по найденным признакам
X_1_train = X_train[selector.get_feature_names_out()]
X_1_test = X_test[selector.get_feature_names_out()]

In [103]:
model_1 = LinearRegression()
# Обучим и предскажем с помощью линейной регрессии
model_1.fit(X_1_train, y_train)
y_test_pred = model_1.predict(X_1_test)
#Рассчитываем коэффициент детерминации
print('R2 score: {:.3f}'.format(metrics.r2_score(y_test, y_test_pred)))

#Рассчитываем MAE
print('MAE score: {:.3f}  $'.format(metrics.mean_absolute_error(y_test, y_test_pred)))


R2 score: 0.562
MAE score: 5280.017  $


#   МЕТОДЫ ВЫБОРА ПРИЗНАКОВ НА ОСНОВЕ ФИЛЬТРОВ

In [104]:
# Выделим три наиболее значимых признака
selector_2 = SelectKBest(r_regression, k=3)
selector_2.fit(X_train, y_train)
 
selector_2.get_feature_names_out()

array(['year', 'lat', 'long'], dtype=object)

In [105]:
# Оставим только данные по найденным признакам
X_2 = X_train[selector_2.get_feature_names_out()]
X_2_test = X_test[selector_2.get_feature_names_out()]

In [106]:
model_2 = LinearRegression()

# Обучим и предскажем с помощью линейной регрессии
model_2.fit(X_2, y_train)
y_pred = model_2.predict(X_2_test)

#Рассчитываем MAE
print('MAE score: {:.3f}  $'.format(metrics.mean_absolute_error(y_test, y_pred)))
#Рассчитываем коэффициент детерминации
print('R2 score: {:.3f}'.format(metrics.r2_score(y_test, y_pred)))

MAE score: 5369.592  $
R2 score: 0.549


### Вывод:
По метрике детерминации метод рекурсивного исключения признаков показал более приемлимый результат.