In [26]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Загрузка данных

In [27]:
%%capture
!wget https://www.dropbox.com/s/64ol9q9ssggz6f1/data_ford_price.xlsx

"wget" ­Ґ пў«пҐвбп ў­гваҐ­­Ґ© Ё«Ё ў­Ґи­Ґ©
Є®¬ ­¤®©, ЁбЇ®«­пҐ¬®© Їа®Ја ¬¬®© Ё«Ё Ї ЄҐв­л¬ д ©«®¬.


In [28]:
data = pd.read_excel('data/data_ford_price.xlsx') 

In [59]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7017 entries, 0 to 7016
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         7017 non-null   int64  
 1   year          7017 non-null   int64  
 2   condition     7017 non-null   int64  
 3   cylinders     7017 non-null   int64  
 4   odometer      7017 non-null   int64  
 5   title_status  7017 non-null   object 
 6   transmission  7017 non-null   object 
 7   drive         7017 non-null   object 
 8   size          7017 non-null   object 
 9   lat           7017 non-null   float64
 10  long          7017 non-null   float64
 11  weather       7017 non-null   float64
dtypes: float64(3), int64(5), object(4)
memory usage: 658.0+ KB


In [48]:
# Заполняем пропущенные данные с помощью метода fillna
data['drive'] = data['drive'].fillna(data['drive'].value_counts().index[0])
data['size'] = data['size'].fillna(data['size'].value_counts().index[0])
data['weather'] = data['weather'].fillna(data['weather'].median())

#  Отбор признаков: мотивация

## Предобработка данных

Давайте оценим влияние мультиколлинеарности на линейную регрессию:

In [60]:
data.dropna(inplace = True)

y = data['price']
x = data.drop(columns='price')

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

## Обучение модели

In [5]:
model = LinearRegression()
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)
 
mae = mean_absolute_error(y_test, y_predicted)
print('MAE: %.3f' % mae)

MAE: 4682.957


## Удаление избыточного признака

In [6]:
x.drop('lat', axis = 1, inplace = True)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

In [8]:
model = LinearRegression()
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)
 
mae = mean_absolute_error(y_test, y_predicted)
print('MAE: %.3f' % mae)

MAE: 4672.930


#  Отбор признаков: классификация методов

## Метод рекурсивного исключения признаков

In [61]:
from sklearn.feature_selection import RFE

In [10]:
y = data['price']
x = data.drop(columns='price')

In [11]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

In [62]:
estimator = LinearRegression()
selector = RFE(estimator, n_features_to_select=3, step=1)
selector = selector.fit(X_train, y_train)
 
selector.get_feature_names_out()

ValueError: could not convert string to float: 'salvage'

In [20]:
print('Три наиболее значимых признака по RFE:', selector.get_feature_names_out())

Три наиболее значимых признака по RFE: ['year' 'cylinders' 'lat']


In [21]:
x.columns

Index(['year', 'cylinders', 'odometer', 'lat', 'long', 'weather'], dtype='object')

##  МЕТОДЫ ВЫБОРА ПРИЗНАКОВ НА ОСНОВЕ ФИЛЬТРОВ

In [23]:
from sklearn.feature_selection import SelectKBest, f_regression

In [25]:
selector = SelectKBest(f_regression, k=3)
selector.fit(X_train, y_train)
 
selector.get_feature_names_out()
print('Три наиболее значимых признака на основе фильтров SelectKBest:', selector.get_feature_names_out())

Три наиболее значимых признака на основе фильтров SelectKBest: ['year' 'cylinders' 'odometer']
