# Обработка признаков

В этом домашнем задании вы будете решать задачу предсказания стоимости автомобилей по их различным характеристикам.

In [1]:
import pandas as pd
import numpy as np

RANDOM_STATE = 42

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/evgpat/edu_stepik_practical_ml/main/datasets/cars_prices.csv", decimal='.')

In [3]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


### Описание некоторых признаков

`symboling` - rating corresponds to the degree to which the auto is more risky than its price indicates (+3 more risk and -3 is pretty safe)  
`make` - car types (i.e. car brand)  
`fuel-type` - types of fuel (gas or diesel)  
`aspiration` - engine aspiration (standard or turbo)  
`num-of-doors` - numbers of doors (two or four)  
`body-style` - car body style (sedan or hachback)  
`drive-wheels` - which types of drive wheel (forward-fwd, reversed-rwd)  
`engine-location` - engine mounted location (front or back)  
`wheel-base` - расстояние между осями передних и задних колес  
`length` - car lenght  
`weight` - car weight  
`width` - car width  
`height` - car height  

In [4]:
df.shape

(205, 26)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

## Заполнение пропусков

Пропуски в этом датасете обозначены как `?`

In [6]:
for c in df.columns:
    print(c, len(df[df[c] == '?']))

symboling 0
normalized-losses 41
make 0
fuel-type 0
aspiration 0
num-of-doors 2
body-style 0
drive-wheels 0
engine-location 0
wheel-base 0
length 0
width 0
height 0
curb-weight 0
engine-type 0
num-of-cylinders 0
engine-size 0
fuel-system 0
bore 4
stroke 4
compression-ratio 0
horsepower 2
peak-rpm 2
city-mpg 0
highway-mpg 0
price 4


Удалите строки, для которых неизвестно значение price, так как это целевая переменная.

Для упрощения работы заменим все '?' на  None

In [7]:
# your code here
df.replace('?', value=np.nan, inplace=True)

In [8]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


## Вопрос для Quiz

Сколько строк осталось в данных?

In [9]:
# your code here
df.dropna(inplace=True, subset='price')
df.shape

(201, 26)

Заполните средним значением пропуски в столбцах для числовых признаков и самым популярным значением для категориальных признаков
* `num-of-doors`
* `bore`
* `stroke`
* `horsepower`
* `peak-rpm`

Категориальным является num-of-doors. Остальные числовые

In [10]:
# your code here
df['num-of-doors'].value_counts()

num-of-doors
four    113
two      86
Name: count, dtype: int64

Самое частое значение 'four'. Им и заполняем.

In [11]:
# your code here
df['num-of-doors'].fillna('four', inplace=True) 

In [12]:
# your code here
# change data types for columns
df[['bore', 'stroke', 'horsepower', 'peak-rpm']] = df[['bore', 'stroke', 'horsepower', 'peak-rpm']].astype('float64', copy=False)

In [13]:
# your code here
# calculate means
m_bore = df.bore.mean()
m_stroke = df.stroke.mean()
m_horsepower = df.horsepower.mean()
m_peak_rpm = df['peak-rpm'].mean()
print(m_peak_rpm)

5117.587939698493


In [14]:
df.bore.fillna(m_bore, inplace=True)
df.stroke.fillna(m_stroke, inplace=True)
df.horsepower.fillna(m_horsepower, inplace=True)
df['peak-rpm'].fillna(m_peak_rpm, inplace=True)

Проверяем, что пропуски заполнились

In [15]:
df.isna().sum()

symboling             0
normalized-losses    37
make                  0
fuel-type             0
aspiration            0
num-of-doors          0
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  0
stroke                0
compression-ratio     0
horsepower            0
peak-rpm              0
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

## Вопрос для Quiz

Чему равно среднее значение `peak-rpm` до заполнения пропусков? Ответ округлите до целого числа.

Пропуски в столбце `normalized-losses` предскажите при помощи линейной регрессии по признакам
`symboling`, `wheel-base`, `length`, `width`, `height`, `curb-weight`, `engine-size`, `compression-ratio`, `city-mpg`, `highway-mpg` и заполните их предсказаниями

In [16]:
from sklearn.linear_model import LinearRegression

# your code here
ind_train = df[df['normalized-losses'].notna()].index
ind_test = df[df['normalized-losses'].isna()].index

X = df.loc[ind_train, ['symboling', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'compression-ratio', 'city-mpg', 'highway-mpg']]
y = df.loc[ind_train, ['normalized-losses']]

In [17]:
# your code here
fill_missed_model = LinearRegression()
fill_missed_model.fit(X, y)

In [18]:
X_test = df.loc[ind_test, ['symboling', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'compression-ratio', 'city-mpg', 'highway-mpg']]
pred = fill_missed_model.predict(X_test)
pred[0]

array([168.07249262])

In [19]:
count = 0
for i in ind_test:
    df.loc[i, ['normalized-losses']] = pred[count]
    count += 1

In [20]:
# проверяем, что все заполнено
df.isna().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

## Вопрос для Quiz

Чему равно предсказание линейной регрессии на первом пропущенном значении? Ответ округлите до целого числа.

## 2. Кодирование категориальных признаков

1. Закодируйте бинарные признаки `fuel-type`, `aspiration`, `num-of-doors`, `engine-location` каждый отдельной колонкой, состоящей из 0 и 1.
Единицей кодируйте самую частую категорию.

In [21]:
# your code here
# find most frequent categories
feat_list = ['fuel-type', 'aspiration', 'num-of-doors', 'engine-location']
for f in feat_list:
    print(df[f].value_counts(), end='\n\n')

fuel-type
gas       181
diesel     20
Name: count, dtype: int64

aspiration
std      165
turbo     36
Name: count, dtype: int64

num-of-doors
four    115
two      86
Name: count, dtype: int64

engine-location
front    198
rear       3
Name: count, dtype: int64



In [22]:
# кодировку 1-более частый признак / 0 - менее частый реализуем через простой map

df['fuel-type'] = df['fuel-type'].map({'gas': 1, 'diesel': 0})
df['aspiration'] = df['aspiration'].map({'std': 1, 'turbo': 0})
df['num-of-doors'] = df['num-of-doors'].map({'four': 1, 'two': 0})
df['engine-location'] = df['engine-location'].map({'front': 1, 'rear': 0})

2. Вынесите в переменную `y` целевую переменную `price`, а все остальные колонки - в матрицу `X`.

Закодируйте признаки `make`, `body-style`, `engine-type`, `fuel-system` при помощи LeaveOneOutEncoder.

**Дальше все время работайте с объектами `X`, `y`.**

In [23]:
# your code here
# select to X all columns without last

X = df.iloc[:, :-1]
y = df['price']

In [24]:
# !pip install category_encoders -q

In [25]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

# your code here
loo_enc = LeaveOneOutEncoder()

X[['make', 'body-style', 'engine-type', 'fuel-system']] = loo_enc.fit_transform(X[['make', 'body-style', 'engine-type', 'fuel-system']], y)

## Вопрос для Quiz

Чему равно среднее значение в столбце `body-style` после кодирования? Ответ округлите до целого числа.

In [26]:
round(X['body-style'].mean(), 0)

13207.0

3. Закодируйте признак `drive-wheels` при помощи OHE из библиотеки category_encoders.

In [27]:
#from category_encoders.one_hot import OneHotEncoder

# your code here
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(categories='auto', drop='first')
feature_arr = ohe.fit_transform(X['drive-wheels'].values.reshape(-1, 1)).toarray()
feature_labels = ohe.get_feature_names_out()

features = pd.DataFrame(feature_arr, index=X.index, columns=feature_labels)

In [28]:
features.shape

(201, 2)

In [29]:
X.drop('drive-wheels', axis=1, inplace=True)
X = pd.concat([X, features], axis=1)
X.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,engine-location,wheel-base,length,...,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,x0_fwd,x0_rwd
0,3,168.072493,16500.0,1,1,0,23569.6,1,88.6,168.8,...,17650.307692,3.47,2.68,9.0,111.0,5000.0,21,27,0.0,1.0
1,3,168.072493,14997.5,1,1,0,22968.6,1,88.6,168.8,...,17617.285714,3.47,2.68,9.0,111.0,5000.0,21,27,0.0,1.0
2,1,134.001799,14997.5,1,1,0,9859.791045,1,94.5,171.2,...,17617.285714,2.68,3.47,9.0,154.0,5000.0,19,26,0.0,1.0
3,2,164.0,18641.0,1,1,1,14465.236559,1,99.8,176.6,...,17645.307692,3.19,3.4,10.0,102.0,5500.0,24,30,1.0,0.0
4,2,164.0,17941.0,1,1,1,14427.602151,1,99.4,176.6,...,17606.846154,3.19,3.4,8.0,115.0,5500.0,18,22,0.0,0.0


4. В столбце `num-of-cylinders` категории упорядочены по смыслу. Закодируйте их подряд идущими числами, начиная с 1, согласно смыслу.

Подряд идущими числами означает - 1, 2, 3 и так далее без пропусков.

In [30]:
# your code here

from category_encoders.ordinal import OrdinalEncoder

ord_enc = OrdinalEncoder(cols=['num-of-cylinders'])
X = ord_enc.fit_transform(X)

X.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,engine-location,wheel-base,length,...,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,x0_fwd,x0_rwd
0,3,168.072493,16500.0,1,1,0,23569.6,1,88.6,168.8,...,17650.307692,3.47,2.68,9.0,111.0,5000.0,21,27,0.0,1.0
1,3,168.072493,14997.5,1,1,0,22968.6,1,88.6,168.8,...,17617.285714,3.47,2.68,9.0,111.0,5000.0,21,27,0.0,1.0
2,1,134.001799,14997.5,1,1,0,9859.791045,1,94.5,171.2,...,17617.285714,2.68,3.47,9.0,154.0,5000.0,19,26,0.0,1.0
3,2,164.0,18641.0,1,1,1,14465.236559,1,99.8,176.6,...,17645.307692,3.19,3.4,10.0,102.0,5500.0,24,30,1.0,0.0
4,2,164.0,17941.0,1,1,1,14427.602151,1,99.4,176.6,...,17606.846154,3.19,3.4,8.0,115.0,5500.0,18,22,0.0,0.0


## Вопрос для Quiz

Сколько столбцов получилось в матрице `X`?

In [31]:
X['normalized-losses'] = X['normalized-losses'].astype(float)
X['bore'] = X['bore'].astype(float)
X['stroke'] = X['stroke'].astype(float)
X['horsepower'] = X['horsepower'].astype(float)
X['peak-rpm'] = X['peak-rpm'].astype(float)

y = y.astype(float)

Разбейте данные на тренировочную и тестовую часть в пропорции 3 к 1, зафиксируйте random_state = 42.

In [32]:
from sklearn.model_selection import train_test_split

# your code here
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state = 42)

Масштабируйте данные при помощи MinMaxScaler.

Обучайте масштабирование на тренировочных данных, а потом примените и к трейну, и к тесту.

In [33]:
from sklearn.preprocessing import MinMaxScaler

# your code here
scaler = MinMaxScaler()
Xtrain = scaler.fit_transform(Xtrain)

Xtest = scaler.transform(Xtest)

Обучите на тренировочных данных линейную регрессию, сделайте предсказание на тесте и вычислите значение $R^2$ на тестовых данных.

In [34]:
# your code here
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(Xtrain, ytrain)

## Вопрос для Quiz

Чему равно значение $R^2$ на тестовых данных? Ответ округлите до сотых.

In [36]:
from sklearn.metrics import r2_score

prediction = model.predict(Xtest)
r2_test = r2_score(ytest, prediction)

print(r2_test)

0.9133432212029311
