In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

Считываем данные.

In [62]:
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file and convert "?" to NaN
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values="?" )
df

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
201,-1,95.0,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
202,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
203,-1,95.0,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


Получаем информацию о типах переменных каждого столбца.

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized_losses    164 non-null float64
make                 205 non-null object
fuel_type            205 non-null object
aspiration           205 non-null object
num_doors            203 non-null object
body_style           205 non-null object
drive_wheels         205 non-null object
engine_location      205 non-null object
wheel_base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb_weight          205 non-null int64
engine_type          205 non-null object
num_cylinders        205 non-null object
engine_size          205 non-null int64
fuel_system          205 non-null object
bore                 201 non-null float64
stroke               201 non-null float64
compression_ratio    205 non-null float64
horsepower           203 non-

Смотрим описательные статистики.

In [64]:
df.describe()

Unnamed: 0,symboling,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
count,205.0,164.0,205.0,205.0,205.0,205.0,205.0,205.0,201.0,201.0,205.0,203.0,203.0,205.0,205.0,201.0
mean,0.834146,122.0,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,3.329751,3.255423,10.142537,104.256158,5125.369458,25.219512,30.75122,13207.129353
std,1.245307,35.442168,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,0.273539,0.316717,3.97204,39.714369,479.33456,6.542142,6.886443,7947.066342
min,-2.0,65.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,0.0,94.0,94.5,166.3,64.1,52.0,2145.0,97.0,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7775.0
50%,1.0,115.0,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10295.0
75%,2.0,150.0,102.4,183.1,66.9,55.5,2935.0,141.0,3.59,3.41,9.4,116.0,5500.0,30.0,34.0,16500.0
max,3.0,256.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


Отберем столбцы типа object.

In [65]:
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [66]:
obj_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 10 columns):
make               205 non-null object
fuel_type          205 non-null object
aspiration         205 non-null object
num_doors          203 non-null object
body_style         205 non-null object
drive_wheels       205 non-null object
engine_location    205 non-null object
engine_type        205 non-null object
num_cylinders      205 non-null object
fuel_system        205 non-null object
dtypes: object(10)
memory usage: 16.1+ KB


Смотрим, есть ли пропущенные значения в столбце "num_doors". Они есть, поэтому заменяем пропуски значением с наибольшей частотой.

In [67]:
set(obj_df["num_doors"])

{'four', nan, 'two'}

In [68]:
obj_df[obj_df["num_doors"].isna()]

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
27,dodge,gas,turbo,,sedan,fwd,front,ohc,four,mpfi
63,mazda,diesel,std,,sedan,fwd,front,ohc,four,idi


In [69]:
obj_df['num_doors'].value_counts() / len(obj_df)

four    0.556098
two     0.434146
Name: num_doors, dtype: float64

Найдем частоты - наибольшее значение у "four". Заменим пропуски на "four".

In [70]:
obj_df.loc[obj_df.num_doors.isna(), "num_doors"] = "four"
obj_df

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi
...,...,...,...,...,...,...,...,...,...,...
200,volvo,gas,std,four,sedan,rwd,front,ohc,four,mpfi
201,volvo,gas,turbo,four,sedan,rwd,front,ohc,four,mpfi
202,volvo,gas,std,four,sedan,rwd,front,ohcv,six,mpfi
203,volvo,diesel,turbo,four,sedan,rwd,front,ohc,six,idi


In [71]:
obj_df[obj_df["num_doors"].isna()]

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system


Заменим числа-слова в столбцах num_doors и num_cylinders на числа.

In [72]:
#уникальные значения в num_doors
set(obj_df["num_doors"])

{'four', 'two'}

In [73]:
#уникальные значения в num_cylinders
set(obj_df["num_cylinders"])

{'eight', 'five', 'four', 'six', 'three', 'twelve', 'two'}

In [74]:
#словарь, в котором ключи - слова, а значения - числа
nums = {"two": 2, "three": 3, "four": 4, "five": 5, "six": 6, "eight": 8, "twelve": 12}

In [75]:
for word_num in nums.keys():
    obj_df.loc[obj_df.num_doors == word_num, "num_doors"] = nums[word_num]
    obj_df.loc[obj_df.num_cylinders == word_num, "num_cylinders"] = nums[word_num]

In [76]:
obj_df

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi
...,...,...,...,...,...,...,...,...,...,...
200,volvo,gas,std,4,sedan,rwd,front,ohc,4,mpfi
201,volvo,gas,turbo,4,sedan,rwd,front,ohc,4,mpfi
202,volvo,gas,std,4,sedan,rwd,front,ohcv,6,mpfi
203,volvo,diesel,turbo,4,sedan,rwd,front,ohc,6,idi


Закодируем числами (кодирование меток) столбец body_style.

In [77]:
#до кодирования меток были такие уникальные значения в столбце body_style
set(obj_df.body_style)

{'convertible', 'hardtop', 'hatchback', 'sedan', 'wagon'}

In [78]:
label_encoder = LabelEncoder()
label_encoder.fit(obj_df.body_style)
le_body_style = label_encoder.transform(obj_df.body_style)
le_body_style

array([0, 0, 2, 3, 3, 3, 3, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 2,
       2, 2, 2, 3, 3, 3, 4, 2, 2, 2, 2, 2, 2, 3, 4, 2, 2, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 3, 2, 3, 3, 2, 3,
       3, 3, 4, 1, 3, 3, 0, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 3, 3, 4, 3, 2, 3, 4, 1, 2, 3, 3, 4, 3, 2, 2, 2, 3, 3, 4,
       4, 3, 3, 4, 4, 3, 3, 3, 2, 2, 2, 3, 3, 4, 2, 2, 1, 1, 0, 2, 4, 2,
       2, 3, 2, 3, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 2, 2, 2, 4,
       4, 4, 3, 2, 3, 2, 3, 2, 3, 3, 2, 3, 2, 1, 1, 2, 1, 2, 0, 3, 3, 2,
       3, 2, 2, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 4, 3, 4, 3, 4,
       3, 4, 3, 3, 3, 3, 3])

In [79]:
#после кодирования меток стали такие уникальные значения в столбце body_style
set(le_body_style)

{0, 1, 2, 3, 4}

Закодируем dummy-перем
енными столбец drive_wheels.

In [80]:
dummies_drive_wheels = pd.get_dummies(obj_df.drive_wheels)
dummies_drive_wheels

Unnamed: 0,4wd,fwd,rwd
0,0,0,1
1,0,0,1
2,0,0,1
3,0,1,0
4,1,0,0
...,...,...,...
200,0,0,1
201,0,0,1
202,0,0,1
203,0,0,1


Выполним двоичное кодирование столбца engine_type: если в автомобиле двигатель OHC, 0 - нет,  1 - да. И отоберем строки, если 1  (то есть "да").

In [81]:
pd.get_dummies(obj_df.engine_type)

Unnamed: 0,dohc,dohcv,l,ohc,ohcf,ohcv,rotor
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,0,0,0,0,0,1,0
3,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...
200,0,0,0,1,0,0,0
201,0,0,0,1,0,0,0
202,0,0,0,0,0,1,0
203,0,0,0,1,0,0,0


In [82]:
dummies_engine_type = pd.get_dummies(obj_df.engine_type).ohc
dummies_engine_type

0      0
1      0
2      0
3      1
4      1
      ..
200    1
201    1
202    0
203    1
204    1
Name: ohc, Length: 205, dtype: uint8

In [83]:
obj_df[dummies_engine_type == 1]

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi
5,audi,gas,std,2,sedan,fwd,front,ohc,5,mpfi
6,audi,gas,std,4,sedan,fwd,front,ohc,5,mpfi
7,audi,gas,std,4,wagon,fwd,front,ohc,5,mpfi
...,...,...,...,...,...,...,...,...,...,...
199,volvo,gas,turbo,4,wagon,rwd,front,ohc,4,mpfi
200,volvo,gas,std,4,sedan,rwd,front,ohc,4,mpfi
201,volvo,gas,turbo,4,sedan,rwd,front,ohc,4,mpfi
203,volvo,diesel,turbo,4,sedan,rwd,front,ohc,6,idi


Применим OrdinalEncoder для марки автомобиля (make).

In [84]:
ord_encoder = OrdinalEncoder()
obj_df.make = ord_encoder.fit_transform(obj_df[["make"]]).astype(int)
obj_df

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,0,gas,std,2,convertible,rwd,front,dohc,4,mpfi
1,0,gas,std,2,convertible,rwd,front,dohc,4,mpfi
2,0,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi
3,1,gas,std,4,sedan,fwd,front,ohc,4,mpfi
4,1,gas,std,4,sedan,4wd,front,ohc,5,mpfi
...,...,...,...,...,...,...,...,...,...,...
200,21,gas,std,4,sedan,rwd,front,ohc,4,mpfi
201,21,gas,turbo,4,sedan,rwd,front,ohc,4,mpfi
202,21,gas,std,4,sedan,rwd,front,ohcv,6,mpfi
203,21,diesel,turbo,4,sedan,rwd,front,ohc,6,idi


Применим  OneHotEncoder  для  (body_style).

In [85]:
onehot_encoder = OneHotEncoder(sparse=False,categories='auto')
ohe_body_style = onehot_encoder.fit_transform(obj_df.body_style.values.reshape(-1, 1)).astype(int)
ohe_body_style

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       ...,
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0]])

Соберем dataframe  с перекодировками.

In [87]:
df_recoded = obj_df
df_recoded

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,0,gas,std,2,convertible,rwd,front,dohc,4,mpfi
1,0,gas,std,2,convertible,rwd,front,dohc,4,mpfi
2,0,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi
3,1,gas,std,4,sedan,fwd,front,ohc,4,mpfi
4,1,gas,std,4,sedan,4wd,front,ohc,5,mpfi
...,...,...,...,...,...,...,...,...,...,...
200,21,gas,std,4,sedan,rwd,front,ohc,4,mpfi
201,21,gas,turbo,4,sedan,rwd,front,ohc,4,mpfi
202,21,gas,std,4,sedan,rwd,front,ohcv,6,mpfi
203,21,diesel,turbo,4,sedan,rwd,front,ohc,6,idi


Заменяем значения в body_style на значения, полученные в OneHotEncoder для body_style.

In [90]:
df_recoded.body_style = ohe_body_style
df_recoded

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,0,gas,std,2,1,rwd,front,dohc,4,mpfi
1,0,gas,std,2,1,rwd,front,dohc,4,mpfi
2,0,gas,std,2,0,rwd,front,ohcv,6,mpfi
3,1,gas,std,4,0,fwd,front,ohc,4,mpfi
4,1,gas,std,4,0,4wd,front,ohc,5,mpfi
...,...,...,...,...,...,...,...,...,...,...
200,21,gas,std,4,0,rwd,front,ohc,4,mpfi
201,21,gas,turbo,4,0,rwd,front,ohc,4,mpfi
202,21,gas,std,4,0,rwd,front,ohcv,6,mpfi
203,21,diesel,turbo,4,0,rwd,front,ohc,6,idi


Заменяем значения в engine_type на значения, полученные двоичным кодированием для engine_type.

In [92]:
df_recoded.engine_type = dummies_engine_type
df_recoded

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,0,gas,std,2,1,rwd,front,0,4,mpfi
1,0,gas,std,2,1,rwd,front,0,4,mpfi
2,0,gas,std,2,0,rwd,front,0,6,mpfi
3,1,gas,std,4,0,fwd,front,1,4,mpfi
4,1,gas,std,4,0,4wd,front,1,5,mpfi
...,...,...,...,...,...,...,...,...,...,...
200,21,gas,std,4,0,rwd,front,1,4,mpfi
201,21,gas,turbo,4,0,rwd,front,1,4,mpfi
202,21,gas,std,4,0,rwd,front,0,6,mpfi
203,21,diesel,turbo,4,0,rwd,front,1,6,idi


Заменяем значения в drive_wheels на значения, полученные dummy-кодированием столбца drive_wheels.

In [94]:
df_recoded = df_recoded.drop(columns=["drive_wheels"])
df_recoded = pd.concat([df_recoded,dummies_drive_wheels], axis=1)
df_recoded

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,4wd,fwd,rwd
0,0,gas,std,2,1,front,0,4,mpfi,0,0,1
1,0,gas,std,2,1,front,0,4,mpfi,0,0,1
2,0,gas,std,2,0,front,0,6,mpfi,0,0,1
3,1,gas,std,4,0,front,1,4,mpfi,0,1,0
4,1,gas,std,4,0,front,1,5,mpfi,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
200,21,gas,std,4,0,front,1,4,mpfi,0,0,1
201,21,gas,turbo,4,0,front,1,4,mpfi,0,0,1
202,21,gas,std,4,0,front,0,6,mpfi,0,0,1
203,21,diesel,turbo,4,0,front,1,6,idi,0,0,1


Добавим столбец price. Проверим, есть ли пропущенные значения, если есть - заменим средним по make.

In [135]:
df_recoded["price"] = df.price
df_recoded

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,4wd,fwd,rwd,price
0,0,gas,std,2,1,front,0,4,mpfi,0,0,1,13495.0
1,0,gas,std,2,1,front,0,4,mpfi,0,0,1,16500.0
2,0,gas,std,2,0,front,0,6,mpfi,0,0,1,16500.0
3,1,gas,std,4,0,front,1,4,mpfi,0,1,0,13950.0
4,1,gas,std,4,0,front,1,5,mpfi,1,0,0,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,21,gas,std,4,0,front,1,4,mpfi,0,0,1,16845.0
201,21,gas,turbo,4,0,front,1,4,mpfi,0,0,1,19045.0
202,21,gas,std,4,0,front,0,6,mpfi,0,0,1,21485.0
203,21,diesel,turbo,4,0,front,1,6,idi,0,0,1,22470.0


In [136]:
df_recoded[df_recoded.price.isna()]

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,4wd,fwd,rwd,price
9,1,gas,turbo,2,0,front,1,5,mpfi,1,0,0,
44,6,gas,std,2,0,front,1,4,2bbl,0,1,0,
45,6,gas,std,4,0,front,1,4,2bbl,0,1,0,
129,15,gas,std,2,0,front,0,8,mpfi,0,0,1,


In [137]:
df_recoded.loc[df_recoded.price.isna(), "price"] = np.round(np.mean(df_recoded.make), 2)
df_recoded[df_recoded.price.isna()]

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,4wd,fwd,rwd,price


In [138]:
df_recoded

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,4wd,fwd,rwd,price
0,0,gas,std,2,1,front,0,4,mpfi,0,0,1,13495.0
1,0,gas,std,2,1,front,0,4,mpfi,0,0,1,16500.0
2,0,gas,std,2,0,front,0,6,mpfi,0,0,1,16500.0
3,1,gas,std,4,0,front,1,4,mpfi,0,1,0,13950.0
4,1,gas,std,4,0,front,1,5,mpfi,1,0,0,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,21,gas,std,4,0,front,1,4,mpfi,0,0,1,16845.0
201,21,gas,turbo,4,0,front,1,4,mpfi,0,0,1,19045.0
202,21,gas,std,4,0,front,0,6,mpfi,0,0,1,21485.0
203,21,diesel,turbo,4,0,front,1,6,idi,0,0,1,22470.0


In [150]:
set(df_recoded.engine_location)

{'front', 'rear'}

Категоризуем переменную price, после чего отоберем самые дорогие машины.

In [139]:
df_recoded.price.describe()

count      205.000000
mean     12949.667317
std       8078.660027
min         12.200000
25%       7689.000000
50%      10198.000000
75%      16500.000000
max      45400.000000
Name: price, dtype: float64

Категоризуем следующим образом: дешевые машины - те, у которых цена ниже 1 квартиля, машины, стоимостью ниже среднего -  те, у которых цена выше 1 квартиля и ниже 2 квартиля,  машины, стоимостью выше среднего -  те, у которых цена выше 2 квартиля и ниже 3 квартиля, дорогие машины - те, у которых цена выше 3 квартиля.

In [140]:
#находим квантили 0, 0.25, 0.5, 0.75, 1 - 0 и 1, чтобы было удобнее в дальнейшем
quants = [df_recoded.price.quantile(i / 4) for i in range(0, 5)]
quants

[12.2, 7689.0, 10198.0, 16500.0, 45400.0]

In [141]:
ranks = ["cheap", "below_the_average", "above_the_average", "expensive"]
ranks

['cheap', 'below_the_average', 'above_the_average', 'expensive']

In [142]:
price_categ = pd.Series([None] * len(df_recoded))
price_categ

0      None
1      None
2      None
3      None
4      None
       ... 
200    None
201    None
202    None
203    None
204    None
Length: 205, dtype: object

In [144]:
for i in range(len(quants) - 1):
    price_categ[(quants[i] < df_recoded.price) & (df_recoded.price <= quants[i + 1])] = ranks[i]

In [147]:
df_recoded["price_categorized"] = price_categ
df_recoded

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,4wd,fwd,rwd,price,price_categorized
0,0,gas,std,2,1,front,0,4,mpfi,0,0,1,13495.0,above_the_average
1,0,gas,std,2,1,front,0,4,mpfi,0,0,1,16500.0,above_the_average
2,0,gas,std,2,0,front,0,6,mpfi,0,0,1,16500.0,above_the_average
3,1,gas,std,4,0,front,1,4,mpfi,0,1,0,13950.0,above_the_average
4,1,gas,std,4,0,front,1,5,mpfi,1,0,0,17450.0,expensive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,21,gas,std,4,0,front,1,4,mpfi,0,0,1,16845.0,expensive
201,21,gas,turbo,4,0,front,1,4,mpfi,0,0,1,19045.0,expensive
202,21,gas,std,4,0,front,0,6,mpfi,0,0,1,21485.0,expensive
203,21,diesel,turbo,4,0,front,1,6,idi,0,0,1,22470.0,expensive


Отберем самые дорогие машины.

In [149]:
df_recoded[df_recoded.price_categorized == "expensive"]

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,4wd,fwd,rwd,price,price_categorized
4,1,gas,std,4,0,front,1,5,mpfi,1,0,0,17450.0,expensive
6,1,gas,std,4,0,front,1,5,mpfi,0,1,0,17710.0,expensive
7,1,gas,std,4,0,front,1,5,mpfi,0,1,0,18920.0,expensive
8,1,gas,turbo,4,0,front,1,5,mpfi,0,1,0,23875.0,expensive
11,2,gas,std,4,0,front,1,4,mpfi,0,0,1,16925.0,expensive
12,2,gas,std,2,0,front,1,6,mpfi,0,0,1,20970.0,expensive
13,2,gas,std,4,0,front,1,6,mpfi,0,0,1,21105.0,expensive
14,2,gas,std,4,0,front,1,6,mpfi,0,0,1,24565.0,expensive
15,2,gas,std,4,0,front,1,6,mpfi,0,0,1,30760.0,expensive
16,2,gas,std,2,0,front,1,6,mpfi,0,0,1,41315.0,expensive
