# Imports


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Load the car listings into the DataFrame every later cell works on.
# The path is relative, so it resolves against the kernel's working directory.
used_cars = pd.read_csv("data/cars.csv")

# Categorical pitfalls

In [3]:
# Print the per-column overview (non-null counts and dtypes) to spot the
# `object` columns that are candidates for categorical encoding.
used_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38531 entries, 0 to 38530
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   manufacturer_name  38531 non-null  object 
 1   model_name         38531 non-null  object 
 2   transmission       38531 non-null  object 
 3   color              38531 non-null  object 
 4   odometer_value     38531 non-null  int64  
 5   year_produced      38531 non-null  int64  
 6   engine_fuel        38531 non-null  object 
 7   engine_has_gas     38531 non-null  bool   
 8   engine_type        38531 non-null  object 
 9   engine_capacity    38521 non-null  float64
 10  body_type          38531 non-null  object 
 11  has_warranty       38531 non-null  bool   
 12  state              38531 non-null  object 
 13  drivetrain         38531 non-null  object 
 14  price_usd          38531 non-null  float64
 15  is_exchangeable    38531 non-null  bool   
 16  location_region    385

In [4]:
# Summarise the manufacturer column: row count, number of distinct values,
# the most frequent value and how often it occurs.
used_cars['manufacturer_name'].describe()

count          38531
unique            55
top       Volkswagen
freq            4243
Name: manufacturer_name, dtype: object

In [5]:
# Pitfall demo: cast `color` to a categorical and restrict it to three
# categories in one chain. Every value outside the list becomes NaN, which is
# why the counts below are taken with dropna=False.
used_cars['color'] = (
    used_cars['color']
    .astype('category')
    .cat.set_categories(['black', 'silver', 'blue'])
)
used_cars['color'].value_counts(dropna=False)


color
NaN       18172
black      7705
silver     6852
blue       5802
Name: count, dtype: int64

In [6]:
# Pitfall (kept commented out): per the note below, summing a column after
# casting it to 'category' does not work.
# used_cars['number_of_photos'] = used_cars['number_of_photos'].astype('category')
# used_cars['number_of_photos'].sum()
# NumPy does not work with categorical data

# Label encoding

In [7]:
# Label-encode the manufacturer: converting the column to a categorical gives
# every distinct name an integer code, namely its position in `.cat.categories`.
used_cars['manufacturer_name'] = used_cars['manufacturer_name'].astype('category')
used_cars['manufacturer_code'] = used_cars['manufacturer_name'].cat.codes
# Build the code -> name lookup directly from the category index instead of
# zipping the two columns row by row: the pairs are the same, but only one
# entry per category is visited rather than one per row.
# Missing values (code -1) get no entry, so `.map(name_map)` yields NaN for them.
name_map = dict(enumerate(used_cars['manufacturer_name'].cat.categories))

In [8]:
# Show the code -> manufacturer-name lookup built in the previous cell.
name_map

{45: 'Subaru',
 24: 'LADA',
 12: 'Dodge',
 54: 'УАЗ',
 23: 'Kia',
 35: 'Opel',
 53: 'Москвич',
 1: 'Alfa Romeo',
 0: 'Acura',
 10: 'Dacia',
 27: 'Lexus',
 33: 'Mitsubishi',
 25: 'Lancia',
 9: 'Citroen',
 32: 'Mini',
 21: 'Jaguar',
 38: 'Porsche',
 44: 'SsangYong',
 11: 'Daewoo',
 15: 'Geely',
 50: 'ВАЗ',
 13: 'Fiat',
 14: 'Ford',
 39: 'Renault',
 42: 'Seat',
 40: 'Rover',
 48: 'Volkswagen',
 28: 'Lifan',
 22: 'Jeep',
 5: 'Cadillac',
 2: 'Audi',
 52: 'ЗАЗ',
 47: 'Toyota',
 51: 'ГАЗ',
 49: 'Volvo',
 7: 'Chevrolet',
 16: 'Great Wall',
 4: 'Buick',
 37: 'Pontiac',
 29: 'Lincoln',
 18: 'Hyundai',
 34: 'Nissan',
 46: 'Suzuki',
 3: 'BMW',
 30: 'Mazda',
 26: 'Land Rover',
 20: 'Iveco',
 43: 'Skoda',
 41: 'Saab',
 19: 'Infiniti',
 6: 'Chery',
 17: 'Honda',
 31: 'Mercedes-Benz',
 36: 'Peugeot',
 8: 'Chrysler'}

In [9]:
# Decode the integer codes back to manufacturer names through the lookup.
# The result is only displayed, not stored on the DataFrame.
used_cars['manufacturer_code'].map(name_map)

0          Subaru
1          Subaru
2          Subaru
3          Subaru
4          Subaru
           ...   
38526    Chrysler
38527    Chrysler
38528    Chrysler
38529    Chrysler
38530    Chrysler
Name: manufacturer_code, Length: 38531, dtype: object

In [10]:
# Binary flag: 1 when `body_type` contains the literal substring 'van'
# (so 'minivan' matches as well), otherwise 0.
# na=False makes a missing body_type count as "not a van"; without it
# str.contains returns NaN for that row and np.where treats NaN as truthy,
# which would code the row as 1.
used_cars['van_code'] = np.where(
    used_cars['body_type'].str.contains('van', regex=False, na=False), 1, 0
)
used_cars['van_code'].value_counts(dropna=False)


van_code
0    34115
1     4416
Name: count, dtype: int64

# One-hot encoding


General syntax: `pd.get_dummies(data=df, columns=["column_name"], prefix="column_name")`

In [14]:
# One-hot encode `color`: the column is replaced by one boolean column per
# category. With an empty prefix and the default '_' separator the new columns
# are named '_black', '_silver' and '_blue'.
# NOTE(review): a non-empty prefix such as 'color' would give clearer names —
# confirm nothing relies on the current ones before changing it.
used_cars_onehot = pd.get_dummies(
    data=used_cars,
    columns=['color'],
    prefix='',
    prefix_sep='_',
)

used_cars_onehot

Unnamed: 0,manufacturer_name,model_name,transmission,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,body_type,...,feature_6,feature_7,feature_8,feature_9,duration_listed,manufacturer_code,van_code,_black,_silver,_blue
0,Subaru,Outback,automatic,190000,2010,gasoline,False,gasoline,2.5,universal,...,False,True,True,True,16,45,0,False,True,False
1,Subaru,Outback,automatic,290000,2002,gasoline,False,gasoline,3.0,universal,...,False,False,False,True,83,45,0,False,False,True
2,Subaru,Forester,automatic,402000,2001,gasoline,False,gasoline,2.5,suv,...,False,False,True,True,151,45,0,False,False,False
3,Subaru,Impreza,mechanical,10000,1999,gasoline,False,gasoline,3.0,sedan,...,False,False,False,False,86,45,0,False,False,True
4,Subaru,Legacy,automatic,280000,2001,gasoline,False,gasoline,2.5,universal,...,False,False,False,True,7,45,0,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,Chrysler,300,automatic,290000,2000,gasoline,False,gasoline,3.5,sedan,...,False,False,True,True,301,8,0,False,True,False
38527,Chrysler,PT Cruiser,mechanical,321000,2004,diesel,False,diesel,2.2,hatchback,...,False,False,True,True,317,8,0,False,False,True
38528,Chrysler,300,automatic,777957,2000,gasoline,False,gasoline,3.5,sedan,...,False,False,True,True,369,8,0,False,False,True
38529,Chrysler,PT Cruiser,mechanical,20000,2001,gasoline,False,gasoline,2.0,minivan,...,False,False,False,True,490,8,1,True,False,False
