In [21]:
# steps to do ML:
# 1. Select data
# 2. Preprocess data - formatting, cleaning
# 3. Transform data - scaling, normalization, etc.

In [22]:
import pandas as pd
import numpy as np

In [27]:
# Data handling / preprocessing: load the raw automobile records.

# Column names are supplied here (26 of them, in file order) because the
# read below is told that the file has no header row.
headers = [
    "symboling", "normalized_losses", "make", "fuel_type",
    "aspiration", "num_doors", "body_style", "drive_wheels",
    "engine_location", "wheel_base", "length", "width",
    "height", "curb_weight", "engine_type", "num_cylinders",
    "engine_size", "fuel_system", "bore", "stroke",
    "compression_ratio", "horsepower", "peak_rpm", "city_mpg",
    "highway_mpg", "price",
]

# Any "?" field is read as NaN so missing values show up in isnull() checks.
df = pd.read_csv(
    "data/imports-85.data",
    names=headers,
    header=None,
    sep=',',
    na_values='?',
)
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [28]:
# Sanity-check the load: (rows, columns). The column count should equal the
# number of names in `headers`.
df.shape

(205, 26)

In [29]:
# Inspect the dtype pandas inferred for each column; the object-dtype columns
# are the text-valued features selected for encoding further down.
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [30]:
# Number of distinct values per column (NaN is not counted by default).
df.nunique()

symboling              6
normalized_losses     51
make                  22
fuel_type              2
aspiration             2
num_doors              2
body_style             5
drive_wheels           3
engine_location        2
wheel_base            53
length                75
width                 44
height                49
curb_weight          171
engine_type            7
num_cylinders          7
engine_size           44
fuel_system            8
bore                  38
stroke                36
compression_ratio     32
horsepower            59
peak_rpm              23
city_mpg              29
highway_mpg           30
price                186
dtype: int64

In [31]:
# Keep only the object-dtype (text) columns, as an independent copy so the
# encoding steps applied to `obj_df` do not modify `df`.
obj_df = df.select_dtypes(include=['object']).copy()
obj_df

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi
...,...,...,...,...,...,...,...,...,...,...
200,volvo,gas,std,four,sedan,rwd,front,ohc,four,mpfi
201,volvo,gas,turbo,four,sedan,rwd,front,ohc,four,mpfi
202,volvo,gas,std,four,sedan,rwd,front,ohcv,six,mpfi
203,volvo,diesel,turbo,four,sedan,rwd,front,ohc,six,idi


In [33]:
# Count missing values per column to see which text features need filling.
obj_df.isnull().sum()

make               0
fuel_type          0
aspiration         0
num_doors          2
body_style         0
drive_wheels       0
engine_location    0
engine_type        0
num_cylinders      0
fuel_system        0
dtype: int64

In [34]:
# Identify the records that have a missing value in at least one column.

rows_with_gap = obj_df.isna().any(axis="columns")  # True where a row holds any NaN
obj_df.loc[rows_with_gap]

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
27,dodge,gas,turbo,,sedan,fwd,front,ohc,four,mpfi
63,mazda,diesel,std,,sedan,fwd,front,ohc,four,idi


In [39]:
# List the distinct door-count labels.
# NOTE(review): the saved output contains no NaN although two rows were missing
# in the earlier check, so the fillna cell placed further down (In [38]) seems
# to have been executed before this one — confirm the intended cell order.
obj_df['num_doors'].unique()

array(['two', 'four'], dtype=object)

In [41]:
# Frequency of each cylinder-count label; these spelled-out words are the keys
# mapped to integers in the find-and-replace step.
obj_df['num_cylinders'].value_counts()

four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: num_cylinders, dtype: int64

In [38]:
# Fill the missing num_doors entries with the label "four"; every other column
# is left untouched.
# NOTE(review): "four" is presumably the most frequent door count — confirm
# with obj_df["num_doors"].value_counts().
# NOTE(review): this cell's execution count (In [38]) is lower than that of the
# cells above it; it has to run before the find-and-replace step.
obj_df = obj_df.assign(num_doors=obj_df["num_doors"].fillna("four"))

In [42]:
# Approach 1: find and replace

# Per-column lookup tables: spelled-out count -> integer.
clean = {
    "num_doors": {
        "two": 2,
        "four": 4,
    },
    "num_cylinders": {
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5,
        "six": 6,
        "eight": 8,
        "twelve": 12,
    },
}

# replace() swaps the words for numbers, but whether the columns then become
# int64 depends on pandas silently downcasting object columns, which is
# deprecated since pandas 2.2. Cast the mapped columns explicitly so the result
# is int64 on every version; the cast also fails loudly if a label was left
# unmapped or a NaN is still present.
obj_df = obj_df.replace(clean).astype({column: "int64" for column in clean})

In [44]:
# Confirm that num_doors and num_cylinders are integer columns after the
# word-to-number replacement.
obj_df.dtypes

make               object
fuel_type          object
aspiration         object
num_doors           int64
body_style         object
drive_wheels       object
engine_location    object
engine_type        object
num_cylinders       int64
fuel_system        object
dtype: object

In [45]:
# List the distinct body styles before encoding them.
obj_df['body_style'].unique()

array(['convertible', 'hatchback', 'sedan', 'wagon', 'hardtop'],
      dtype=object)

In [46]:
# Approach 2: Label encoding

# Convert body_style to a categorical, then store the integer code pandas
# assigned to each category in a new column.
body_style = obj_df["body_style"].astype("category")
obj_df["body_style"] = body_style
obj_df["body_style_cat"] = body_style.cat.codes  # Categorical encoding
obj_df["body_style_cat"].unique()

array([0, 2, 3, 4, 1], dtype=int8)

In [47]:
# Check the result of label encoding: body_style should now be categorical and
# body_style_cat should hold its integer codes.
obj_df.dtypes

make                 object
fuel_type            object
aspiration           object
num_doors             int64
body_style         category
drive_wheels         object
engine_location      object
engine_type          object
num_cylinders         int64
fuel_system          object
body_style_cat         int8
dtype: object

In [48]:
# Approach 3: One-hot encoding

# List the drive_wheels categories; each one becomes its own indicator column
# in the get_dummies calls that follow.
obj_df['drive_wheels'].unique()

array(['rwd', 'fwd', '4wd'], dtype=object)

In [49]:
# One-hot encode drive_wheels: the column is replaced by one indicator column
# per category (drive_wheels_<category>). The result is only displayed here,
# not assigned.
# dtype=int keeps the indicators as 0/1; from pandas 2.0 on, get_dummies
# returns booleans (True/False) unless a dtype is given.
pd.get_dummies(obj_df, columns=['drive_wheels'], dtype=int)

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd
0,alfa-romero,gas,std,2,convertible,front,dohc,4,mpfi,0,0,0,1
1,alfa-romero,gas,std,2,convertible,front,dohc,4,mpfi,0,0,0,1
2,alfa-romero,gas,std,2,hatchback,front,ohcv,6,mpfi,2,0,0,1
3,audi,gas,std,4,sedan,front,ohc,4,mpfi,3,0,1,0
4,audi,gas,std,4,sedan,front,ohc,5,mpfi,3,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,volvo,gas,std,4,sedan,front,ohc,4,mpfi,3,0,0,1
201,volvo,gas,turbo,4,sedan,front,ohc,4,mpfi,3,0,0,1
202,volvo,gas,std,4,sedan,front,ohcv,6,mpfi,3,0,0,1
203,volvo,diesel,turbo,4,sedan,front,ohc,6,idi,3,0,0,1


In [50]:
# One-hot encode two columns in a single call. `prefix` is matched to `columns`
# by position, so the indicator columns are named body_<category> and
# drive_<category>. The result is only displayed here, not assigned.
# dtype=int keeps the indicators as 0/1; from pandas 2.0 on, get_dummies
# returns booleans (True/False) unless a dtype is given.
pd.get_dummies(
    obj_df,
    columns=['body_style', 'drive_wheels'],
    prefix=['body', 'drive'],
    dtype=int,
)

Unnamed: 0,make,fuel_type,aspiration,num_doors,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,body_convertible,body_hardtop,body_hatchback,body_sedan,body_wagon,drive_4wd,drive_fwd,drive_rwd
0,alfa-romero,gas,std,2,front,dohc,4,mpfi,0,1,0,0,0,0,0,0,1
1,alfa-romero,gas,std,2,front,dohc,4,mpfi,0,1,0,0,0,0,0,0,1
2,alfa-romero,gas,std,2,front,ohcv,6,mpfi,2,0,0,1,0,0,0,0,1
3,audi,gas,std,4,front,ohc,4,mpfi,3,0,0,0,1,0,0,1,0
4,audi,gas,std,4,front,ohc,5,mpfi,3,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,volvo,gas,std,4,front,ohc,4,mpfi,3,0,0,0,1,0,0,0,1
201,volvo,gas,turbo,4,front,ohc,4,mpfi,3,0,0,0,1,0,0,0,1
202,volvo,gas,std,4,front,ohcv,6,mpfi,3,0,0,0,1,0,0,0,1
203,volvo,diesel,turbo,4,front,ohc,6,idi,3,0,0,0,1,0,0,0,1
