In [33]:
# Encoding categorical values
# Encoding means converting object-type (text) columns into numeric values that a model can work with.
import pandas as pd
import numpy as np

# Column names for the UCI "imports-85" automobile data set. The raw file
# ships without a header row, so the 26 names are supplied by hand.
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Location of the raw data file (fetched over the network on every run).
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# header=None: the first line of the file is data, not column names.
# na_values="?": the file marks missing entries with "?", parse them as NaN.
df = pd.read_csv(DATA_URL, header=None, names=headers, na_values="?")

# Only the final expression of a cell is rendered, so a bare `df.head()`
# placed before this line would be evaluated and silently thrown away.
# Report the shape as the cell's single output.
df.shape

(205, 26)

In [34]:
# Keep only the object-dtype (text) columns. The explicit .copy() gives
# obj_df its own data, so the edits made to it later never touch `df`.
text_only = df.select_dtypes(include="object")
obj_df = text_only.copy()
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [9]:
# num_doors has missing values; fill them with the string "four" for
# simplicity. Only the columns named in the mapping are filled.
missing_value_defaults = {"num_doors": "four"}
obj_df = obj_df.fillna(missing_value_defaults)

In [11]:
# Some columns spell numbers out as words; DataFrame.replace can swap each
# word for its integer. The outer keys are column names and the inner
# dicts map {word: integer} for that column only.
cleanup_nums = {
    "num_doors": {"four": 4, "two": 2},
    "num_cylinders": {"four": 4, "six": 6, "five": 5, "eight": 8,
                      "two": 2, "twelve": 12, "three": 3},
}
obj_df = obj_df.replace(cleanup_nums)
obj_df.head()  # num_doors and num_cylinders now hold integers

  obj_df = obj_df.replace(cleanup_nums)


Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi


In [47]:
# Label encoding
# body_style has only a few distinct values, so each one can be given an
# integer label. First step: store the column with pandas' category dtype.
body_style_as_category = obj_df["body_style"].astype("category")
obj_df["body_style"] = body_style_as_category
obj_df.dtypes  # body_style is now listed as `category`

make                 object
fuel_type            object
aspiration           object
num_doors            object
body_style         category
drive_wheels         object
engine_location      object
engine_type          object
num_cylinders        object
fuel_system          object
dtype: object

In [51]:
# .cat.codes is the built-in accessor that returns the integer code pandas
# assigned to each category; store it next to the original column.
body_style_codes = obj_df["body_style"].cat.codes
obj_df["body_style_cat"] = body_style_codes
obj_df.head(10)

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi,0
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi,0
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi,2
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi,3
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi,3
5,audi,gas,std,two,sedan,fwd,front,ohc,five,mpfi,3
6,audi,gas,std,four,sedan,fwd,front,ohc,five,mpfi,3
7,audi,gas,std,four,wagon,fwd,front,ohc,five,mpfi,4
8,audi,gas,turbo,four,sedan,fwd,front,ohc,five,mpfi,3
9,audi,gas,turbo,two,hatchback,4wd,front,ohc,five,mpfi,2


In [59]:
# One-hot encoding
# Each distinct value of the column becomes its own new column holding a
# 1 or 0 (True/False) flag. dtype=int asks for 0/1 integers rather than
# the boolean default.
drive_wheels_encoded = pd.get_dummies(
    obj_df,
    columns=["drive_wheels"],
    dtype=int,
)
drive_wheels_encoded.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd
0,alfa-romero,gas,std,two,convertible,front,dohc,four,mpfi,0,0,0,1
1,alfa-romero,gas,std,two,convertible,front,dohc,four,mpfi,0,0,0,1
2,alfa-romero,gas,std,two,hatchback,front,ohcv,six,mpfi,2,0,0,1
3,audi,gas,std,four,sedan,front,ohc,four,mpfi,3,0,1,0
4,audi,gas,std,four,sedan,front,ohc,five,mpfi,3,1,0,0


In [63]:
# One-hot encoding several columns in a single call.
# `prefix` is matched to `columns` by position and replaces the column name
# at the front of each generated column (e.g. body_sedan, drive_fwd).
columns_to_encode = ["body_style", "drive_wheels"]
column_prefixes = ["body", "drive"]
multi_encoded = pd.get_dummies(
    obj_df,
    columns=columns_to_encode,
    prefix=column_prefixes,
    dtype=int,
)
multi_encoded.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,body_convertible,body_hardtop,body_hatchback,body_sedan,body_wagon,drive_4wd,drive_fwd,drive_rwd
0,alfa-romero,gas,std,two,front,dohc,four,mpfi,0,1,0,0,0,0,0,0,1
1,alfa-romero,gas,std,two,front,dohc,four,mpfi,0,1,0,0,0,0,0,0,1
2,alfa-romero,gas,std,two,front,ohcv,six,mpfi,2,0,0,1,0,0,0,0,1
3,audi,gas,std,four,front,ohc,four,mpfi,3,0,0,0,1,0,0,1,0
4,audi,gas,std,four,front,ohc,five,mpfi,3,0,0,0,1,0,1,0,0


In [65]:
# Custom binary encoding: flag one property with 1 and everything else
# with 0, instead of creating a column per category.
# str.contains does substring matching, so "ohc" also matches values such
# as "dohc" and "ohcv". regex=False treats the pattern as plain text.
# na=False makes a missing engine_type count as "no match"; without it the
# NaN result is truthy inside np.where and would be encoded as 1.
has_ohc = obj_df["engine_type"].str.contains("ohc", regex=False, na=False)
obj_df["OHC_Code"] = np.where(has_ohc, 1, 0)

In [75]:
# Spot-check the new flag against its source column for the first 60 rows.
ohc_preview_columns = ["make", "engine_type", "OHC_Code"]
obj_df[ohc_preview_columns].head(60)

Unnamed: 0,make,engine_type,OHC_Code
0,alfa-romero,dohc,1
1,alfa-romero,dohc,1
2,alfa-romero,ohcv,1
3,audi,ohc,1
4,audi,ohc,1
5,audi,ohc,1
6,audi,ohc,1
7,audi,ohc,1
8,audi,ohc,1
9,audi,ohc,1


In [79]:
# scikit-learn version of label encoding
from sklearn.preprocessing import OrdinalEncoder

# The encoder expects 2-D input, hence the double brackets that select a
# one-column DataFrame rather than a Series.
ord_enc = OrdinalEncoder()
make_codes = ord_enc.fit_transform(obj_df[["make"]])
obj_df["make_code"] = make_codes

# Show each make next to the code it received (codes come back as floats).
obj_df.loc[:, ["make", "make_code"]].head(11)

Unnamed: 0,make,make_code
0,alfa-romero,0.0
1,alfa-romero,0.0
2,alfa-romero,0.0
3,audi,1.0
4,audi,1.0
5,audi,1.0
6,audi,1.0
7,audi,1.0
8,audi,1.0
9,audi,1.0


In [81]:
# scikit-learn version of one-hot encoding
from sklearn.preprocessing import OneHotEncoder

# The encoder expects 2-D input, hence the one-column DataFrame selection.
oe_style = OneHotEncoder()
oe_results = oe_style.fit_transform(obj_df[["body_style"]])

# categories_ holds one array of labels per encoded input column. Only one
# column was encoded, so take element 0 to get flat column labels; passing
# the whole list would make pandas build a one-level MultiIndex instead.
# .toarray() turns the encoder's result into a dense array for display.
pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_[0]).head()

Unnamed: 0,convertible,hardtop,hatchback,sedan,wagon
0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0
