In [228]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [153]:
# Load the used-car listings from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='cars.csv')

In [154]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [155]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   brand          8128 non-null   object
 1   km_driven      8128 non-null   int64 
 2   fuel           8128 non-null   object
 3   owner          8128 non-null   object
 4   selling_price  8128 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 317.6+ KB


In [156]:
# Keep only the columns whose dtype is `object` (the string-valued features)
# by indexing with a boolean column mask, then preview the result.
categorical_cols = df.loc[:, df.dtypes == 'object']
categorical_cols.head()

Unnamed: 0,brand,fuel,owner
0,Maruti,Diesel,First Owner
1,Skoda,Diesel,Second Owner
2,Honda,Petrol,Third Owner
3,Hyundai,Diesel,First Owner
4,Maruti,Petrol,First Owner


In [157]:
# Report the cardinality and the distinct values of every categorical column.
# Iterating a DataFrame yields its column labels.
for cols in categorical_cols:
    unique_counts, unique_values = df[cols].nunique(), df[cols].unique()
    # One print call with a newline separator emits the same three lines.
    print(
        f"No. of unique values in {cols} columns: {unique_counts}",
        f"List of unique values in {cols} columns: {unique_values}",
        "---------------------------------------------------------",
        sep="\n",
    )

No. of unique values in brand columns: 32
List of unique values in brand columns: ['Maruti' 'Skoda' 'Honda' 'Hyundai' 'Toyota' 'Ford' 'Renault' 'Mahindra'
 'Tata' 'Chevrolet' 'Fiat' 'Datsun' 'Jeep' 'Mercedes-Benz' 'Mitsubishi'
 'Audi' 'Volkswagen' 'BMW' 'Nissan' 'Lexus' 'Jaguar' 'Land' 'MG' 'Volvo'
 'Daewoo' 'Kia' 'Force' 'Ambassador' 'Ashok' 'Isuzu' 'Opel' 'Peugeot']
---------------------------------------------------------
No. of unique values in fuel columns: 4
List of unique values in fuel columns: ['Diesel' 'Petrol' 'LPG' 'CNG']
---------------------------------------------------------
No. of unique values in owner columns: 5
List of unique values in owner columns: ['First Owner' 'Second Owner' 'Third Owner' 'Fourth & Above Owner'
 'Test Drive Car']
---------------------------------------------------------


#### One-hot encoding using `pd.get_dummies()`, a built-in pandas function

In [159]:
# One-hot encode only "fuel" and "owner"; "brand" is passed through unchanged.
pd.get_dummies(data=categorical_cols, columns=["fuel", "owner"])

Unnamed: 0,brand,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,False,True,False,False,True,False,False,False,False
1,Skoda,False,True,False,False,False,False,True,False,False
2,Honda,False,False,False,True,False,False,False,False,True
3,Hyundai,False,True,False,False,True,False,False,False,False
4,Maruti,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,False,False,False,True,True,False,False,False,False
8124,Hyundai,False,True,False,False,False,True,False,False,False
8125,Maruti,False,True,False,False,True,False,False,False,False
8126,Tata,False,True,False,False,True,False,False,False,False


#### Dropping one column per feature (keeping K - 1 columns for K categories) with `drop_first=True`

In [161]:
# Same encoding as before, but drop the first level of each feature (K - 1
# indicator columns for K categories) and emit 0/1 integers instead of booleans.
pd.get_dummies(
    data=categorical_cols,
    columns=["fuel", "owner"],
    dtype=int,
    drop_first=True,
)

Unnamed: 0,brand,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,1,0,0,0,0,0,0
1,Skoda,1,0,0,0,1,0,0
2,Honda,0,0,1,0,0,0,1
3,Hyundai,1,0,0,0,0,0,0
4,Maruti,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
8123,Hyundai,0,0,1,0,0,0,0
8124,Hyundai,1,0,0,1,0,0,0
8125,Maruti,1,0,0,0,0,0,0
8126,Tata,1,0,0,0,0,0,0


#### If we want binary values like 0 for false and 1 for true, we use dtype=int

### The problem with this method is that it does not "remember" the column structure. This can be problematic when training machine learning models.

#### It creates one-hot encoded columns based on the categories present in the dataset.
#### If a category is missing in the test data (or new data), pd.get_dummies() won't create the same set of columns as it did for the training data.
#### Conversely, if the test data contains a new category, pd.get_dummies() does not raise an error: it silently creates an extra column for it, so the test features no longer match the columns the model was trained on.

#### So a better approach for encoding nominal (unordered) categories is to use scikit-learn's OneHotEncoder, which learns the categories during `fit` and reuses them in `transform`

In [166]:
# Quick reminder of the column layout before building X and y.
df.head(n=2)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000


<h3 style='color: blue'>OneHotEncoding</h3>

In [168]:
# Split the frame into predictors and target: the first four columns are the
# features and `selling_price` is the value to predict.
X = df.iloc[:, :4]
y = df["selling_price"]

In [169]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [188]:
# Look at the raw frame once more before encoding.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [296]:
# Fit the encoder on the training split only, then reuse the fitted encoder on
# the test split so both are transformed with the same learned categories.
# drop='first' removes one indicator column per feature; sparse_output=False
# returns a dense NumPy array.
ohe = OneHotEncoder(drop='first', dtype='int32', sparse_output=False)
encoded_X_train = ohe.fit_transform(X_train.loc[:, ["fuel", "owner"]])
encoded_X_test = ohe.transform(X_test.loc[:, ["fuel", "owner"]])

In [298]:
# Inspect the one-hot encoded training matrix.
encoded_X_train

array([[0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [256]:
# The columns that were not encoded, as a NumPy array (dtype=object because
# strings and integers are mixed).
X_train[["brand", "km_driven"]].to_numpy()

array([['Hyundai', 60000],
       ['Tata', 150000],
       ['Hyundai', 110000],
       ...,
       ['Hyundai', 90000],
       ['Volkswagen', 90000],
       ['Hyundai', 110000]], dtype=object)

In [300]:
# Place the untouched columns and the one-hot columns side by side.
# Mixing strings with integers makes the combined array dtype=object.
encoded_df = np.concatenate(
    [X_train[["brand", "km_driven"]].to_numpy(), encoded_X_train],
    axis=1,
)

In [312]:
# Inspect the combined array: brand, km_driven, then the one-hot columns.
encoded_df

array([['Hyundai', 60000, 0, ..., 0, 0, 0],
       ['Tata', 150000, 1, ..., 0, 0, 1],
       ['Hyundai', 110000, 1, ..., 1, 0, 0],
       ...,
       ['Hyundai', 90000, 0, ..., 1, 0, 0],
       ['Volkswagen', 90000, 1, ..., 0, 0, 0],
       ['Hyundai', 110000, 0, ..., 0, 0, 0]], dtype=object)

#### One-hot encoding a feature with a high number of categories

In [337]:
# Number of listings per brand.
counts = df.brand.value_counts()

In [327]:
# Brands with at most this many listings are treated as rare when the
# `brand` column is encoded.
threshold = 100

# Keep this expression last: a notebook cell only displays the value of its
# final expression, so this shows the number of distinct brands.
df.brand.nunique()

In [349]:
# Collect the brands whose frequency does not exceed the threshold, relabel
# them all as "Uncommon", and one-hot encode the reduced brand column.
repl = counts.loc[counts.le(threshold)].index
pd.get_dummies(data=df["brand"].replace(repl, "Uncommon"), dtype='int32')

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Uncommon,Volkswagen
0,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,True,False,False,False
