In [34]:
import pandas as pd
import numpy as np

In [35]:
# Read only the five columns this walkthrough works with.
df = pd.read_csv(
    "CAR_DETAILS_FROM_CAR_DEKHO.csv",
    usecols=["name", "seller_type", "transmission", "owner", "fuel"],
)

In [36]:
# Preview the first rows to confirm the selected columns loaded as expected.
df.head()

Unnamed: 0,name,fuel,seller_type,transmission,owner
0,Maruti 800 AC,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,Diesel,Individual,Manual,Second Owner


# Categorical Data

### Categorical data means data that represents categories (groups/labels) instead of numbers.

### Examples:

#### Fuel type → Petrol, Diesel, CNG, Electric
#### Pen type → Gel, Ballpoint, Fountain
#### Transmission → Manual, Automatic
#### Owner → First, Second, Third

### Types of categorical data:

#### Nominal (no order) → e.g., Fuel (Petrol, Diesel, CNG)
#### Ordinal (with order) → e.g., Owner (First, Second, Third)

# OrdinalEncoder

### Where is it used?
#### For input features (X) that are categorical.

### What does it do?
#### Converts each category into a unique number.

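#### Example: Transmission → {Automatic: 0, Manual: 1}
#### Owner → {First: 0, Second: 1, Third: 2}
#### (By default scikit-learn numbers the categories in sorted order; a meaningful order such as First < Second < Third must be supplied through the `categories` parameter.)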

#### Note: Can be applied to several input-feature columns at once (it expects 2-D input).

# LabelEncoder

### Where is it used?
#### Mostly for the target variable (y) that is categorical.

### What does it do?
#### Converts target categories into numbers.

#### Example: Fuel → {CNG: 0, Diesel: 1, Electric: 2, Petrol: 3} (classes are numbered in sorted order)

#### Note: Applied to a single column (the 1-D output/label).

# Convert Categories into Numbers using Python

In [37]:
# Remove the free-text "name" column; it is not encoded in this walkthrough.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it after "name" has already been
# dropped no longer raises a KeyError.
df = df.drop(columns="name", errors="ignore")

In [23]:
# df.head()

In [24]:
# df['seller_type'].unique()

In [25]:
# df["seller_type"].value_counts()

In [26]:
# df["seller_type"] = df["seller_type"].map({"Individual":0,"Dealer":1,"Trustmark Dealer":2})

In [27]:
# df.head()

In [28]:
# df.sample(5) #Random 5 rows

In [29]:
# df["transmission"] = df["transmission"].apply(lambda x : 1 if x == "Manual" else 0)

In [30]:
# df.sample(5)

In [38]:
# Check the frame after "name" was removed: only the four categorical columns should remain.
df.head()

Unnamed: 0,fuel,seller_type,transmission,owner
0,Petrol,Individual,Manual,First Owner
1,Petrol,Individual,Manual,First Owner
2,Diesel,Individual,Manual,First Owner
3,Petrol,Individual,Manual,First Owner
4,Diesel,Individual,Manual,Second Owner


# Using scikit-learn

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
# Separate the target ("fuel") from the predictor columns.
y = df["fuel"]
X = df.drop(columns="fuel")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [42]:
from sklearn.preprocessing import LabelEncoder , OrdinalEncoder

# Applying LabelEncoder

In [66]:
# Encoder for the single 1-D target column; it maps each distinct label to an integer code.
le = LabelEncoder()

In [74]:
# Learn the label -> integer mapping from the training target only, then apply
# that same mapping to both splits so train and test share identical codes.
# Note: transform() raises ValueError if y_test holds a label absent from y_train.
le.fit(y_train)
y_train_traf = le.transform(y_train)
y_test_traf = le.transform(y_test)

In [75]:
# Inspect the encoded training labels (a NumPy array of integer class codes).
y_train_traf

array([1, 1, 4, ..., 1, 1, 4])

In [76]:
# Wrap the encoded labels in a one-column DataFrame. The column name is taken
# from the original target Series (y_train.name) instead of a hard-coded
# string such as "Fuel", so it always matches the source column.
y_train_traf = pd.DataFrame(y_train_traf, columns=[y_train.name])

In [77]:
# Display the encoded labels, now held in a one-column DataFrame.
y_train_traf

Unnamed: 0,fuel
0,1
1,1
2,4
3,1
4,1
...,...
3467,1
3468,1
3469,1
3470,1


# Applying OrdinalEncoder

In [71]:
# Fit the encoder on the training features only, then encode both splits with
# the categories learned from training data.
# NOTE(review): with the default settings the integer codes follow the sorted
# order of each column's categories (in the output below 'Second Owner' -> 2),
# not a semantic First < Second < Third order; pass `categories=[...]`
# explicitly if the numeric order is meant to carry meaning.
oe = OrdinalEncoder().fit(X_train)

X_train_traf = oe.transform(X_train)
X_test_traf = oe.transform(X_test)

In [73]:
# Show the original (un-encoded) training rows for comparison with the encoded values.
X_train.head(3)

Unnamed: 0,seller_type,transmission,owner
227,Individual,Manual,First Owner
964,Individual,Manual,First Owner
2045,Individual,Manual,Second Owner


In [78]:
# The encoder returns a plain float ndarray: column names and the original row index are not carried over.
X_train_traf

array([[1., 1., 0.],
       [1., 1., 0.],
       [1., 1., 2.],
       ...,
       [0., 1., 0.],
       [1., 1., 0.],
       [1., 1., 0.]])

In [79]:
# Put the encoded array back into a DataFrame under the original column names.
# NOTE(review): the row index restarts at 0 here, so it no longer matches
# X_train's original index (227, 964, 2045, ...).
X_train_traf = pd.DataFrame(X_train_traf, columns=X_train.columns)

In [81]:
# Same first three rows as X_train.head(3), now shown as numeric codes under the original column names.
X_train_traf.head(3)

Unnamed: 0,seller_type,transmission,owner
0,1.0,1.0,0.0
1,1.0,1.0,0.0
2,1.0,1.0,2.0
