In [35]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Apply seaborn's "darkgrid" style globally so later plots share one look.
sns.set(style='darkgrid')

### Creating a dummy dataset

In [36]:
import pandas as pd
import numpy as np

# Build a small dummy dataset: two categorical columns (Car, Rating) drawn at
# random and one numeric column (Price) with three missing values.

# Fixed seed so the random Car/Rating draws are identical on every
# Restart Kernel -> Run All (the unseeded version produced new data each run).
SEED = 42
rng = np.random.default_rng(SEED)

# Create a list of unique car names
car_names = ["Honda", "Mercedes", "Ferrari", "Lamborgini"]

# Create a list of possible ratings
ratings = ["Good", "Average", "Bad", "Excellent"]

# Hand-written prices; the NaN entries are the values to impute later.
prices = [np.nan, np.nan, 10000, 15000, 20000, np.nan, 12000, 18000, 22000, 25000]

# Derive the row count from the price list so all three columns always have
# the same length.
n_rows = len(prices)

# Car and Rating are sampled with replacement from their candidate lists.
data = {
    "Car": rng.choice(car_names, n_rows),
    "Rating": rng.choice(ratings, n_rows),
    "Price": prices,
}

df = pd.DataFrame(data)

# Display the DataFrame
df.head()

Unnamed: 0,Car,Rating,Price
0,Ferrari,Excellent,
1,Lamborgini,Good,
2,Mercedes,Average,10000.0
3,Ferrari,Excellent,15000.0
4,Mercedes,Average,20000.0


In [37]:
# Count the missing entries in each column
df.isna().sum()

Car       0
Rating    0
Price     3
dtype: int64

In [38]:
# How often each car brand occurs
df["Car"].value_counts()

Ferrari       4
Mercedes      3
Honda         2
Lamborgini    1
Name: Car, dtype: int64

In [39]:
# How often each rating level occurs
df["Rating"].value_counts()

Excellent    3
Good         3
Average      3
Bad          1
Name: Rating, dtype: int64

# 

Conclusion

1. Impute the missing values in `Price`.

2. Apply ordinal encoding to `Rating`.

3. Apply one-hot encoding to `Car`.

Doing each of these steps by hand is tedious, especially when a DataFrame has many columns, so we can use `ColumnTransformer`, which applies all of them in one shot.

# 

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

In [41]:
# One object that preprocesses every column of df. The output columns follow
# the order of the steps below: Price, Rating, then the one-hot Car columns.
transformer = ColumnTransformer(
    transformers=[
        # Replace missing prices with the mean of the observed prices
        # ("mean" is SimpleImputer's default, spelled out for the reader).
        ("transformer_1", SimpleImputer(strategy="mean"), ["Price"]),
        # Map ratings to 0..3 using the explicit worst-to-best order given here.
        (
            "transformer_2",
            OrdinalEncoder(categories=[["Bad", "Average", "Good", "Excellent"]]),
            ["Rating"],
        ),
        # One indicator column per car brand; drop="first" omits the first
        # category so the remaining columns are not redundant.
        ("transformer_3", OneHotEncoder(drop="first"), ["Car"]),
    ],
    remainder="passthrough",
    # OneHotEncoder returns a sparse matrix by default. Its `sparse=False`
    # switch was renamed to `sparse_output` in scikit-learn 1.2 and removed in
    # 1.4, so it is avoided here; sparse_threshold=0 makes the ColumnTransformer
    # always return a dense array, which works on old and new releases.
    sparse_threshold=0,
)

In [42]:
# Fit all three preprocessing steps on df and apply them in the same call
df_transformed = transformer.fit_transform(X=df)
df_transformed

array([[1.74285714e+04, 3.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [1.74285714e+04, 2.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00],
       [1.00000000e+04, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00],
       [1.50000000e+04, 3.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [2.00000000e+04, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00],
       [1.74285714e+04, 2.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [1.20000000e+04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00],
       [1.80000000e+04, 2.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [2.20000000e+04, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [2.50000000e+04, 3.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00]])

In [44]:
# Wrap the transformed array in a DataFrame for easier viewing. No column names
# are passed, so pandas labels the columns with integers, in the transformer's
# output order: imputed Price, encoded Rating, then the one-hot Car columns.
final_df = pd.DataFrame(data=df_transformed)
final_df

Unnamed: 0,0,1,2,3,4
0,17428.571429,3.0,0.0,0.0,0.0
1,17428.571429,2.0,0.0,1.0,0.0
2,10000.0,1.0,0.0,0.0,1.0
3,15000.0,3.0,0.0,0.0,0.0
4,20000.0,1.0,0.0,0.0,1.0
5,17428.571429,2.0,0.0,0.0,0.0
6,12000.0,0.0,0.0,0.0,1.0
7,18000.0,2.0,1.0,0.0,0.0
8,22000.0,1.0,0.0,0.0,0.0
9,25000.0,3.0,1.0,0.0,0.0
