# Handling Categorical Columns without using column transformer

In [174]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split


from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import MinMaxScaler,StandardScaler,RobustScaler,MaxAbsScaler
from sklearn.feature_selection import SelectKBest,chi2 

#display pipeline

from sklearn import set_config
set_config(display='diagram')

In [4]:
df_customer = pd.read_csv('customer.csv')
df_car = pd.read_csv('cars.csv')


In [8]:
df_customer.sample()

Unnamed: 0,age,gender,review,education,purchased
25,57,Female,Good,School,No


In [7]:
df_car.sample()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
2985,Volkswagen,60000,Petrol,Third Owner,360000


In [14]:
#train test split 
x_train, x_test, y_train, y_test = train_test_split(df_customer.drop(columns=['purchased'],axis=1),df_customer.purchased,test_size=0.1,random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((45, 4), (5, 4), (45,), (5,))

In [47]:
# Oridinal encoding on education and review of customer dataframe
# Label encoding on purchased 
# Nominal encoding of gender

oe = OrdinalEncoder(categories=[['Poor','Average','Good'],['School','UG','PG']])
x_train_trf_oe = oe.fit_transform(x_train[['review','education']])
x_test_trf_oe = oe.transform(x_test[['review','education']])

OHE = OneHotEncoder(sparse=False, drop='first')
x_train_trf_OHE = OHE.fit_transform(x_train[['gender']])
x_test_trf_OHE  = OHE.transform(x_test[['gender']])


In [48]:
x_train_trf_oe.shape,  x_test_trf_oe.shape

((45, 2), (5, 2))

In [49]:
x_train_trf_OHE.shape, x_test_trf_OHE.shape

((45, 1), (5, 1))

In [50]:
x_train['age'].values

array([39, 53, 57, 92, 97, 51, 16, 94, 65, 72, 18, 23, 64, 38, 75, 74, 59,
       16, 86, 22, 30, 77, 69, 89, 31, 83, 74, 34, 68, 32, 70, 27, 74, 96,
       39, 98, 18, 19, 25, 57, 60, 30, 15, 48, 45], dtype=int64)

In [57]:
x_train_encoding_columns = np.hstack((x_train[['age']].values,x_train_trf_oe,x_train_trf_OHE))
x_train_encoding_columns

x_test_encoding_columns = np.hstack((x_test[['age']].values,x_test_trf_oe,x_test_trf_OHE))
x_test_encoding_columns


array([[57.,  1.,  0.,  0.],
       [76.,  0.,  2.,  1.],
       [73.,  1.,  1.,  1.],
       [61.,  0.,  2.,  1.],
       [22.,  0.,  1.,  0.]])

In [53]:
#label encoding on target columns
le= LabelEncoder()
y_train_trf_le = le.fit_transform(y_train)
y_test_trf_le = le.transform(y_test)
y_train_trf_le.shape, y_test_trf_le.shape 

((45,), (5,))

### Encoding the categorical of car Dataframe


In [122]:
#dataframe brand name of car changed to uncommon which has count less than 100
car_less_than_100 = df_car.brand.value_counts()[df_car.brand.value_counts() < 100].index
car_less_than_100.nunique()

df_car.brand.replace(car_less_than_100,'uncommon',inplace=True)
df_car.brand.value_counts()

Maruti        2448
Hyundai       1415
Mahindra       772
Tata           734
uncommon       538
Toyota         488
Honda          467
Ford           397
Chevrolet      230
Renault        228
Volkswagen     186
BMW            120
Skoda          105
Name: brand, dtype: int64

In [123]:
#Train test split 
x_train, x_test, y_train, y_test = train_test_split(df_car.drop(columns=['selling_price'],axis=1),df_car.selling_price,test_size=0.1,random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((7315, 4), (813, 4), (7315,), (813,))

In [124]:
# Onehot encoding required on brand,fuel and owner 
OHE = OneHotEncoder(sparse=False, drop='first')
x_train_trf_OHE = OHE.fit_transform(x_train[['brand','fuel','owner']])
x_test_trf_OHE = OHE.transform(x_test[['brand','fuel','owner']])

x_train_trf_OHE.shape, x_test_trf_OHE.shape 

((7315, 19), (813, 19))

In [132]:
x_train_encoded = np.hstack((x_train[['km_driven']].values,x_train_trf_OHE))

x_test_encoded = np.hstack((x_test[['km_driven']].values,x_test_trf_OHE))

x_train_encoded.shape, x_test_encoded.shape

((7315, 20), (813, 20))

# Handling Categorical Columns with using column transformer and Pipeline

In [134]:
df_titanic = pd.read_csv('train.csv')

In [137]:
df_titanic.drop(columns=['PassengerId','Name','Ticket','Cabin'], inplace=True)
# Drop the passengerID, Name, Ticket, Cabin in order to get simple intution of using column transformer  

In [138]:
df_titanic.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
164,0,3,male,1.0,4,1,39.6875,S
791,0,2,male,16.0,0,0,26.0,S
87,0,3,male,,0,0,8.05,S
816,0,3,female,23.0,0,0,7.925,S
586,0,2,male,47.0,0,0,15.0,S


In [141]:
#Train test split the dataframe
x_train, x_test, y_train, y_test = train_test_split(df_titanic.drop(columns=['Survived']),df_titanic.Survived,test_size=0.2,random_state=2)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((712, 7), (179, 7), (712,), (179,))

In [140]:
df_titanic.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [196]:
#Pipe Line steps - Simple Imputation (Age and Embarked column), Encoding (Sex and Embarked), Scaling all the columns,
                #  Feature selection and applying model (algorithm)
#Step 1 Simple imputation

trf1 = ColumnTransformer(transformers=[
    ('Age_imputor',SimpleImputer(),[2]),
    ('Embarked_imputor',SimpleImputer(strategy='most_frequent'),[6])
    
],remainder='passthrough')

In [197]:
#Step 2 OHE on sex and embarked columns

trf2 = ColumnTransformer(transformers=[
    ('OHE_Sex_Embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])    
],remainder='passthrough')

In [203]:
#Step 3 Scaling all the columns

trf3 = ColumnTransformer(transformers=[
    ('Scale',MinMaxScaler(),slice(1,10))    
])


In [216]:
#step 4 Selecting the best feature 

trf4 = SelectKBest(score_func=chi2,k=9)


In [217]:
#step 5 Algorithm 

trf5 = DecisionTreeClassifier()

In [218]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])


In [219]:
pipe.fit(x_train,y_train)

In [220]:
y_pred = pipe.predict(x_test)
accuracy_score(y_test,y_pred)

0.6368715083798883