[Reference](https://anshutrivedik.medium.com/categorical-variable-encoding-in-pandas-c63b27638cef)

- pd.Categorical(column_name).codes
- pd.get_dummies(column_name)
- pd.factorize(column_name)[0]

In [1]:
import pandas as pd

In [2]:
# Load the training set directly from the blog's GitHub repository.
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/AnshuTrivedi/Blog/main/Categorical%20Encoding/train.csv'

train = pd.read_csv(TRAIN_CSV_URL)

In [3]:
# Keep only the object-dtype columns of the training data; these are the
# candidates for categorical encoding below.
categorical_cols = train.select_dtypes(include='object')

# Bare expression so the notebook renders the selected frame.
categorical_cols

Unnamed: 0,customer_id,name,gender,owns_car,owns_house,occupation_type
0,CST_115179,ita Bose,F,N,Y,Unknown
1,CST_121920,Alper Jonathan,M,N,Y,Laborers
2,CST_109330,Umesh Desai,M,N,Y,Laborers
3,CST_128288,Rie,F,N,Y,Core staff
4,CST_151355,McCool,M,Y,Y,Core staff
...,...,...,...,...,...,...
45523,CST_130421,Doris,F,N,N,Unknown
45524,CST_136670,Luciana,F,N,Y,Accountants
45525,CST_145435,Jessica,F,N,Y,Core staff
45526,CST_130913,Tessa,M,Y,N,Laborers


# 1. pd.Categorical(col_name).codes


In [4]:
# Skip the first two columns ('customer_id' and 'name') and show the labels of
# the remaining ones. This only displays the sliced column index;
# `categorical_cols` itself is not modified.

categorical_cols.columns[2:]

Index(['gender', 'owns_car', 'owns_house', 'occupation_type'], dtype='object')

In [5]:
# Materialise the labels after the first two columns as a plain Python list
# so the encoding loops below can iterate over them.
columns_list = list(categorical_cols.columns[2:])

# Bare expression so the notebook echoes the list.
columns_list

['gender', 'owns_car', 'owns_house', 'occupation_type']

In [6]:
# Build an encoded copy in one step: every column in `columns_list` is
# replaced by the integer codes of its pd.Categorical representation, while
# the remaining columns are carried over unchanged. `assign` returns a new
# frame, so `categorical_cols` keeps its original string labels.
cat = categorical_cols.assign(
    **{
        name: pd.Categorical(categorical_cols[name]).codes
        for name in columns_list
    }
)

cat

Unnamed: 0,customer_id,name,gender,owns_car,owns_house,occupation_type
0,CST_115179,ita Bose,0,0,1,17
1,CST_121920,Alper Jonathan,1,0,1,8
2,CST_109330,Umesh Desai,1,0,1,8
3,CST_128288,Rie,0,0,1,3
4,CST_151355,McCool,1,1,1,3
...,...,...,...,...,...,...
45523,CST_130421,Doris,0,0,0,17
45524,CST_136670,Luciana,0,0,1,0
45525,CST_145435,Jessica,0,0,1,3
45526,CST_130913,Tessa,1,1,0,8


# 2. pd.get_dummies()


In [7]:
def create_dummies(df, column_name):
    """Return `df` with indicator columns for `column_name` appended.

    The indicator columns come from ``pd.get_dummies`` and are named
    ``<column_name>_<value>``. The source column is kept, and the frame
    passed in is not modified; a new concatenated frame is returned.
    """
    indicator_cols = pd.get_dummies(df[column_name], prefix=column_name)
    # axis=1 places the indicators to the right of the existing columns.
    return pd.concat([df, indicator_cols], axis=1)

# Start from a copy so `categorical_cols` keeps only its original columns.
cat_dummy = categorical_cols.copy()

# Each pass appends the indicator columns for one more feature; the original
# label columns stay in place alongside them.
for feature in columns_list:
    cat_dummy = create_dummies(cat_dummy, feature)

cat_dummy

Unnamed: 0,customer_id,name,gender,owns_car,owns_house,occupation_type,gender_F,gender_M,gender_XNA,owns_car_N,...,occupation_type_Low-skill Laborers,occupation_type_Managers,occupation_type_Medicine staff,occupation_type_Private service staff,occupation_type_Realty agents,occupation_type_Sales staff,occupation_type_Secretaries,occupation_type_Security staff,occupation_type_Unknown,occupation_type_Waiters/barmen staff
0,CST_115179,ita Bose,F,N,Y,Unknown,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,CST_121920,Alper Jonathan,M,N,Y,Laborers,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,CST_109330,Umesh Desai,M,N,Y,Laborers,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,CST_128288,Rie,F,N,Y,Core staff,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,CST_151355,McCool,M,Y,Y,Core staff,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45523,CST_130421,Doris,F,N,N,Unknown,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
45524,CST_136670,Luciana,F,N,Y,Accountants,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
45525,CST_145435,Jessica,F,N,Y,Core staff,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
45526,CST_130913,Tessa,M,Y,N,Laborers,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Same encoding in a single call: hand get_dummies the whole sub-frame and it
# expands every column it contains, prefixing each indicator with its source
# column name.
cat_dummy2 = categorical_cols.copy()

# The positional slice skips the first two columns (customer_id, name).
pd.get_dummies(data=cat_dummy2.iloc[:, 2:])

Unnamed: 0,gender_F,gender_M,gender_XNA,owns_car_N,owns_car_Y,owns_house_N,owns_house_Y,occupation_type_Accountants,occupation_type_Cleaning staff,occupation_type_Cooking staff,...,occupation_type_Low-skill Laborers,occupation_type_Managers,occupation_type_Medicine staff,occupation_type_Private service staff,occupation_type_Realty agents,occupation_type_Sales staff,occupation_type_Secretaries,occupation_type_Security staff,occupation_type_Unknown,occupation_type_Waiters/barmen staff
0,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45523,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
45524,1,0,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
45525,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45526,0,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 3. pd.factorize()


In [9]:
# Encode on a copy so the original string labels remain available.
cat_fac = categorical_cols.copy()

for feature in columns_list:
    # pd.factorize returns (codes, uniques); only the integer codes are kept.
    # Codes follow order of first appearance, which is why they differ from
    # the pd.Categorical codes shown earlier.
    codes = pd.factorize(cat_fac[feature])[0]
    cat_fac[feature] = codes

cat_fac

Unnamed: 0,customer_id,name,gender,owns_car,owns_house,occupation_type
0,CST_115179,ita Bose,0,0,0,0
1,CST_121920,Alper Jonathan,1,0,0,1
2,CST_109330,Umesh Desai,1,0,0,1
3,CST_128288,Rie,0,0,0,2
4,CST_151355,McCool,1,1,0,2
...,...,...,...,...,...,...
45523,CST_130421,Doris,0,0,1,0
45524,CST_136670,Luciana,0,0,0,3
45525,CST_145435,Jessica,0,0,0,2
45526,CST_130913,Tessa,1,1,1,1
