In [1]:
import pandas as pd
import numpy as np
import warnings as wr
wr.filterwarnings('ignore')

In [2]:
# Read the insurance dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='insurance.csv')
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# LabelEncoder

In [3]:
# Work on a copy so the loaded frame stays untouched, then list the distinct region labels.
df1 = df.copy(deep=True)
df1['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [4]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the region labels and swap the column for its integer codes.
le = LabelEncoder()
region_codes = le.fit_transform(df1['region'])
df1['region'] = region_codes
df1.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,3,16884.924
1,18,male,33.77,1,no,2,1725.5523
2,28,male,33.0,3,no,2,4449.462
3,33,male,22.705,0,no,1,21984.47061
4,32,male,28.88,0,no,1,3866.8552


In [5]:
# Show the labels the encoder learned; a label's position in this array is its integer code.
print(str(le.classes_))

['northeast' 'northwest' 'southeast' 'southwest']


# Using Loop to Encode categorical columns

In [10]:
# Fresh copy for the loop-based encoding; inspect which columns are non-numeric (object).
df2 = df.copy(deep=True)
df2.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [12]:
# Label-encode every non-numeric column of df2, leaving numeric columns untouched.
#
# The numeric check uses pandas' dtype helper rather than `dtype == np.number`:
# comparing a concrete dtype with the abstract `np.number` class relies on a
# deprecated NumPy conversion (which is why a separate 'int64' branch was needed),
# and where that conversion is rejected the comparison is False, so float columns
# would be label-encoded by mistake.
for col in df2.columns:
    if pd.api.types.is_numeric_dtype(df2[col]):
        continue
    # `le` is refit on each column, so afterwards it only remembers the classes
    # of the last column it encoded.
    df2[col] = le.fit_transform(df2[col])

df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [13]:
df3 = df.copy()
# Import the dtype helper from the public `pandas.api.types` namespace;
# `pandas.core.*` is internal and may move between pandas releases.
from pandas.api.types import is_numeric_dtype

# Label-encode each non-numeric column; numeric columns are skipped unchanged.
for c in df3.columns:
    if is_numeric_dtype(df3[c]):
        continue
    # `le` is refit per column, so it only retains the classes of the last one.
    df3[c] = le.fit_transform(df3[c])


df3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


# One-hot encoding (pd.get_dummies)

In [14]:
# Build one indicator column per region label (no level is dropped here).
df4 = df.copy(deep=True)
dummy = pd.get_dummies(df4.region)  # equivalent: pd.get_dummies(df4['region'])
dummy.head(n=5)

Unnamed: 0,northeast,northwest,southeast,southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0


In [15]:
# Remove the original text column; its indicator columns are attached in a later cell.
df4 = df4.drop(columns=['region'])
df4.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


In [16]:
# NOTE(review): df5 is created here but this cell builds the dummies from `df` directly.
df5 = df.copy(deep=True)
# drop_first removes the first level so the remaining indicators are not redundant;
# each column is named 'location_<label>'.
dummy2 = pd.get_dummies(df['region'], prefix='location', drop_first=True)
dummy2.head(n=5)

Unnamed: 0,location_northwest,location_southeast,location_southwest
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0


In [17]:
# Attach the region indicator columns to the right of the remaining features.
df4 = pd.concat((df4, dummy2), axis='columns')
df4.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,charges,location_northwest,location_southeast,location_southwest
0,19,female,27.9,0,yes,16884.924,0,0,1
1,18,male,33.77,1,no,1725.5523,0,1,0
2,28,male,33.0,3,no,4449.462,0,1,0
3,33,male,22.705,0,no,21984.47061,1,0,0
4,32,male,28.88,0,no,3866.8552,1,0,0


# Using a loop to one-hot encode every categorical column

In [19]:
df6 = df.copy()

# One-hot encode every non-numeric column of df6.
# Each text column is dropped by name and its indicator block is collected, then all
# blocks are appended in a single concat, so the numeric columns come first and the
# indicators follow in original column order.
# NOTE: every block shares the prefix 'code'; two columns containing the same label
# would produce duplicate column names.
indicator_blocks = []
for i in df6.columns:
    if is_numeric_dtype(df6[i]):
        continue
    oneHot = pd.get_dummies(df6[i], drop_first=True, prefix='code')
    indicator_blocks.append(oneHot)
    # Drop by column label rather than passing a DataFrame as the labels argument.
    df6 = df6.drop(columns=[i])

df6 = pd.concat([df6, *indicator_blocks], axis=1)

df6.head()

Unnamed: 0,age,bmi,children,charges,code_male,code_yes,code_northwest,code_southeast,code_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


# OrdinalEncoder

In [21]:
from sklearn.preprocessing import OrdinalEncoder

# Capture the region labels in order of first appearance; this order is handed to the
# encoder in the next cell.
df7 = df.copy(deep=True)
location = df7['region'].unique()
location

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [22]:
# Encode region using the explicit category order held in `location`:
# a label's position in that array becomes its code.
oe = OrdinalEncoder(categories=[location])
oe_encoded = oe.fit(df7[['region']]).transform(df7[['region']])
oe_encoded

array([[0.],
       [1.],
       [1.],
       ...,
       [1.],
       [0.],
       [2.]])

In [23]:
# Replace the text column with its ordinal codes.
# The encoder output is a single-column 2-D array, so flatten it and assign by position.
# Wrapping it in a new DataFrame would align on a fresh 0..n-1 index and insert NaN
# whenever df7's index is not the default range.
df7['region'] = oe_encoded.ravel()
df7.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,0.0,16884.924
1,18,male,33.77,1,no,1.0,1725.5523
2,28,male,33.0,3,no,1.0,4449.462
3,33,male,22.705,0,no,2.0,21984.47061
4,32,male,28.88,0,no,2.0,3866.8552


# Using a loop with OrdinalEncoder

In [28]:
# Fresh copy for the loop-based ordinal encoding; list the region labels in order of appearance.
df8 = df.copy(deep=True)
region = df8.region.unique()
region

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [29]:
# Ordinal-encode every non-numeric column of df8.
# For each column the category order is the order in which labels first appear,
# so the resulting codes reflect appearance order, not any inherent ranking.
for j in df8.columns:
    if is_numeric_dtype(df8[j]):
        continue
    categories_oe2 = df8[j].unique()
    oe2 = OrdinalEncoder(categories=[categories_oe2])
    oe2_encoded = oe2.fit_transform(df8[[j]])
    # Flatten the single-column result and assign by position; assigning a new
    # DataFrame would align on a fresh 0..n-1 index and give NaN for other indexes.
    df8[j] = oe2_encoded.ravel()

df8.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,0.0,0.0,16884.924
1,18,1.0,33.77,1,1.0,1.0,1725.5523
2,28,1.0,33.0,3,1.0,1.0,4449.462
3,33,1.0,22.705,0,1.0,2.0,21984.47061
4,32,1.0,28.88,0,1.0,2.0,3866.8552
