In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# The file name has no directory part, so it is resolved against the
# notebook's current working directory.
INSURANCE_CSV = "insurance.csv"
insurance_df = pd.read_csv(INSURANCE_CSV)

In [3]:
# Preview the first ten rows of the raw data.
insurance_df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [4]:
# One independent copy of the raw frame per encoding technique, so each
# section below can modify its own frame without touching `insurance_df`.
(
    insurance_labelencoded,
    insurance_labelencoded_loop,
    insurance_onehotencoded,
    insurance_ordinal,
) = (insurance_df.copy() for _ in range(4))

# Label encoder without loop

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# A single encoder instance is shared by every label-encoding cell below;
# each fit_transform call fits it again on the column it is given.
le = LabelEncoder()

In [7]:
# Preview only: the encoded array is displayed but not assigned, so the
# "sex" column of the frame is left unchanged by this cell.
le.fit_transform(insurance_labelencoded["sex"])

array([0, 1, 1, ..., 0, 0, 0])

In [8]:
# "sex" still holds the original strings because the encoded array from the
# previous cell was never stored.
insurance_labelencoded.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [9]:
# Overwrite "sex" with its integer codes (female -> 0, male -> 1 in the output below).
sex_codes = le.fit_transform(insurance_labelencoded["sex"])
insurance_labelencoded["sex"] = sex_codes
insurance_labelencoded.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [10]:
# Overwrite "smoker" with its integer codes (no -> 0, yes -> 1 in the output below).
smoker_codes = le.fit_transform(insurance_labelencoded["smoker"])
insurance_labelencoded["smoker"] = smoker_codes
insurance_labelencoded.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [11]:
# List the distinct region labels before encoding them.
insurance_labelencoded["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [12]:
# Overwrite "region" with its integer codes; the output below shows
# northwest -> 1, southeast -> 2, southwest -> 3.
region_codes = le.fit_transform(insurance_labelencoded["region"])
insurance_labelencoded["region"] = region_codes
insurance_labelencoded.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


# Performing encoding using loop on condition

In [13]:
# Dtype predicates are taken from pandas' public `pandas.api.types` namespace
# rather than the internal `pandas.core` package, whose layout is not a
# stable interface.
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

In [14]:
# Starting point for the loop-based variant: the text columns are still unencoded.
insurance_labelencoded_loop.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [15]:
# Pick out every column whose dtype is not numeric, then replace each of
# them with the integer codes produced by the shared LabelEncoder.
# Numeric columns are left exactly as they are.
non_numeric_columns = [
    label
    for label in insurance_labelencoded_loop.columns
    if not is_numeric_dtype(insurance_labelencoded_loop[label])
]
for column in non_numeric_columns:
    insurance_labelencoded_loop[column] = le.fit_transform(insurance_labelencoded_loop[column])

insurance_labelencoded_loop.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [16]:
# `is_categorical_dtype` is deprecated in recent pandas releases; checking the
# column's dtype against `pd.CategoricalDtype` is the replacement pandas recommends.
# The raw "sex" column was never converted to a categorical dtype, so this
# evaluates to False.
isinstance(insurance_df["sex"].dtype, pd.CategoricalDtype)

False

# Onehot encoding using pandas

In [17]:
# Separate copy of the raw frame for the pandas-based one-hot example.
insurance_oh_pandas = insurance_df.copy(deep=True)
insurance_oh_pandas.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [18]:
# drop_first=True keeps only the `sex_male` indicator column (see output below).
# The result is displayed here and not merged back into the frame.
sex_dummies = pd.get_dummies(
    insurance_oh_pandas["sex"],
    prefix="sex",
    drop_first=True,
)
sex_dummies

Unnamed: 0,sex_male
0,0
1,1
2,1
3,1
4,1
...,...
1333,1
1334,0
1335,0
1336,0


# Onehot encoding using `sklearn`

In [19]:
from sklearn.preprocessing import OneHotEncoder

In [20]:
# drop="first" leaves the first category of every feature out of the
# encoded output, so a two-level feature yields a single indicator column.
ohe = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
)

In [21]:
# A single selected column is one-dimensional: its shape is (1338,).
insurance_onehotencoded["sex"].shape

(1338,)

In [22]:
# reshape(-1, 1) turns the column values into a two-dimensional array with a
# single column, the layout passed to the encoder in the next cell.
insurance_onehotencoded["sex"].values.reshape(-1,1).shape

(1338, 1)

In [23]:
# Fit the encoder on "sex" alone and show the dense form of the result.
sex_2d = insurance_onehotencoded["sex"].values.reshape(-1, 1)
sex_encoded = ohe.fit_transform(sex_2d)
sex_encoded.toarray()

array([[0.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [24]:
# Categories found by the most recent fit, which used the "sex" column.
ohe.categories_

[array(['female', 'male'], dtype=object)]

In [25]:
# Fit the same encoder again, this time on "smoker" alone, and show the dense result.
smoker_2d = insurance_onehotencoded["smoker"].values.reshape(-1, 1)
smoker_encoded = ohe.fit_transform(smoker_2d)
smoker_encoded.toarray()

array([[1.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

In [26]:
# Categories found by the most recent fit, which used the "smoker" column.
ohe.categories_

[array(['no', 'yes'], dtype=object)]

In [27]:
# Encode the three text columns in one call. The result has five indicator
# columns (see output) and is only displayed here, not stored.
ohe.fit_transform(insurance_onehotencoded[["sex", "smoker", "region"]]).toarray()

array([[0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.]])

In [28]:
# Same encoding as the preview cell, this time stored as a dense array for
# building a DataFrame later on.
categorical_columns = ["sex", "smoker", "region"]
ohe_array = ohe.fit_transform(insurance_onehotencoded[categorical_columns]).toarray()
ohe_array

array([[0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.]])

In [29]:
# One category array per encoded column, in the order sex, smoker, region.
ohe.categories_

[array(['female', 'male'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)]

In [30]:
# Keep a reference to the fitted category arrays; the next cell derives the
# indicator column names from them.
features = ohe.categories_
features

[array(['female', 'male'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)]

In [31]:
# Flatten the category arrays into one list of column names, skipping the
# first entry of each array: those are the levels that drop="first" left out
# of `ohe_array`, so the remaining names line up with its columns.
features_lst = [level for levels in features for level in levels[1:]]
features_lst

['male', 'yes', 'northwest', 'southeast', 'southwest']

In [32]:
# Wrap the encoded array in a DataFrame labelled with the retained category names.
# The source frame's index is reused so that the later column-wise concatenation
# lines up row for row even when that index is not the default 0..n-1 range
# (concat with axis=1 aligns rows on index labels, not on position).
oh_encoded_features = pd.DataFrame(
    ohe_array,
    columns=features_lst,
    index=insurance_onehotencoded.index,
)
oh_encoded_features.head()

Unnamed: 0,male,yes,northwest,southeast,southwest
0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0


In [33]:
# Side-by-side view: the original columns followed by the indicator columns.
pd.concat((insurance_onehotencoded, oh_encoded_features), axis="columns")

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,male,yes,northwest,southeast,southwest
0,19,female,27.900,0,yes,southwest,16884.92400,0.0,1.0,0.0,0.0,1.0
1,18,male,33.770,1,no,southeast,1725.55230,1.0,0.0,0.0,1.0,0.0
2,28,male,33.000,3,no,southeast,4449.46200,1.0,0.0,0.0,1.0,0.0
3,33,male,22.705,0,no,northwest,21984.47061,1.0,0.0,1.0,0.0,0.0
4,32,male,28.880,0,no,northwest,3866.85520,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,1.0,0.0,1.0,0.0,0.0
1334,18,female,31.920,0,no,northeast,2205.98080,0.0,0.0,0.0,0.0,0.0
1335,18,female,36.850,0,no,southeast,1629.83350,0.0,0.0,0.0,1.0,0.0
1336,21,female,25.800,0,no,southwest,2007.94500,0.0,0.0,0.0,0.0,1.0


In [34]:
# Remove the text columns that the indicator columns now stand in for.
columns_to_drop = ['sex', 'smoker', 'region']
dropped_insurace_ohe = insurance_onehotencoded.drop(columns=columns_to_drop)
dropped_insurace_ohe.head()

Unnamed: 0,age,bmi,children,charges
0,19,27.9,0,16884.924
1,18,33.77,1,1725.5523
2,28,33.0,3,4449.462
3,33,22.705,0,21984.47061
4,32,28.88,0,3866.8552


In [35]:
# Final one-hot view: numeric columns plus the indicator columns.
pd.concat((dropped_insurace_ohe, oh_encoded_features), axis="columns")

Unnamed: 0,age,bmi,children,charges,male,yes,northwest,southeast,southwest
0,19,27.900,0,16884.92400,0.0,1.0,0.0,0.0,1.0
1,18,33.770,1,1725.55230,1.0,0.0,0.0,1.0,0.0
2,28,33.000,3,4449.46200,1.0,0.0,0.0,1.0,0.0
3,33,22.705,0,21984.47061,1.0,0.0,1.0,0.0,0.0
4,32,28.880,0,3866.85520,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1.0,0.0,1.0,0.0,0.0
1334,18,31.920,0,2205.98080,0.0,0.0,0.0,0.0,0.0
1335,18,36.850,0,1629.83350,0.0,0.0,0.0,1.0,0.0
1336,21,25.800,0,2007.94500,0.0,0.0,0.0,0.0,1.0


# Ordinal encoding

In [36]:
from sklearn.preprocessing import OrdinalEncoder

In [37]:
# Starting point for the ordinal example: "region" still holds text labels.
insurance_ordinal.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [38]:
# Distinct region labels in order of first appearance in the data; this is
# the category order later handed to the OrdinalEncoder.
city = insurance_ordinal["region"].unique()
city

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [39]:
# Explicit category order: a label's position in `city` is the code it receives
# (southwest -> 0, southeast -> 1, northwest -> 2, northeast -> 3 in the outputs further down).
# NOTE(review): this order is just order of appearance, not a meaningful ranking
# of regions; confirm an ordinal encoding is really intended for this column.
oe = OrdinalEncoder(categories=[city])

In [40]:
# Double brackets select "region" as a one-column frame, which is what is
# passed to the encoder; the result is a 2-D array with one code per row.
region_frame = insurance_ordinal[["region"]]
oe_area = oe.fit_transform(region_frame)
oe_area

array([[0.],
       [1.],
       [1.],
       ...,
       [1.],
       [0.],
       [2.]])

In [41]:
# The fitted categories keep the order that was supplied through `city`.
oe.categories_

[array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)]

In [42]:
# The frame is still unchanged at this point: the codes only live in `oe_area`.
insurance_ordinal.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [43]:
# Replace the text labels with their ordinal codes. `oe_area` has one column,
# so it is flattened to one value per row before being stored; the codes are
# floats, as the output below shows.
region_codes_flat = oe_area.ravel()
insurance_ordinal["region"] = region_codes_flat
insurance_ordinal.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,0.0,16884.924
1,18,male,33.77,1,no,1.0,1725.5523
2,28,male,33.0,3,no,1.0,4449.462
3,33,male,22.705,0,no,2.0,21984.47061
4,32,male,28.88,0,no,2.0,3866.8552
5,31,female,25.74,0,no,1.0,3756.6216
6,46,female,33.44,1,no,1.0,8240.5896
7,37,female,27.74,3,no,2.0,7281.5056
8,37,male,29.83,2,no,3.0,6406.4107
9,60,female,25.84,0,no,2.0,28923.13692
