# ENCODING

In [1]:
from  module import load_dataset, get_col_names
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Notebook-wide pandas display settings: show all columns and rows,
# print floats with three decimals, and allow output up to 500 characters wide.
display_settings = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.float_format': lambda x: '%.3f' % x,
    'display.width': 500,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

- Encoding : Changing the representation of variables
1. Label Encoding (Female, Male -> 0,1)
2. One hot Encoder
3. Rare Encoding
- If the labels are ordinal, for example education levels
High School, University, Master, we can encode them as 1, 2, 3 because increasing numbers mean higher education; one-hot encoding can also be used
- If the labels are not ordinal (nominal), for example teams, we can't use 1, 2, 3, ... because models interpret these numbers as magnitudes. The labels have no order or distance between them; they are not sequential. Because of that we can't use label encoding, so we use one-hot encoding

### Label Encoding & Binary Encoding
- If a categorical variable has two categories and is encoded as 0, 1, it is called binary encoding
- If it has more than two categories and is encoded with a label encoder, it is called label encoding

In [3]:
df = load_dataset()

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df["Sex"].head()

0      male
1    female
2    female
3    female
4      male
Name: Sex, dtype: object

- Why encoding?
1. Because machine learning models generally work with numerical representations. They don't understand strings. The algorithms expect a standard format, and we are trying to make the data fit that standard.
2. To improve model prediction performance: turn a class of a categorical variable that may be important into a variable of its own and assign a value to it.

In [6]:
# Create the label encoder; it is kept in `le` so the learned mapping stays available
le = LabelEncoder()

# Learn the categories of the Sex variable and encode its values in one step,
# then show only the first five codes
sex_codes = le.fit_transform(df['Sex'])
sex_codes[:5]

array([1, 0, 0, 0, 1])

- Label Encoder assigns the values according to the alphabetical order of the categories

In [7]:
# Show which category each number represents.
# The mapping is stored in the fitted label encoder, so codes 0 and 1 can be decoded back to labels.
le.inverse_transform([0,1])

array(['female', 'male'], dtype=object)

In [8]:
def label_encoder(df, binary_col):
    """Label-encode one column of a dataframe in place.

    A new LabelEncoder is fitted on the column and the column is then
    overwritten with the resulting integer codes.

    Parameters
    ----------
    df : dataframe
        Dataframe that holds the column; it is modified in place.
    binary_col : str
        Name of the binary (two-class) column to be encoded.

    Returns
    -------
    The same dataframe object, with ``binary_col`` replaced by its codes.
    """
    encoded_values = LabelEncoder().fit_transform(df[binary_col])
    df[binary_col] = encoded_values
    return df

In [9]:
################################
# We can do label encoding in two ways:
# we can either use this method (the label_encoder function) or one-hot encoding.
# To implement one-hot encoding we use the get_dummies() method,
# and if we set drop_first=True, two-class categorical variables are label encoded too.

In [23]:
df = load_dataset()

In [12]:
# Apply the encoding to every binary variable in the dataframe:
# first collect the object-typed columns that have exactly two distinct values.
# nunique() ignores missing values, whereas len(unique()) would count NaN as an extra class.
binary_cols = []
for name in df.columns:
    column = df[name]
    if column.dtype in [object] and column.nunique() == 2:
        binary_cols.append(name)

In [13]:
binary_cols

['Sex']

In [14]:
# Encode every selected binary column. label_encoder overwrites the column
# inside df, so its return value does not need to be captured here.
for col in binary_cols:
    label_encoder(df,col)

In [15]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.283,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [16]:
df = load_dataset(data_url='data/application_train.csv')

In [17]:
df.shape

(307511, 122)

In [18]:
# Collect the object-typed columns with exactly two distinct non-missing values
# (nunique() does not count NaN), then display the resulting list.
binary_cols = []
for name in df.columns:
    column = df[name]
    if column.dtype in [object] and column.nunique() == 2:
        binary_cols.append(name)
binary_cols

['NAME_CONTRACT_TYPE',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'EMERGENCYSTATE_MODE']

In [19]:
df[binary_cols].head()

Unnamed: 0,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,EMERGENCYSTATE_MODE
0,Cash loans,N,Y,No
1,Cash loans,N,N,No
2,Revolving loans,Y,Y,
3,Cash loans,N,Y,
4,Cash loans,N,Y,


In [20]:
# Encode every selected binary column. label_encoder overwrites the column
# inside df, so its return value does not need to be captured here.
for col in binary_cols:
    label_encoder(df,col)

In [22]:
# The NaN values were encoded as 2 (see EMERGENCYSTATE_MODE in the output below).
# Normally missing values are handled before encoding.
# Treating NaN as its own class like this is also a common choice,
# but you should be aware that 2 stands for the missing values here.
df[binary_cols].head()

Unnamed: 0,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,EMERGENCYSTATE_MODE
0,0,0,1,0
1,0,0,0,0
2,1,1,1,2
3,0,0,1,2
4,0,0,1,2


#### nunique vs len(unique)
- If you want to count NaN as a class, you can use len(unique)
- If you want to ignore NaN values, you can use nunique

In [27]:
df=load_dataset()

In [24]:
df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [25]:
df['Embarked'].nunique()

3

In [26]:
len(df['Embarked'].unique())

4