# Import libraries and read data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy dataset: five people described by three numeric columns plus two
# categorical ones (Qualification is treated as ordinal, Gender as nominal).
dic_data = {
    "Id": [1, 2, 3, 4, 5],
    "Age": [30, 24, 20, 17, 25],
    "salary": [1000, 2000, 1500, 2500, 3000],
    "Qualification": ["Master", "Bachelor", "Master", "PhD", "Bachelor"],
    "Gender": ["Male", "Male", "Female", "Female", "Female"],
}

In [3]:
# Build the working frame: each dict key becomes a column, each list
# position becomes a row.
df = pd.DataFrame(dic_data)

In [4]:
df

Unnamed: 0,Id,Age,salary,Qualification,Gender
0,1,30,1000,Master,Male
1,2,24,2000,Bachelor,Male
2,3,20,1500,Master,Female
3,4,17,2500,PhD,Female
4,5,25,3000,Bachelor,Female


In [5]:
df.dtypes

Id                int64
Age               int64
salary            int64
Qualification    object
Gender           object
dtype: object

![separator2](https://i.imgur.com/4gX5WFr.png)

# 1. Deal with categorical data [ORDINAL]
``` Order is important ```

In [6]:
# Turn Qualification into an ordered categorical: Bachelor < Master < PhD.
# - Any value that is not one of the listed categories becomes NaN.
# - Integer codes follow the order of the list, starting at 0,
#   so Bachelor = 0, Master = 1, PhD = 2.
df['Qualification'] = pd.Categorical(
    df['Qualification'],
    categories=['Bachelor', 'Master', 'PhD'],
    ordered=True,
)

In [7]:
df.Qualification.dtype

CategoricalDtype(categories=['Bachelor', 'Master', 'PhD'], ordered=True)

In [8]:
df.Qualification

0      Master
1    Bachelor
2      Master
3         PhD
4    Bachelor
Name: Qualification, dtype: category
Categories (3, object): ['Bachelor' < 'Master' < 'PhD']

In [9]:
# Replace every label with its integer category code (the column becomes int8).
# Run this cell only once: afterwards the column is no longer categorical,
# so the `.cat` accessor is not available on it any more.
df['Qualification'] = df.Qualification.cat.codes

In [10]:
df['Qualification']

0    1
1    0
2    1
3    2
4    0
Name: Qualification, dtype: int8

In [11]:
df.dtypes

Id                int64
Age               int64
salary            int64
Qualification      int8
Gender           object
dtype: object

In [12]:
df

Unnamed: 0,Id,Age,salary,Qualification,Gender
0,1,30,1000,1,Male
1,2,24,2000,0,Male
2,3,20,1500,1,Female
3,4,17,2500,2,Female
4,5,25,3000,0,Female


In [13]:
# Independent snapshot taken after Qualification has been encoded but while
# Gender still holds its original string labels.
df2 = df.copy(deep=True)

![separator2](https://i.imgur.com/4gX5WFr.png)


# 2. Deal with categorical data [NOMINAL]
``` Not ordered values ```

## 2.1 In case the attribute has only 2 values

In [14]:
# Unordered categorical with exactly two categories; the list order only
# decides the integer codes (Male = 0, Female = 1), it implies no ranking.
df['Gender'] = pd.Categorical(
    df['Gender'],
    categories=['Male', 'Female'],
)

In [15]:
df.Gender.dtype

CategoricalDtype(categories=['Male', 'Female'], ordered=False)

In [18]:
df.Gender

0      Male
1      Male
2    Female
3    Female
4    Female
Name: Gender, dtype: category
Categories (2, object): ['Male', 'Female']

In [19]:
# Replace the Gender labels with their integer category codes.
df['Gender'] = df.Gender.cat.codes

In [20]:
df['Gender']

0    0
1    0
2    1
3    1
4    1
Name: Gender, dtype: int8

In [21]:
df

Unnamed: 0,Id,Age,salary,Qualification,Gender
0,1,30,1000,1,0
1,2,24,2000,0,0
2,3,20,1500,1,1
3,4,17,2500,2,1
4,5,25,3000,0,1


## 2.2 In case the column has two or more values

In [31]:
df2

Unnamed: 0,Id,Age,salary,Qualification,Gender
0,1,30,1000,1,Male
1,2,24,2000,0,Male
2,3,20,1500,1,Female
3,4,17,2500,2,Female
4,5,25,3000,0,Female


In [30]:
# Independent copies of df2 to experiment on, so each test case starts from
# the same frame and none of them modifies df2 itself.
df3, df4, df5, df6 = (df2.copy() for _ in range(4))

In [24]:
df2.dtypes

Id                int64
Age               int64
salary            int64
Qualification      int8
Gender           object
dtype: object

In [26]:
# One-hot encode the remaining string column (Gender); numeric columns are
# passed through unchanged.
# The dtype is pinned explicitly: pandas >= 2.0 returns boolean dummy columns
# by default, whereas older versions returned uint8. Fixing it to uint8 keeps
# the 0/1 indicator columns identical across pandas versions.
df2_categorized = pd.get_dummies(df2, dtype=np.uint8)

In [27]:
df2_categorized

Unnamed: 0,Id,Age,salary,Qualification,Gender_Female,Gender_Male
0,1,30,1000,1,0,1
1,2,24,2000,0,0,1
2,3,20,1500,1,1,0
3,4,17,2500,2,1,0
4,5,25,3000,0,1,0


In [29]:
df2_categorized.dtypes

Id               int64
Age              int64
salary           int64
Qualification     int8
Gender_Female    uint8
Gender_Male      uint8
dtype: object

![separator2](https://i.imgur.com/4gX5WFr.png)


# 3. Columns with unwanted values (must be handled before encoding)

In [38]:
df4

Unnamed: 0,Id,Age,salary,Qualification,Gender
0,1,30,1000,1,Male
1,2,24,2000,0,Male
2,3,20,1500,1,Female
3,4,17,2500,2,Female
4,5,25,3000,0,Female


In [39]:
df4.dtypes

Id                int64
Age               int64
salary            int64
Qualification      int8
Gender           object
dtype: object

In [40]:
# Simulate dirty data: overwrite Gender in the last row (position 4) with a
# label that is neither 'Male' nor 'Female'.
gender_pos = df4.columns.get_loc('Gender')
df4.iloc[4, gender_pos] = 'G'

In [41]:
df4

Unnamed: 0,Id,Age,salary,Qualification,Gender
0,1,30,1000,1,Male
1,2,24,2000,0,Male
2,3,20,1500,1,Female
3,4,17,2500,2,Female
4,5,25,3000,0,G


In [42]:
df4.dtypes

Id                int64
Age               int64
salary            int64
Qualification      int8
Gender           object
dtype: object

In [47]:
# Restrict Gender to the two valid categories; any other value (such as 'G')
# is converted to NaN.
df4['Gender'] = pd.Categorical(
    df4['Gender'],
    categories=['Male', 'Female'],
)

In [48]:
df4

Unnamed: 0,Id,Age,salary,Qualification,Gender
0,1,30,1000,1,Male
1,2,24,2000,0,Male
2,3,20,1500,1,Female
3,4,17,2500,2,Female
4,5,25,3000,0,


In [None]:
# This NaN must be dealt with (using any suitable method, e.g. imputing a value or dropping the row).

In [45]:
# One-hot encode Gender. A row whose Gender is NaN gets 0 in every dummy
# column, because no indicator column is created for missing values.
# The dtype is pinned explicitly: pandas >= 2.0 returns boolean dummy columns
# by default, whereas older versions returned uint8. Fixing it to uint8 keeps
# the 0/1 indicator columns identical across pandas versions.
dfTest = pd.get_dummies(df4, dtype=np.uint8)

In [49]:
dfTest

Unnamed: 0,Id,Age,salary,Qualification,Gender_Male,Gender_Female
0,1,30,1000,1,1,0
1,2,24,2000,0,1,0
2,3,20,1500,1,0,1
3,4,17,2500,2,0,1
4,5,25,3000,0,0,0
