# **Imputing categorical data**

**1. Replacing missing data with mode**

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.impute import SimpleImputer

In [3]:
data = pd.read_csv(
    "../files/data_science_job.csv", usecols=["enrolled_university", "education_level"]
)

In [4]:
data.isnull().mean()*100

enrolled_university    2.014824
education_level        2.401086
dtype: float64

In [5]:
data.sample(5)

Unnamed: 0,enrolled_university,education_level
11299,no_enrollment,High School
17853,no_enrollment,Masters
2821,no_enrollment,Masters
3134,no_enrollment,Graduate
5557,no_enrollment,Graduate


In [6]:
imputer = SimpleImputer(strategy="most_frequent")

In [7]:
data_new = imputer.fit_transform(data)

In [8]:
data_new = pd.DataFrame(data_new, columns=data.columns)

In [9]:
data_new.isnull().mean()

enrolled_university    0.0
education_level        0.0
dtype: float64

**2. Replacing by word missing**

In [10]:
data_new_2 = data

In [11]:
data_new_2.isnull().mean()*100

enrolled_university    2.014824
education_level        2.401086
dtype: float64

In [12]:
data_new_2["enrolled_university"] = data["enrolled_university"].fillna("Missing")
data_new_2["education_level"] = data["education_level"].fillna("Missing")

In [13]:
data.isnull().sum()

enrolled_university    0
education_level        0
dtype: int64

In [14]:
data_new_3 = data

In [15]:
data_new_3.isnull().sum()

enrolled_university    0
education_level        0
dtype: int64

In [16]:
# using sklearn
imputer = SimpleImputer(strategy="constant", fill_value="Missing")

In [17]:
data_new_3 = imputer.fit_transform(data_new_3)

In [18]:
data_new_3 = pd.DataFrame(data=data_new_3, columns=data.columns)

In [19]:
data_new_3.isnull().sum()

enrolled_university    0
education_level        0
dtype: int64