# Cleaning Data - Casting Datatypes and Handling Missing Values

In [2]:
import numpy as np
import pandas as pd

In [5]:
# Sample records stored column-wise: one list per column, aligned by position.
people = dict(
    first=["Abhishek", "Virat", "Manit"],
    last=["Dhawan", "Kohli", "Sharma"],
    email=[
        "abhidhawan09@gmail.com",
        "virat.kohli@gmail.com",
        "manitsharma1990@gmail.com",
    ],
)

In [6]:
# Build the demo frame: each key of `people` becomes a column.
df = pd.DataFrame(data=people)
print(df)

      first    last                      email
0  Abhishek  Dhawan     abhidhawan09@gmail.com
1     Virat   Kohli      virat.kohli@gmail.com
2     Manit  Sharma  manitsharma1990@gmail.com


# Use dropna() to drop rows/columns that contain missing values (NaN)

- `axis=0` (or `axis="index"`) is the default and drops rows that contain missing values.

- `axis=1` (or `axis="columns"`) drops columns that contain missing values.

- The `how` parameter takes `"any"` (the default: drop a row/column if it contains at least one NaN) or `"all"` (drop it only if every value in it is NaN).

- `subset` takes a list of column labels; only those columns are checked for NaN when deciding which rows to drop.

- If `subset` contains only one column, `how="any"` and `how="all"` give the same result, so the choice of `how` makes no difference.

In [7]:
# Drop every row that has at least one NaN (the dropna defaults, spelled out),
# then renumber the remaining rows from 0 and discard the old index.
df.dropna(axis=0, how="any").reset_index(drop=True)

Unnamed: 0,first,last,email
0,Abhishek,Dhawan,abhidhawan09@gmail.com
1,Virat,Kohli,virat.kohli@gmail.com
2,Manit,Sharma,manitsharma1990@gmail.com


In [8]:
# Drop only the rows in which every column is NaN.
df.dropna(how="all", axis=0)

Unnamed: 0,first,last,email
0,Abhishek,Dhawan,abhidhawan09@gmail.com
1,Virat,Kohli,virat.kohli@gmail.com
2,Manit,Sharma,manitsharma1990@gmail.com


In [9]:
# Restrict the NaN check to the single column "email". With only one column
# in `subset`, how="any" and how="all" select exactly the same rows, so the
# value of `how` makes no difference here.
df.dropna(axis="index", how="all", subset=["email"])

Unnamed: 0,first,last,email
0,Abhishek,Dhawan,abhidhawan09@gmail.com
1,Virat,Kohli,virat.kohli@gmail.com
2,Manit,Sharma,manitsharma1990@gmail.com


In [10]:
# Drop a row only when both "last" and "email" are NaN; "first" is not checked.
df.dropna(subset=["last", "email"], how="all", axis=0)

Unnamed: 0,first,last,email
0,Abhishek,Dhawan,abhidhawan09@gmail.com
1,Virat,Kohli,virat.kohli@gmail.com
2,Manit,Sharma,manitsharma1990@gmail.com


In [11]:
# Show df itself; none of the dropna() results above were assigned back to it.
df

Unnamed: 0,first,last,email
0,Abhishek,Dhawan,abhidhawan09@gmail.com
1,Virat,Kohli,virat.kohli@gmail.com
2,Manit,Sharma,manitsharma1990@gmail.com


In [12]:
# Turn the placeholder strings "NA" and "Missing" into real missing values so
# that dropna()/isna()/fillna() recognise them. `np.nan` is the spelling to
# use: the `np.NaN` alias does not exist in NumPy 2.0 and later.
# The result is assigned back instead of using inplace=True, which keeps the
# cell re-runnable and the call chainable.
df = df.replace(["NA", "Missing"], np.nan)
df

Unnamed: 0,first,last,email
0,Abhishek,Dhawan,abhidhawan09@gmail.com
1,Virat,Kohli,virat.kohli@gmail.com
2,Manit,Sharma,manitsharma1990@gmail.com


In [13]:
# np.nan is a plain Python float, so an integer column that receives a NaN is
# upcast to a float dtype. (`np.nan` rather than `np.NaN`: the capitalised
# alias does not exist in NumPy 2.0 and later.)
type(np.nan)

float

In [14]:
# Drop the rows whose "email" is NaN; the other columns are not checked.
df.dropna(subset=["email"], axis=0)

Unnamed: 0,first,last,email
0,Abhishek,Dhawan,abhidhawan09@gmail.com
1,Virat,Kohli,virat.kohli@gmail.com
2,Manit,Sharma,manitsharma1990@gmail.com


In [15]:
# After the placeholder replacement: drop a row only if "last" and "email"
# are both NaN.
df.dropna(subset=["last", "email"], how="all", axis=0)

Unnamed: 0,first,last,email
0,Abhishek,Dhawan,abhidhawan09@gmail.com
1,Virat,Kohli,virat.kohli@gmail.com
2,Manit,Sharma,manitsharma1990@gmail.com


# Use isna() to check where the NaN values are present


In [16]:
# Boolean mask with the same rows and columns as df: True where a value is missing.
df.isna()

Unnamed: 0,first,last,email
0,False,False,False
1,False,False,False
2,False,False,False


# Use fillna() to fill missing values with a given value, Series, or dict

In [17]:
# Return a new frame with every NaN replaced by the placeholder string;
# df itself is left unchanged because the result is not assigned.
df.fillna(value="MISSING")

Unnamed: 0,first,last,email
0,Abhishek,Dhawan,abhidhawan09@gmail.com
1,Virat,Kohli,virat.kohli@gmail.com
2,Manit,Sharma,manitsharma1990@gmail.com


In [19]:
# fillna() also accepts a dict mapping column name -> fill value, so only the
# listed columns are filled. The keys must name columns that exist in df: a
# key such as "age", which df does not have, matches nothing and the call
# silently does nothing.
# The result is assigned back instead of using inplace=True so the cell stays
# re-runnable and chainable.
df = df.fillna({"email": "MISSING"})
print(df)

      first    last                      email
0  Abhishek  Dhawan     abhidhawan09@gmail.com
1     Virat   Kohli      virat.kohli@gmail.com
2     Manit  Sharma  manitsharma1990@gmail.com
