# Cleaning Data

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Toy records for the missing-data walkthrough. Absent entries are encoded
# three ways: np.nan, None, and the placeholder strings 'NA' / 'Missing'.
# Ages are stored as strings, not numbers.
# NOTE(review): the name `dict` shadows the builtin; it is kept because the
# DataFrame construction in the next cell refers to it.
dict = {
    'first': [
        'Corey', 'Jane', 'John', 'Chris',
        np.nan, None, 'NA',
    ],
    'last': [
        'Schafer', 'Doe', 'Doe', 'Schafer',
        np.nan, np.nan, 'Missing',
    ],
    'email': [
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    'age': [
        '33', '55', '63', '36',
        None, None, 'Missing',
    ],
}

In [4]:
# Build the working frame: one column per key of the mapping defined above.
df = pd.DataFrame(data=dict)

In [5]:
# Display the freshly built frame so the different kinds of missing entries can be inspected.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [9]:
# Keep only the rows that have no NaN/None in any column.
# axis='index' and how='any' are dropna's defaults, written out here for clarity.
# Placeholder strings such as 'Missing' are ordinary values, so they do not
# cause a row to be dropped.
df2 = df.dropna(axis='index', how='any')
df2

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [10]:
# Discard a row only when every one of its columns is NaN/None;
# rows with at least one real value are kept.
df3 = df.dropna(axis='index', how='all')
df3

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [14]:
# Remove every column that contains at least one NaN/None entry.
df4 = df.dropna(axis='columns', how='any')
print('since all the columns had at least 1 NaN or None value, all have been dropped:')
df4

since all the columns had at least 1 NaN or None value, all have been dropped:


0
1
2
3
4
5
6


In [15]:
# Remove a column only when all of its rows are NaN/None.
df5 = df.dropna(axis='columns', how='all')
df5

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [16]:
# Drop the rows whose 'email' value is NaN/None; other columns are ignored
# when deciding which rows to remove.
# `subset` is passed as a list: this matches the multi-column cells that
# follow and also works on pandas versions that require a list-like subset.
# dropna returns a new frame, so `df` itself is left untouched.
df6 = df.dropna(axis='index', subset=['email'])
df6

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [18]:
# Drop a row when either 'email' or 'first' is NaN/None
# (how='any' applied to the listed columns only).
df7 = df.dropna(
    axis='index',
    how='any',
    subset=["email", "first"],
)
df7

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [19]:
# Drop a row only when both 'email' and 'first' are NaN/None
# (how='all' applied to the listed columns only).
df8 = df.dropna(
    axis='index',
    how='all',
    subset=["email", "first"],
)
df8

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


- #### Converting particular values to NaN

In [20]:
# Swap the placeholder strings "NA" and "Missing" for genuine NaN values.
# replace() returns a new frame, so `df` keeps its original contents.
df9 = df.replace(to_replace=["NA", "Missing"], value=np.nan)
df9

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


- **Now that the former `NA` and `Missing` strings are real NaN values, the `dropna` method will treat them as missing too.**

In [22]:
# Substitute one fixed label for every NaN/None left in the frame.
# inplace=True mutates df9 itself rather than producing a new frame.
df9.fillna(value="MISSING", inplace=True)
df9

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@email.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


- #### Loading dataset with certain values set to NaN

In [23]:
# Extra strings the CSV reader should parse as NaN while loading.
NaN_values = ["Missing", "NA"]
# NOTE(review): this rebinds `df`, replacing the toy frame used in the cells above.
df = pd.read_csv(
    'dataset/survey_results_public.csv',
    na_values=NaN_values,
)