In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample records for the missing-data walkthrough. "Missing" shows up in several
# forms on purpose: np.nan, None, and the placeholder strings 'NA' / 'Missing'.
# Ages are stored as strings.
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [3]:
# Build the working frame: one column per key of the people dict.
df = pd.DataFrame(data=people)

In [4]:
# Display the raw frame; 'NA' and 'Missing' are still ordinary strings at this point.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


# - dropna() method  :-
# --->  by default this method drops the rows that contain a NaN or None (null) value

In [5]:
# With no arguments, dropna() removes every row that has at least one NaN/None.
# Row 6 is kept because 'NA' and 'Missing' are plain strings, not missing values.
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [6]:
# Spelling out dropna()'s default criteria: work row-wise and drop a row if ANY value is missing.
df.dropna(axis='index', how='any')


Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


# If axis is set to 'index' or 0, dropna() drops rows that have null values.
# If axis is set to 'columns' or 1, it drops columns that have null values.
# The how argument decides whether a row or column is removed when ANY of its values is null (how='any') or only when ALL of them are null (how='all').

In [7]:
# how='all' removes only the rows in which every column is missing (row 4 here).
df.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [8]:
# Column-wise with how='all': a column is dropped only if it is entirely missing,
# so nothing is removed from this frame.
df.dropna(axis='columns', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [9]:
# Column-wise with how='any': every column contains at least one NaN/None, so all
# columns are dropped and the result is an empty frame (only the index is left).
df.dropna(axis='columns', how='any')


0
1
2
3
4
5
6


In [10]:
# Only the 'email' column is inspected: a row is kept whenever its email is filled in.
# Row 6 survives because its email is the string 'NA', which is not a real missing value.
# subset is passed as a list because a bare string is only accepted by newer pandas
# releases; the list form works everywhere.
df.dropna(axis='index', how='all', subset=['email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [11]:
# A row is dropped only when BOTH 'last' and 'email' are missing;
# if either of the two columns holds data, the row is kept.
df.dropna(axis='index', how='all', subset=['last', 'email'])


Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [12]:
# df is unchanged: none of the dropna() calls above were assigned back or run in place.
df 

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [13]:
# Turn the placeholder strings 'NA' and 'Missing' into real NaN values so that
# dropna() / isna() / fillna() treat them as missing.
# One replace() call handles both placeholders, and the result is assigned back
# to df instead of mutating the frame with inplace=True.
df = df.replace(['NA', 'Missing'], np.nan)

In [14]:
# Display the frame after the replacement: row 6 no longer shows 'NA' / 'Missing'.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


# The isna() method checks each value for NaN (missing) and returns True/False for every cell

In [15]:
# Boolean mask with the same rows and columns as df: True wherever a value is missing.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


# fillna() is used for replacing NaN values with a fill value of your choice

In [16]:
# Returns a new frame in which every NaN is replaced by the text 'MISSING!!'.
df.fillna(value='MISSING!!')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING!!,36
4,MISSING!!,MISSING!!,MISSING!!,MISSING!!
5,MISSING!!,MISSING!!,Anonymous@email.com,MISSING!!
6,MISSING!!,MISSING!!,MISSING!!,MISSING!!


In [17]:
# Same idea, using the number 0 as the fill value for every NaN.
df.fillna(value=0)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


In [19]:
# Every column, including 'age', is stored with dtype object.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [21]:
# The 'age' column still has dtype object (its values are strings), so mean()
# cannot add them up and raises a TypeError.
# The failure is the point of this cell, so it is caught and printed rather than
# left as an uncaught exception that would stop "Restart & Run All".
try:
    df['age'].mean()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: can only concatenate str (not "int") to str

In [22]:
# Check the type of NaN: it is a float, which matters when choosing the dtype for 'age' below.

type(np.nan)

float

In [24]:
# Convert the 'age' column to a numeric dtype.
#
# Casting to int only works when the column has no NaN/missing values:
#     df['age'] = df['age'].astype(int)
#
# NaN is itself a float, so cast the column to float instead of int.
df['age'] = df['age'].astype(float)

In [25]:
# Confirm the conversion: 'age' is now float64 while the other columns remain object.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [26]:
# Works now that 'age' is numeric; the NaN rows are skipped, so this is (33 + 55 + 63 + 36) / 4.
df['age'].mean() 

46.75