In [1]:
import pandas as pd
import numpy as np

### 1. Working with missing data

In [2]:
# Build a 4x5 demo frame holding 0..19, then blank out a few cells with NaN.
df = pd.DataFrame(
    np.arange(20).reshape(4, 5),
    columns=["one", "two", "three", "four", "five"],
)
# NaN cannot be stored in an integer column. Cast the columns that will
# receive NaN to float explicitly instead of relying on implicit upcasting
# during assignment (deprecated since pandas 2.1).
df = df.astype({"two": "float64", "three": "float64", "four": "float64"})
df.iloc[1, 1:3] = np.nan    # row 1: columns "two" and "three"
df.iloc[1:3, 2:4] = np.nan  # rows 1-2: columns "three" and "four"
print(df)

   one   two  three  four  five
0    0   1.0    2.0   3.0     4
1    5   NaN    NaN   NaN     9
2   10  11.0    NaN   NaN    14
3   15  16.0   17.0  18.0    19


#### Identifying missing data

In [3]:
# Column totals; with skipna=False a single NaN makes the whole column total NaN.
df.sum(axis="index", skipna=False)

one      30.0
two       NaN
three     NaN
four      NaN
five     46.0
dtype: float64

In [4]:
# Count the missing entries in each column: True in the mask counts as 1.
nan_mask = df.isna()
nan_mask.sum(axis="index")

one      0
two      1
three    2
four     2
five     0
dtype: int64

In [5]:
# Per column: does it contain at least one missing value?
df.isna().any(axis="index")

one      False
two       True
three     True
four      True
five     False
dtype: bool

In [6]:
# Per column: is every value present (i.e. no NaN at all)?
df.notna().all(axis="index")

one       True
two      False
three    False
four     False
five      True
dtype: bool

#### Filling missing data

In [7]:
# fillna returns a new frame, so `df` keeps its NaNs for the cells that follow
# and nothing has to be re-injected afterwards.
df_zero_filled = df.fillna(0)   # numeric placeholder
df.fillna('missing')            # string placeholder (displayed below)

Unnamed: 0,one,two,three,four,five
0,0,1,2,3,4
1,5,missing,missing,missing,9
2,10,11,missing,missing,14
3,15,16,17,18,19


Fill gaps forward or backward

In [8]:
# Forward fill: propagate the last valid value down each column.
# (fillna(method='pad') is deprecated; ffill() is the supported spelling.)
df.ffill()

Unnamed: 0,one,two,three,four,five
0,0,1.0,2.0,3.0,4
1,5,1.0,2.0,3.0,9
2,10,11.0,2.0,3.0,14
3,15,16.0,17.0,18.0,19


Limit the amount of filling

In [9]:
# Backward fill, but fill at most one consecutive NaN per gap.
# (fillna(method='bfill') is deprecated; bfill() is the supported spelling.)
df.bfill(limit=1)

Unnamed: 0,one,two,three,four,five
0,0,1.0,2.0,3.0,4
1,5,11.0,,,9
2,10,11.0,17.0,18.0,14
3,15,16.0,17.0,18.0,19


Filling with a PandasObject

In [10]:
# Fill each column's gaps with that column's own mean (a Series aligned on column labels).
column_means = df.mean()
df.fillna(value=column_means)

Unnamed: 0,one,two,three,four,five
0,0,1.0,2.0,3.0,4
1,5,9.333333,9.5,10.5,9
2,10,11.0,9.5,10.5,14
3,15,16.0,17.0,18.0,19


Filling missing data with Interpolation

In [11]:
# Fill gaps down each column using Akima interpolation.
df.interpolate(method="akima", axis=0)

Unnamed: 0,one,two,three,four,five
0,0,1.0,2.0,3.0,4
1,5,6.0,8.666667,9.666667,9
2,10,11.0,13.666667,14.666667,14
3,15,16.0,17.0,18.0,19


In [12]:
# Fill gaps down each column with a polynomial of order 1.
df.interpolate(method="polynomial", order=1, axis=0)

Unnamed: 0,one,two,three,four,five
0,0,1.0,2.0,3.0,4
1,5,6.0,7.0,8.0,9
2,10,11.0,12.0,13.0,14
3,15,16.0,17.0,18.0,19


#### Dropping axis labels with missing data: dropna

In [13]:
# Drop every row that contains a NaN ...
rows_without_nan = df.dropna(axis="index")
print(rows_without_nan)
print("\n")
# ... and, separately, drop every column that contains a NaN.
df.dropna(axis="columns")

   one   two  three  four  five
0    0   1.0    2.0   3.0     4
3   15  16.0   17.0  18.0    19




Unnamed: 0,one,five
0,0,4
1,5,9
2,10,14
3,15,19


### 2. Dropping duplicate data

In [14]:
# Fresh 4x5 frame of 0..19; repeat the value 1 in column "two" (rows 0 and 3).
values = np.arange(20).reshape(4, 5)
df = pd.DataFrame(values, columns=["one", "two", "three", "four", "five"])
df.iloc[3, 1] = 1
print(df)
# No two rows are identical across all columns, so this returns every row.
df.drop_duplicates()

   one  two  three  four  five
0    0    1      2     3     4
1    5    6      7     8     9
2   10   11     12    13    14
3   15    1     17    18    19


Unnamed: 0,one,two,three,four,five
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,1,17,18,19


In [15]:
# Treat rows as duplicates when they share a value in "two"; the first
# occurrence is kept. Rebind the result instead of mutating with inplace=True.
df = df.drop_duplicates(subset=['two'])
print(df)

   one  two  three  four  five
0    0    1      2     3     4
1    5    6      7     8     9
2   10   11     12    13    14
