# Python for Data Analysis
    
    AUTHOR: Dr. Wes McKinney 

### Chapter 7: Data Cleaning and Preparation
### **7.1 Handling Missing Data**

In [1]:
import numpy as np
import pandas as pd

In [2]:
float_data = pd.Series([1.2, -3.5, np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [3]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])
string_data
string_data.isna()
float_data = pd.Series([1, 2, None], dtype='float64')
float_data
float_data.isna()

0    False
1    False
2     True
dtype: bool

In [5]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                     [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [8]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [9]:
data[4] = np.nan
data
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [10]:
df = pd.DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
df
df.dropna()
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,1.206202,,0.859282
3,-0.219137,,0.055068
4,0.072489,0.289896,0.069934
5,2.171594,-1.167594,0.594663
6,-0.6029,-0.995231,-0.295665


In [11]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.601455,0.0,0.0
1,-1.272611,0.0,0.0
2,1.206202,0.0,0.859282
3,-0.219137,0.0,0.055068
4,0.072489,0.289896,0.069934
5,2.171594,-1.167594,0.594663
6,-0.6029,-0.995231,-0.295665


In [12]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.601455,0.5,0.0
1,-1.272611,0.5,0.0
2,1.206202,0.5,0.859282
3,-0.219137,0.5,0.055068
4,0.072489,0.289896,0.069934
5,2.171594,-1.167594,0.594663
6,-0.6029,-0.995231,-0.295665


In [13]:
df = pd.DataFrame(np.random.standard_normal((6, 3)))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df
df.fillna(method="ffill")
df.fillna(method="ffill", limit=2)

  df.fillna(method="ffill")
  df.fillna(method="ffill", limit=2)


Unnamed: 0,0,1,2
0,0.619289,0.587265,-0.249435
1,1.172784,0.179423,-0.679014
2,0.765167,0.179423,-1.648644
3,-0.176358,0.179423,-1.601786
4,-1.143539,,-1.601786
5,0.09507,,-1.601786


In [14]:
data = pd.Series([1., np.nan, 3.5, np.nan, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# Removing Duplicates

In [15]:
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"],
                     "k2": [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [16]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [17]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [18]:
data["v1"] = range(7)
data
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [19]:
data.drop_duplicates(["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6
