# Chapter 7 - Data Cleaning and Preparation

## 7.1 Handling Missing Data

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Float Series with a single missing entry (NaN) at position 2.
float_data = pd.Series(data=[1.2, -3.5, np.nan, 0.0])

In [7]:
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [8]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [9]:
# String Series carrying both flavours of missing value: NaN and None.
string_data = pd.Series(data=["aardvark", np.nan, None, "avocado"])

In [10]:
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [12]:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [13]:
# Requesting float64 up front turns the None entry into NaN.
float_data = pd.Series(data=[1, 2, None], dtype="float64")

In [14]:
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [15]:
float_data.isna()

0    False
1    False
2     True
dtype: bool

### Filtering Out Missing Data

In [16]:
# Five-element Series with missing values at positions 1 and 3.
data = pd.Series(data=[1.0, np.nan, 3.5, np.nan, 7.0])

In [18]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [19]:
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [21]:
# 4x3 frame: row 0 is complete, row 2 is entirely missing,
# and rows 1 and 3 are only partially filled.
data = pd.DataFrame(
    data=[
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)

In [22]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [23]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [24]:
# Add a column labelled 4 in which every value is NaN.
data[4] = np.nan

In [25]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [26]:
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [54]:
# 7x3 frame of standard-normal draws; no seed is set, so values differ per run.
df = pd.DataFrame(data=np.random.standard_normal(size=(7, 3)))

In [55]:
# Blank out column 1 in the first four rows (positions 0-3).
df.iloc[:4, 1] = np.nan

In [56]:
# Blank out column 2 in the first two rows (positions 0-1).
df.iloc[:2, 2] = np.nan

In [57]:
df

Unnamed: 0,0,1,2
0,-1.429355,,
1,0.504092,,
2,0.230181,,-0.735074
3,0.125764,,0.303202
4,-0.159793,-0.319491,-0.106808
5,0.981965,-2.217138,1.266485
6,1.342527,0.936903,1.89325


In [58]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.159793,-0.319491,-0.106808
5,0.981965,-2.217138,1.266485
6,1.342527,0.936903,1.89325


In [59]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.230181,,-0.735074
3,0.125764,,0.303202
4,-0.159793,-0.319491,-0.106808
5,0.981965,-2.217138,1.266485
6,1.342527,0.936903,1.89325


### Filling in Missing Data

In [60]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.429355,0.0,0.0
1,0.504092,0.0,0.0
2,0.230181,0.0,-0.735074
3,0.125764,0.0,0.303202
4,-0.159793,-0.319491,-0.106808
5,0.981965,-2.217138,1.266485
6,1.342527,0.936903,1.89325


In [61]:
df.fillna({1: 0.5, 2:0})

Unnamed: 0,0,1,2
0,-1.429355,0.5,0.0
1,0.504092,0.5,0.0
2,0.230181,0.5,-0.735074
3,0.125764,0.5,0.303202
4,-0.159793,-0.319491,-0.106808
5,0.981965,-2.217138,1.266485
6,1.342527,0.936903,1.89325


In [63]:
# 6x3 frame of standard-normal draws; no seed is set, so values differ per run.
df = pd.DataFrame(data=np.random.standard_normal(size=(6, 3)))

In [64]:
# Column 1 becomes NaN from row position 2 onward.
df.iloc[2:, 1] = np.nan

In [65]:
# Column 2 becomes NaN from row position 4 onward.
df.iloc[4:, 2] = np.nan

In [66]:
df

Unnamed: 0,0,1,2
0,-0.978231,-0.814446,0.152642
1,0.187687,0.185405,-0.96643
2,-0.642443,,0.55091
3,-1.98425,,0.03905
4,-0.45397,,
5,-0.772555,,


In [67]:
# Forward-fill: each NaN takes the last non-missing value above it in its column.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling,
# which emits a FutureWarning on recent pandas releases.
df.ffill()


Unnamed: 0,0,1,2
0,-0.978231,-0.814446,0.152642
1,0.187687,0.185405,-0.96643
2,-0.642443,0.185405,0.55091
3,-1.98425,0.185405,0.03905
4,-0.45397,0.185405,0.03905
5,-0.772555,0.185405,0.03905


In [68]:
# Forward-fill, but propagate each value across at most two consecutive NaNs.
# DataFrame.ffill(limit=...) replaces the deprecated fillna(method="ffill", ...)
# spelling, which emits a FutureWarning on recent pandas releases.
df.ffill(limit=2)


Unnamed: 0,0,1,2
0,-0.978231,-0.814446,0.152642
1,0.187687,0.185405,-0.96643
2,-0.642443,0.185405,0.55091
3,-1.98425,0.185405,0.03905
4,-0.45397,,0.03905
5,-0.772555,,0.03905


In [69]:
# Series with two gaps, used below to demonstrate filling with the mean.
data = pd.Series(data=[1.0, np.nan, 3.5, np.nan, 7.0])

In [70]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 7.2 Data Transformation

### Removing Duplicates

In [71]:
# Seven rows; the last row repeats the (k1, k2) pair of the row before it.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)

In [72]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [73]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [74]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [75]:
# Add a v1 column holding the integers 0 through 6.
data["v1"] = range(7)

In [76]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [78]:
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [79]:
# Deduplicate on the (k1, k2) pair, keeping the final occurrence of each pair.
data.drop_duplicates(subset=["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping