## Data Cleaning and Pandas

See Chapter 7 of "Python for Data Analysis" by Wes McKinney

In [1]:
import pandas as pd
import numpy as np

### Detecting and dealing with missing data

python uses both null and NaN

in Pandas,  the convention is to use NaN to indicate missing data

### Comment on dropping Na values

There are times when it seems to okay to simply drop Nas,   if a variable has over 50% of the variables missing,  it may be simply useless

But, in general,  we really want to know why data is missing before we remove it.

If there are patterns or correlations in the missing data, that means something.

Some ML tools can use Na values in the model,  such as GBMLight,   so they attempt to include the Na values as data input

In [2]:
stuff=['cat','dog','hamster',np.nan, 'aardvark']
stuff2=[1,2,None,3,4]

first_df=pd.DataFrame(list(zip(stuff,stuff2)),columns=['animal','flag'])

first_df.head()

Unnamed: 0,animal,flag
0,cat,1.0
1,dog,2.0
2,hamster,
3,,3.0
4,aardvark,4.0


In [3]:
first_df.isnull()

Unnamed: 0,animal,flag
0,False,False
1,False,False
2,False,True
3,True,False
4,False,False


In [4]:
#Question-  what is happeing here with NaN and null?  How do the detection functions isnull and isna seem to work

In [5]:
# True if the value is null or NA, False if it is not

0    False
1    False
2    False
3     True
4    False
Name: animal, dtype: bool

In [6]:
first_df.flag.isna()

0    False
1    False
2     True
3    False
4    False
Name: flag, dtype: bool

In [7]:
first_df.animal.notnull()

0     True
1     True
2     True
3    False
4     True
Name: animal, dtype: bool

In [8]:
# Automatic removal of rows or columns with NA

In [43]:
from numpy import nan as NA

# note the aliasing of nan as NA,   this may be easy to read

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],[NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [44]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [45]:
data[4]=NA
data
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


# look up pandas dropna,    what other options are there for how=?,    what is axis=1 doing?   What is the default axis?

In [12]:
df = pd.DataFrame(np.random.randn(7, 3))

df.iloc[:4, 1] = NA

df.iloc[:2, 2] = NA

df

Unnamed: 0,0,1,2
0,0.938131,,
1,1.234206,,
2,0.082886,,0.972992
3,0.352073,,0.476227
4,-0.862708,-1.27888,0.626028
5,-0.476691,-2.298271,-0.057307
6,0.164009,1.674048,-1.153429


In [13]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.082886,,0.972992
3,0.352073,,0.476227
4,-0.862708,-1.27888,0.626028
5,-0.476691,-2.298271,-0.057307
6,0.164009,1.674048,-1.153429


# Filling in missing data

imputing or filling in constants

In [14]:
df.fillna(0)


Unnamed: 0,0,1,2
0,0.938131,0.0,0.0
1,1.234206,0.0,0.0
2,0.082886,0.0,0.972992
3,0.352073,0.0,0.476227
4,-0.862708,-1.27888,0.626028
5,-0.476691,-2.298271,-0.057307
6,0.164009,1.674048,-1.153429


In [15]:
df.fillna(data.mean())

Unnamed: 0,0,1,2
0,0.938131,6.5,3.0
1,1.234206,6.5,3.0
2,0.082886,6.5,0.972992
3,0.352073,6.5,0.476227
4,-0.862708,-1.27888,0.626028
5,-0.476691,-2.298271,-0.057307
6,0.164009,1.674048,-1.153429


In [16]:
df.fillna({1:0.5,2.0:0})

Unnamed: 0,0,1,2
0,0.938131,0.5,0.0
1,1.234206,0.5,0.0
2,0.082886,0.5,0.972992
3,0.352073,0.5,0.476227
4,-0.862708,-1.27888,0.626028
5,-0.476691,-2.298271,-0.057307
6,0.164009,1.674048,-1.153429


In [17]:
df.fillna({1:data[1].mean(),2.0:data[2].mean()})

Unnamed: 0,0,1,2
0,0.938131,6.5,3.0
1,1.234206,6.5,3.0
2,0.082886,6.5,0.972992
3,0.352073,6.5,0.476227
4,-0.862708,-1.27888,0.626028
5,-0.476691,-2.298271,-0.057307
6,0.164009,1.674048,-1.153429


# # Question
Explain what the previous two cells are doing
Create a cell below that replaces Na with the median


In [None]:
df.fillna({1:0.5,2.0:0})
# this cell is replacing the columns 1 null  with .5 and column 2 null with 0

df.fillna({1:data[1].mean(),2.0:data[2].mean()})
# this cell is replacing the columns 1 and 2 with the means of there columns


In [48]:
df.fillna(data.median())

Unnamed: 0,0,1,2
0,0.938131,6.5,3.0
1,1.234206,6.5,3.0
2,0.082886,6.5,0.972992
3,0.352073,6.5,0.476227
4,-0.862708,-1.27888,0.626028
5,-0.476691,-2.298271,-0.057307
6,0.164009,1.674048,-1.153429


## dropping duplicate rows


In [18]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], 'k2': [1, 1, 2, 3, 3, 4, 4]})

In [19]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [20]:
np.sum(data.duplicated())


1

In [21]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


## Replacing values and Sentinels

In many older systems, there are "sentinel" or "marker" values that indicate missing data,   common values used for this are 9999, 99999, -9999, "?", etc.  

In pf.read_csv(),  and probably many other read() operations, we can specify the values interpretted as Na

na_values:  scalar, str, list-like, or dict, optional

Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘<NA>’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘n/a’, ‘nan’, ‘null’.


In [22]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])

data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [23]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [24]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

# Altering Axis indexes



In [51]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),index=['Ohio', 'Colorado', 'New York'],columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [52]:
#Applying a transformation to indices

def trans(x):
    return( x[0:4].upper() )

data.index=data.index.map(trans)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [53]:
data.index

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [54]:
 # This is a function I've been trying to find for a while, handy
# Notice that extra space in the index 'NEW '

data.rename(index={'NEW ': 'NY'},columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
OHIO,0,1,2,3
COLO,4,5,6,7
NY,8,9,10,11


In [29]:
data.rename(index={'NEW ': 'NY'})

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NY,8,9,10,11


In [30]:
## Discretization and Binning of continuous data

In [31]:


ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

#convert to categorical ranges

bins = [18, 25, 35, 60, 100]

cats = pd.cut(ages, bins)

cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [32]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [33]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [34]:
ages

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [35]:
pd.value_counts(cats)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

# when you create a categorizer,  where are the categories names? Where are the values for each specimen?

In [36]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

# is this set up to indicate an ordered category?

## Dividing data into equally spaced quintiles

qcut produces a desired number of quintile categories

In [37]:
data = np.random.randn(1000)  # Normally distributed

cats = pd.qcut(data, 4)  # Cut into quartiles

cats

[(-0.739, -0.058], (-0.058, 0.586], (-0.058, 0.586], (-0.739, -0.058], (0.586, 2.599], ..., (-0.058, 0.586], (0.586, 2.599], (-3.533, -0.739], (-0.058, 0.586], (-0.739, -0.058]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.533, -0.739] < (-0.739, -0.058] < (-0.058, 0.586] <
                                           (0.586, 2.599]]

In [38]:
pd.value_counts(cats)

(-3.533, -0.739]    250
(-0.739, -0.058]    250
(-0.058, 0.586]     250
(0.586, 2.599]      250
dtype: int64

## Detecting and Filtering Outliers


In [39]:
data = pd.DataFrame(np.random.randn(1000, 4))

data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.014997,0.060281,-0.059198,-0.001795
std,0.98919,1.015809,1.004955,0.985244
min,-3.083088,-3.029529,-2.928654,-3.300762
25%,-0.668784,-0.606776,-0.725134,-0.700622
50%,0.023843,0.045761,-0.060395,-0.030292
75%,0.732336,0.69186,0.605283,0.626219
max,2.995047,3.117042,2.836237,3.374281


In [40]:
# Detecting values with abs value over 3 from column 2

col = data[2]

col[np.abs(col) > 3]

Series([], Name: 2, dtype: float64)

In [41]:
data[(np.abs(data) > 3).any(1)]

  data[(np.abs(data) > 3).any(1)]


Unnamed: 0,0,1,2,3
361,-0.630508,-3.029529,0.574805,-1.100342
514,-0.658345,3.117042,0.643774,-1.200935
665,1.777432,-0.402732,-1.082652,3.374281
792,0.410274,3.04804,0.619944,0.406561
795,-0.089779,3.082354,1.840024,-0.78961
858,-3.083088,-0.040378,0.250507,1.384396
904,0.022528,-0.458991,1.075416,-3.300762
