## Environment

In [3]:
# Numerical and tabular libraries used by every section of the notebook.
import numpy as np
import pandas as pd
# Keep the row-display limit that was in effect before overriding it,
# then cap how many rows pandas prints for a frame/series at 20.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
# Seed NumPy's global random state so the random draws below start from a fixed state.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default size for matplotlib figures.
plt.rc('figure', figsize=(10, 6))
# Print NumPy arrays with 4 decimals and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

### Rename axis indexes

In [8]:
# A 3x4 frame holding the integers 0..11, with string labels on both axes.
data = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['ant', 'bee', 'cat'],
)

In [9]:
# Display the frame as built, before any relabelling.
data

Unnamed: 0,one,two,three,four
ant,0,1,2,3
bee,4,5,6,7
cat,8,9,10,11


In [11]:
def transform(x):
    """Return the first four characters of the label ``x``, upper-cased."""
    return x[:4].upper()


# Preview only: Index.map returns a new Index; data.index is not modified here.
data.index.map(transform)

Index(['ANT', 'BEE', 'CAT'], dtype='object')

In [13]:
# Make the relabelling permanent: compute the mapped labels, then assign them
# to the frame's index attribute (this modifies `data` itself).
upper_labels = data.index.map(transform)
data.index = upper_labels
data

Unnamed: 0,one,two,three,four
ANT,0,1,2,3
BEE,4,5,6,7
CAT,8,9,10,11


In [14]:
# rename() returns a relabelled copy and leaves `data` unchanged:
# title-case the row labels and upper-case the column names.
data.rename(
    columns=str.upper,
    index=str.title,
)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ant,0,1,2,3
Bee,4,5,6,7
Cat,8,9,10,11


In [18]:
# A dict renames only the labels it lists; all other labels are kept as they are.
data.rename(
    columns={'three': 'two-n-a-half'},
    index={'ANT': 'Ant-eater'},
)

Unnamed: 0,one,two,two-n-a-half,four
Ant-eater,0,1,2,3
BEE,4,5,6,7
CAT,8,9,10,11


In [19]:
# Relabel the 'ANT' row and keep the result by rebinding `data` to the frame
# that rename() returns, rather than mutating the existing object with
# inplace=True (which hides state changes and prevents method chaining).
data = data.rename(index={'ANT': 'Antelope'})
data

Unnamed: 0,one,two,three,four
Antelope,0,1,2,3
BEE,4,5,6,7
CAT,8,9,10,11


### Discretization and Binning

In [37]:
# Twelve hand-picked ages used in the binning examples.
# (A random alternative: ages = np.random.randint(20, 35, size=12).)
ages = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]
ages

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [39]:
# Bin edges; with right=True (the default) each interval is open on the left
# and closed on the right, e.g. (18, 25].
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins=bins, right=True)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [40]:
# Integer position of each value's bin within cats.categories.
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [41]:
# The distinct bins (intervals) themselves.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [42]:
# Frequency counts per bin. The top-level pd.value_counts() is deprecated
# (pandas 2.1+), so wrap the categorical in a Series and use the method.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [43]:
# right=False flips the closed side: intervals become closed on the left
# and open on the right, e.g. [18, 26).
pd.cut(ages, bins=[18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [44]:
# One label per bin, in bin order; the labels replace the interval notation.
group_names = [
    'Youth',
    'YoungAdult',
    'MiddleAged',
    'Senior',
]
pd.cut(ages, bins=bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [46]:
# Twenty draws from the uniform distribution on [0, 1).
# np.random.rand is a convenience wrapper around random_sample.
data = np.random.random_sample(size=20)
data

array([0.8374, 0.3832, 0.2988, 0.0063, 0.4376, 0.7379, 0.3758, 0.4932,
       0.014 , 0.2494, 0.3471, 0.7967, 0.9384, 0.1005, 0.7354, 0.9764,
       0.7059, 0.9518, 0.9278, 0.41  ])

In [48]:
# An integer `bins` asks for that many equal-width bins over the data's range;
# precision=2 limits the decimal precision used for the bin labels.
pd.cut(data, bins=4, precision=2)

[(0.73, 0.98], (0.25, 0.49], (0.25, 0.49], (0.0053, 0.25], (0.25, 0.49], ..., (0.73, 0.98], (0.49, 0.73], (0.73, 0.98], (0.73, 0.98], (0.25, 0.49]]
Length: 20
Categories (4, interval[float64]): [(0.0053, 0.25] < (0.25, 0.49] < (0.49, 0.73] < (0.73, 0.98]]

In [52]:
# Draw 1000 standard-normal samples and bucket them into quartiles: qcut takes
# its bin edges from the sample quantiles, so the four bins are equally populated.
data = np.random.randn(1000)
cats = pd.qcut(data, 4)
# Count the members of each bin. The top-level pd.value_counts() is deprecated
# (pandas 2.1+); wrap the categorical in a Series and call the method instead.
pd.Series(cats).value_counts()

(0.663, 3.389]       250
(-0.0374, 0.663]     250
(-0.673, -0.0374]    250
(-3.665, -0.673]     250
dtype: int64

In [57]:
# Custom quantile edges (fractions between 0 and 1, inclusive) instead of a bin count.
newcats = pd.qcut(data, q=[0, 0.2, 0.5, 0.8, 1.])
newcats

[(-0.863, -0.0374], (-0.863, -0.0374], (-3.665, -0.863], (0.83, 3.389], (0.83, 3.389], ..., (0.83, 3.389], (-0.863, -0.0374], (-0.0374, 0.83], (-3.665, -0.863], (-0.0374, 0.83]]
Length: 1000
Categories (4, interval[float64]): [(-3.665, -0.863] < (-0.863, -0.0374] < (-0.0374, 0.83] < (0.83, 3.389]]

In [58]:
# Bin sizes for the custom quantiles. The top-level pd.value_counts() is
# deprecated (pandas 2.1+), so use the Series method instead.
pd.Series(newcats).value_counts()

(-0.0374, 0.83]      300
(-0.863, -0.0374]    300
(0.83, 3.389]        200
(-3.665, -0.863]     200
dtype: int64

### Identify and work with outliers

In [60]:
# 1000 rows x 4 columns of standard-normal draws
# (np.random.randn is a convenience wrapper around standard_normal).
data = pd.DataFrame(np.random.standard_normal(size=(1000, 4)))
# Per-column summary statistics.
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.060144,-0.009583,0.004146,-0.020177
std,1.025283,0.991398,1.010832,1.009819
min,-3.06743,-3.423739,-2.944923,-3.094371
25%,-0.756485,-0.671014,-0.674305,-0.688851
50%,-0.056539,0.032223,-0.015343,-0.02683
75%,0.651537,0.66197,0.673732,0.67298
max,3.571767,3.424722,3.893606,4.104784


In [62]:
# Take the column labelled 2 and keep the entries whose absolute value
# exceeds 3 (large negative values count too).
col = data[2]
col[col.abs() > 3]

447    3.354485
583    3.893606
Name: 2, dtype: float64

In [65]:
# Keep every row in which at least one column has an absolute value above 3.
# `axis` is passed by keyword: passing it positionally is deprecated in
# pandas 1.5 and raises a TypeError in pandas 2.x.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
222,0.42982,-0.247168,-1.145995,4.104784
380,-3.06743,0.043376,0.709777,-1.326205
412,-0.008728,-3.423739,1.061722,-0.398055
447,-1.975929,1.117683,3.354485,-1.824912
460,0.337453,-1.199839,-0.140934,3.216015
561,3.571767,-0.080974,-0.362215,-1.887861
571,1.186184,3.162137,-1.811221,-0.295279
583,-0.477607,0.101242,3.893606,1.048426
629,0.111325,-0.379214,0.862023,-3.094371
977,-3.019376,-0.534652,1.155369,1.047623


In [73]:
# Cap values to the interval [-3, 3]: wherever |value| > 3, overwrite it with
# 3 carrying the value's original sign. This modifies `data` in place.
data[data.abs() > 3] = 3 * np.sign(data)
data

Unnamed: 0,0,1,2,3
0,0.405399,2.695794,0.564636,1.593455
1,0.885846,0.324446,0.606096,0.915896
2,-1.030575,-1.402759,-0.910587,-0.956956
3,0.270255,-2.908266,0.460448,-2.941183
4,1.430784,0.694300,-0.236944,-0.588769
5,0.675679,-0.896593,-0.928476,-0.582420
6,-0.465101,-0.484181,-2.008272,1.902356
7,-0.075422,-1.194952,-0.955007,0.748794
8,0.434854,-1.295906,-0.060320,0.763275
9,-1.033090,0.277089,-0.866665,0.682160


In [74]:
# np.sign yields -1.0 / 0.0 / 1.0 per element; show the first five rows only.
np.sign(data).head(n=5)

Unnamed: 0,0,1,2,3
0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0
2,-1.0,-1.0,-1.0,-1.0
3,1.0,-1.0,1.0,-1.0
4,1.0,1.0,-1.0,-1.0


### Permutation and random sampling from the data

In [80]:
# The 20 consecutive integers 0..19 laid out as a 5-row, 4-column frame.
df = pd.DataFrame(np.arange(20).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [86]:
# A random ordering of the integers 0..4, used below as row positions.
sampler=np.random.permutation(5) #random arrangements of rows
sampler

array([4, 1, 0, 3, 2])

In [87]:
# take() selects rows by integer position, in the order given by `sampler`.
df.take(sampler) #arrange rows according to sampler

Unnamed: 0,0,1,2,3
4,16,17,18,19
1,4,5,6,7
0,0,1,2,3
3,12,13,14,15
2,8,9,10,11


In [89]:
df.sample(n=3) # draw 3 rows at random, without replacement (not the first three rows)

Unnamed: 0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
0,0,1,2,3


In [93]:
# Another example: randomly choose values with replacement
choices=pd.Series([5,7,-1,6,4]) #some random values
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [94]:
# replace=True allows the same element to be drawn more than once, which is
# what makes n=10 draws possible from a 5-element Series.
draws=choices.sample(n=10, replace=True)
draws

0    5
1    7
0    5
4    4
4    4
3    6
4    4
4    4
4    4
4    4
dtype: int64