## Data Wrangling: Clean, Transform, Merge, Reshape

In [1]:
import pandas as pd

## Combining and merging data sets

### Database-style DataFrame merges

In [18]:
df1 = pd.DataFrame({'data1': range(7),'key':list('bbacaab')})
df2 = pd.DataFrame({'data2': range(20,23),'key':list('abd')})

In [19]:
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [20]:
df2

Unnamed: 0,data2,key
0,20,a
1,21,b
2,22,d


In [21]:
df1.merge(df2)

Unnamed: 0,data1,key,data2
0,0,b,21
1,1,b,21
2,6,b,21
3,2,a,20
4,4,a,20
5,5,a,20


In [6]:
df3 = pd.DataFrame({'data1': range(7),'lkey':list('bbacaab')})
df4 = pd.DataFrame({'data2': range(3),'rkey':list('abd')})

In [7]:
df3.merge(df4, left_on='lkey',right_on='rkey')

Unnamed: 0,data1,lkey,data2,rkey
0,0,b,1,b
1,1,b,1,b
2,6,b,1,b
3,2,a,0,a
4,4,a,0,a
5,5,a,0,a


In [8]:
df1.merge(df2,how='outer')

Unnamed: 0,data1,key,data2
0,0.0,b,1.0
1,1.0,b,1.0
2,6.0,b,1.0
3,2.0,a,0.0
4,4.0,a,0.0
5,5.0,a,0.0
6,3.0,c,
7,,d,2.0


In [9]:
df1.merge(df2,how='left')

Unnamed: 0,data1,key,data2
0,0,b,1.0
1,1,b,1.0
2,2,a,0.0
3,3,c,
4,4,a,0.0
5,5,a,0.0
6,6,b,1.0


In [10]:
df1['X'] = 2

In [11]:
df1

Unnamed: 0,data1,key,X
0,0,b,2
1,1,b,2
2,2,a,2
3,3,c,2
4,4,a,2
5,5,a,2
6,6,b,2


In [12]:
df2['X'] = 42

In [13]:
df1.merge(df2, on = 'key',suffixes = ['_left_','_right_'])

Unnamed: 0,data1,key,X_left_,data2,X_right_
0,0,b,2,1,42
1,1,b,2,1,42
2,6,b,2,1,42
3,2,a,2,0,42
4,4,a,2,0,42
5,5,a,2,0,42


### Merging on index

In [14]:
df5 = pd.DataFrame({'g' : range(4),'h': range(8,12)},index = list('abcd'))
df5

Unnamed: 0,g,h
a,0,8
b,1,9
c,2,10
d,3,11


In [15]:
df1.merge(df5, left_on='key',right_index=True)

Unnamed: 0,data1,key,X,g,h
0,0,b,2,1,9
1,1,b,2,1,9
6,6,b,2,1,9
2,2,a,2,0,8
4,4,a,2,0,8
5,5,a,2,0,8
3,3,c,2,2,10


### Concatenating along an axis

In [16]:
pd.concat([df1,df5])

Unnamed: 0,X,data1,g,h,key
0,2.0,0.0,,,b
1,2.0,1.0,,,b
2,2.0,2.0,,,a
3,2.0,3.0,,,c
4,2.0,4.0,,,a
5,2.0,5.0,,,a
6,2.0,6.0,,,b
a,,,0.0,8.0,
b,,,1.0,9.0,
c,,,2.0,10.0,


In [26]:
import numpy as np

a1 = np.arange(0,24).reshape(4,6)
a1

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])

In [27]:
a2 = np.arange(25,37).reshape(4,3)
a2

array([[25, 26, 27],
       [28, 29, 30],
       [31, 32, 33],
       [34, 35, 36]])

In [29]:
a3 = np.concatenate([a1,a2], axis = 1)
a3

array([[ 0,  1,  2,  3,  4,  5, 25, 26, 27],
       [ 6,  7,  8,  9, 10, 11, 28, 29, 30],
       [12, 13, 14, 15, 16, 17, 31, 32, 33],
       [18, 19, 20, 21, 22, 23, 34, 35, 36]])

In [30]:
s1 = pd.Series(range(4), index=list('abcd'))
s2 = pd.Series(range(10,13), index=list('lmn'))
s3 = pd.Series(range(40,43), index=list('xyz'))
s1,s2,s3

(a    0
 b    1
 c    2
 d    3
 dtype: int64, l    10
 m    11
 n    12
 dtype: int64, x    40
 y    41
 z    42
 dtype: int64)

In [31]:
pd.concat([s1,s2,s3])

a     0
b     1
c     2
d     3
l    10
m    11
n    12
x    40
y    41
z    42
dtype: int64

In [35]:
result = pd.concat([s1,s2,s3],axis = 1)
result

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,2.0,,
d,3.0,,
l,,10.0,
m,,11.0,
n,,12.0,
x,,,40.0
y,,,41.0
z,,,42.0


In [34]:
result.columns

RangeIndex(start=0, stop=3, step=1)

In [36]:
result = pd.concat([s1,s2,s3],axis = 1,keys = ['s1','s2','s3'])
result

Unnamed: 0,s1,s2,s3
a,0.0,,
b,1.0,,
c,2.0,,
d,3.0,,
l,,10.0,
m,,11.0,
n,,12.0,
x,,,40.0
y,,,41.0
z,,,42.0


In [42]:
pd.concat([df1,df2], ignore_index=True)

Unnamed: 0,data1,data2,key
0,0.0,,b
1,1.0,,b
2,2.0,,a
3,3.0,,c
4,4.0,,a
5,5.0,,a
6,6.0,,b
7,,20.0,a
8,,21.0,b
9,,22.0,d


In case that we have problems with a module or if I am changing something, I cannot import again the same module, so I would need to reload again the module

In [43]:
import imp
imp.reload(pd)

<module 'pandas' from '/home/dsc/anaconda3/lib/python3.6/site-packages/pandas/__init__.py'>

## Data transformation

### Removing duplicates

In [45]:
df6 = pd.DataFrame({'key1' : 'one' * 3 + 'two' * 2, 
                   'key2' : [1, 1, 2, 3, 3, 4, 4]})
df6

Unnamed: 0,key1,key2
0,oneoneonetwotwo,1
1,oneoneonetwotwo,1
2,oneoneonetwotwo,2
3,oneoneonetwotwo,3
4,oneoneonetwotwo,3
5,oneoneonetwotwo,4
6,oneoneonetwotwo,4


In [46]:
df6.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [47]:
df6.drop_duplicates()

Unnamed: 0,key1,key2
0,oneoneonetwotwo,1
2,oneoneonetwotwo,2
3,oneoneonetwotwo,3
5,oneoneonetwotwo,4


It has considered the first one as the original and the second as the duplicate. We can change that behaviour:

In [49]:
df6.drop_duplicates(keep = 'last')

Unnamed: 0,key1,key2
1,oneoneonetwotwo,1
2,oneoneonetwotwo,2
4,oneoneonetwotwo,3
6,oneoneonetwotwo,4


### Renaming axis indexes

In [52]:
df6.index = list('plfjdmh')
df6

Unnamed: 0,key1,key2
p,oneoneonetwotwo,1
l,oneoneonetwotwo,1
f,oneoneonetwotwo,2
j,oneoneonetwotwo,3
d,oneoneonetwotwo,3
m,oneoneonetwotwo,4
h,oneoneonetwotwo,4


### Discretization and binning

In [53]:
ages = [18,25,22,45,91.67,20,38,38,56]

In [55]:
bins = [18,25,35,65,100]

cuts = pd.cut(ages,bins)
cuts

[NaN, (18, 25], (18, 25], (35, 65], (65, 100], (18, 25], (35, 65], (35, 65], (35, 65]]
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 65] < (65, 100]]

This function gives intervals and binning for the sample in a list

In [56]:
cuts.value_counts()

(18, 25]     3
(25, 35]     0
(35, 65]     4
(65, 100]    1
dtype: int64

## String manipulation

### String object methods

In [58]:
string1 = 'this is some sentence'
string1.split()

['this', 'is', 'some', 'sentence']

### Vectorized string functions in pandas

In [65]:
animals = 'rhino giraffe molerat mantisshrimp cheetah mosquito whale'.split()
animals

['rhino', 'giraffe', 'molerat', 'mantisshrimp', 'cheetah', 'mosquito', 'whale']

In [66]:
list(map(lambda st: st.capitalize(),animals))

['Rhino', 'Giraffe', 'Molerat', 'Mantisshrimp', 'Cheetah', 'Mosquito', 'Whale']

In [67]:
df1['animal'] = animals
df1

Unnamed: 0,data1,key,animal
0,0,b,rhino
1,1,b,giraffe
2,2,a,molerat
3,3,c,mantisshrimp
4,4,a,cheetah
5,5,a,mosquito
6,6,b,whale


In [70]:
animals_series = df1['animal']
animals_series.str

<pandas.core.strings.StringMethods at 0x7f78684861d0>

In [75]:
animals_series.str.upper()

0           RHINO
1         GIRAFFE
2         MOLERAT
3    MANTISSHRIMP
4         CHEETAH
5        MOSQUITO
6           WHALE
Name: animal, dtype: object

In [76]:
animals_series.str.len()

0     5
1     7
2     7
3    12
4     7
5     8
6     5
Name: animal, dtype: int64

In [77]:
df1[animals_series.str.contains('m')]

Unnamed: 0,data1,key,animal
2,2,a,molerat
3,3,c,mantisshrimp
5,5,a,mosquito
