# Merges and transformations of data

## Index

1. Merge
2. Concatenating along an axis
3. Removing duplicates
4. Renaming indexes
5. Vectorized string operations in pandas

## 1. Merge

df.merge nos permite combinar dataframes haciendo los joins típicos de bases de datos.

In [6]:
import pandas as pd
import numpy as np

In [2]:
df1 = pd.DataFrame({'data1': range(5), 'key':list('abcde')})

In [149]:
df1

Unnamed: 0,data1,key
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


In [3]:
df2 = pd.DataFrame({'data2': range(7,10), 'key':list('ade')})
df2

Unnamed: 0,data2,key
0,7,a
1,8,d
2,9,e


Los joins al estilo SQL (sobre columnas) se hacen con `merge`; `join` combina por defecto sobre el índice.

In [151]:
df1.merge(df2)  # No key needed: merge uses the shared 'key' column and defaults to an inner join

Unnamed: 0,data1,key,data2
0,0,a,7
1,3,d,8
2,4,e,9


In [152]:
df5 = df1.merge(df2, how = 'left')  # Left join
df5

Unnamed: 0,data1,key,data2
0,0,a,7.0
1,1,b,
2,2,c,
3,3,d,8.0
4,4,e,9.0


In [153]:
df3 = pd.DataFrame({'data2': range(7,10), 'key':list('ada')})
df3

Unnamed: 0,data2,key
0,7,a
1,8,d
2,9,a


In [154]:
df1.merge(df3)

Unnamed: 0,data1,key,data2
0,0,a,7
1,0,a,9
2,3,d,8


In [155]:
df4 = pd.DataFrame({'rkey': list('ddeebbfff'), 'values':range(4,40,4)})
df4

Unnamed: 0,rkey,values
0,d,4
1,d,8
2,e,12
3,e,16
4,b,20
5,b,24
6,f,28
7,f,32
8,f,36


In [156]:
# When the key column has a different name in each frame

df1.merge(df4, left_on = 'key', right_on = 'rkey', how = 'outer')

Unnamed: 0,data1,key,rkey,values
0,0.0,a,,
1,1.0,b,b,20.0
2,1.0,b,b,24.0
3,2.0,c,,
4,3.0,d,d,4.0
5,3.0,d,d,8.0
6,4.0,e,e,12.0
7,4.0,e,e,16.0
8,,,f,28.0
9,,,f,32.0


Si tengo campos con nombres iguales en ambas tablas, pandas les asignará sufijos (por defecto `_x` y `_y`).

In [65]:
df1['X'] = 42.0
df1

Unnamed: 0,data1,key,X
0,0,a,42.0
1,1,b,42.0
2,2,c,42.0
3,3,d,42.0
4,4,e,42.0


In [48]:
df4['X'] = 37.0

In [50]:
df1.merge(df4, left_on = 'key', right_on = 'rkey', how = 'outer' )

# The clashing 'X' columns get renamed with suffixes; the suffixes are configurable.
# NOTE(review): the recorded output has a data2 column and X_x == 37.0, which does not
# match df1 (data1, X == 42.0) -- it looks like it was produced from df2; re-run to confirm.

Unnamed: 0,data2,key,X_x,rkey,values,X_y
0,7.0,a,37.0,,,
1,8.0,d,37.0,d,4.0,37.0
2,8.0,d,37.0,d,8.0,37.0
3,9.0,e,37.0,e,12.0,37.0
4,9.0,e,37.0,e,16.0,37.0
5,,,,b,20.0,37.0
6,,,,b,24.0,37.0
7,,,,f,28.0,37.0
8,,,,f,32.0,37.0
9,,,,f,36.0,37.0


In [53]:
# Same outer merge, with explicit suffixes for the clashing 'X' columns.
# NOTE(review): the recorded output shows data2 and X_left == 37.0, which does not match
# df1 -- presumably the cell was run against df2; re-run to confirm.
df1.merge(df4, 
          left_on = 'key', 
          right_on = 'rkey',
          how = 'outer', 
          suffixes = ['_left', '_right'] )

Unnamed: 0,data2,key,X_left,rkey,values,X_right
0,7.0,a,37.0,,,
1,8.0,d,37.0,d,4.0,37.0
2,8.0,d,37.0,d,8.0,37.0
3,9.0,e,37.0,e,12.0,37.0
4,9.0,e,37.0,e,16.0,37.0
5,,,,b,20.0,37.0
6,,,,b,24.0,37.0
7,,,,f,28.0,37.0
8,,,,f,32.0,37.0
9,,,,f,36.0,37.0


Si queremos hacer un merge en base al índice:

In [4]:
df1.merge(df2, left_index=True, right_on='data2', how='outer')

Unnamed: 0,data1,key_x,data2,key_y
2,0.0,a,0,
2,1.0,b,1,
2,2.0,c,2,
2,3.0,d,3,
2,4.0,e,4,
0,,,7,a
1,,,8,d
2,,,9,e


## 2. Concatenating along an axis

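By default, `pd.concat` stacks objects end to end (axis = 0). The difference with a numpy array is that pandas aligns on labels: columns are matched by name when stacking rows, and rows are matched by index value when concatenating side to side (axis = 1), generating NaNs where necessary.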

In [66]:
df1

Unnamed: 0,data1,key,X
0,0,a,42.0
1,1,b,42.0
2,2,c,42.0
3,3,d,42.0
4,4,e,42.0


In [64]:
df2

Unnamed: 0,data2,key,X
0,7,a,37.0
1,8,d,37.0
2,9,e,37.0


In [68]:
pd.concat([df1,df2])  # Stacks the rows of both frames (default axis=0)

Unnamed: 0,X,data1,data2,key
0,42.0,0.0,,a
1,42.0,1.0,,b
2,42.0,2.0,,c
3,42.0,3.0,,d
4,42.0,4.0,,e
0,37.0,,7.0,a
1,37.0,,8.0,d
2,37.0,,9.0,e


In [69]:
# This works like concatenation in NumPy (np.concatenate)

import numpy as np

In [71]:
array_1 = np.arange(36).reshape(6,6)
array_1

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]])

In [75]:
array_2 = np.arange(18).reshape(3,6)
array_3 = np.arange(18).reshape(6,3)

In [77]:
np.concatenate([array_1, array_2])

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35],
       [ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17]])

In [79]:
np.concatenate([array_1, array_2], axis = 1)  # Raises an error: the row counts differ (6 vs 3)

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [81]:
np.concatenate([array_1, array_3], axis = 1)

array([[ 0,  1,  2,  3,  4,  5,  0,  1,  2],
       [ 6,  7,  8,  9, 10, 11,  3,  4,  5],
       [12, 13, 14, 15, 16, 17,  6,  7,  8],
       [18, 19, 20, 21, 22, 23,  9, 10, 11],
       [24, 25, 26, 27, 28, 29, 12, 13, 14],
       [30, 31, 32, 33, 34, 35, 15, 16, 17]])

In [83]:
series_1 = df1['data1']

In [84]:
series_2 = df2['data2']

In [86]:
series_3 = pd.Series(range(3,8,2))

Series will concatenate end to end by default

In [88]:
pd.concat([series_1, series_2, series_3])

0    0
1    1
2    2
3    3
4    4
0    7
1    8
2    9
0    3
1    5
2    7
dtype: int64

In [89]:
pd.concat([series_1, series_2, series_3], axis = 1)

Unnamed: 0,data1,data2,0
0,0,7.0,3.0
1,1,8.0,5.0
2,2,9.0,7.0
3,3,,
4,4,,


Con axis = 1 me las alinea en base al valor del índice. Con axis = 0, los DataFrames se alinean en base al nombre de la columna.


In [92]:
# To discard the original column names (the labels along the concatenation axis)

pd.concat([series_1, series_2, series_3], axis = 1, ignore_index = True)

Unnamed: 0,0,1,2
0,0,7.0,3.0
1,1,8.0,5.0
2,2,9.0,7.0
3,3,,
4,4,,


In [96]:
# I can set the column names I want to assign via keys

pd.concat([series_1, series_2, series_3], axis = 1, keys = ['s1', 's2', 's3'])

Unnamed: 0,s1,s2,s3
0,0,7.0,3.0
1,1,8.0,5.0
2,2,9.0,7.0
3,3,,
4,4,,


## 3. Removing duplicates

In [101]:
df5 = pd.DataFrame({'key1':['one'] * 3 + ['two'] * 4, 'key2': [1,1,2,3,3,4,4]})
df5

Unnamed: 0,key1,key2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [103]:
df5.duplicated()  # Flags duplicates; within a duplicated pair only the second row is marked.

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [105]:
df5.drop_duplicates()  # Drops the duplicate rows

Unnamed: 0,key1,key2
0,one,1
2,one,2
3,two,3
5,two,4


Ojo: de cada grupo de registros duplicados conserva uno (por defecto, el primero) y borra el resto.

Para borrar duplicados basándome en un campo:

In [110]:
df5.drop_duplicates(subset = 'key1')  # Based on a single column only

Unnamed: 0,key1,key2
0,one,1
3,two,3


In [111]:
df5.drop_duplicates(keep = 'last')

Unnamed: 0,key1,key2
1,one,1
2,one,2
4,two,3
6,two,4


In [113]:
# Combining subset and keep lets us do interesting things

df5.drop_duplicates(subset = 'key1', keep = 'last')

Unnamed: 0,key1,key2
2,one,2
6,two,4


In [114]:
df5.drop_duplicates(subset = 'key1', keep = 'first')

Unnamed: 0,key1,key2
0,one,1
3,two,3


## 4. Renaming indexes

Los índices son inmutables: no podemos modificar sus elementos uno a uno.

Lo que sí puedo hacer es sustituir el índice completo por otro:


In [117]:
df5.index = list('abcdefg')
df5

Unnamed: 0,key1,key2
a,one,1
b,one,1
c,one,2
d,two,3
e,two,3
f,two,4
g,two,4


## 5. Vectorized string operations in pandas

We can access vectorized string operations through the .str attribute of a string Series, such as a column in a dataframe. These operations mimic the classical string methods, but they operate on each element of the Series. We can also slice on .str.

Con strings sabemos usar muchas cosas...

In [118]:
a_string = 'muchas anios despues frente al peloton de fusilamiento...'

a_string.split()

['muchas',
 'anios',
 'despues',
 'frente',
 'al',
 'peloton',
 'de',
 'fusilamiento...']

In [121]:
a_string.capitalize()

'Muchas anios despues frente al peloton de fusilamiento...'

In [122]:
a_string.upper()

'MUCHAS ANIOS DESPUES FRENTE AL PELOTON DE FUSILAMIENTO...'

In [123]:
a_string[:6]

'muchas'

En una columna de tipo string podemos hacer lo mismo, pero con alguna salvedad.

In [125]:
df5['animals'] = ['giraffes', 'nakedmolerat', 'bear', 'walrus', 'platypus', 'dog', 'cat']
df5

Unnamed: 0,key1,key2,animals
a,one,1,giraffes
b,one,1,nakedmolerat
c,one,2,bear
d,two,3,walrus
e,two,3,platypus
f,two,4,dog
g,two,4,cat


In [127]:
animals = df5['animals']
animals

a        giraffes
b    nakedmolerat
c            bear
d          walrus
e        platypus
f             dog
g             cat
Name: animals, dtype: object

In [128]:
# Note: string columns have dtype object

df5.dtypes

key1       object
key2        int64
animals    object
dtype: object

In [130]:
# If I want to trim the strings, I cannot do this:

animals[3]  

# it is a Series, so this returns the element at position 3 (the fourth one),
# not the first 3 characters of each string.
# NOTE(review): the index holds string labels here, so the integer is treated as a
# position; newer pandas versions deprecate that fallback -- prefer animals.iloc[3].

'walrus'

Para aplicar funciones propias de string a una columna de strings usamos el accesor `.str`:

In [129]:
animals.str.capitalize()

a        Giraffes
b    Nakedmolerat
c            Bear
d          Walrus
e        Platypus
f             Dog
g             Cat
Name: animals, dtype: object

In [132]:
animals.str.len()

a     8
b    12
c     4
d     6
e     8
f     3
g     3
Name: animals, dtype: int64

In [133]:
# Many such string functions are available.

df5[df5['animals'].str.len() == 4]

Unnamed: 0,key1,key2,animals
c,one,2,bear
