# Modulo 30:Tratamiento de datos con DataFrames

### Unión de DataFrames

In [1]:
import pandas as pd

In [2]:
dataframe1 = pd.DataFrame({'c1':['1','2','3'],'clave':['a','b','c']})

In [3]:
dataframe1

Unnamed: 0,c1,clave
0,1,a
1,2,b
2,3,c


In [4]:
dataframe2 = pd.DataFrame({'c2':['4','5','6'], 'clave':['c','b','e']})

In [5]:
dataframe2

Unnamed: 0,c2,clave
0,4,c
1,5,b
2,6,e


In [6]:
dataframe3 = pd.DataFrame.merge(dataframe1,dataframe2) #El método merge, une dataframes

#Como la única columna que coincide en ambos dataframes es "Clave", los únirá por esa columna:

In [7]:
dataframe3

Unnamed: 0,c1,clave,c2
0,2,b,5
1,3,c,4


In [8]:
#Sólo une aquellos registros en los que los valores de la columna "Clave" son coincidentes

In [10]:
dataframe4 = pd.DataFrame.merge(dataframe1,dataframe2, on='clave') #Podemos seleccionar la columna por la que hacer la union
dataframe4

Unnamed: 0,c1,clave,c2
0,2,b,5
1,3,c,4


In [11]:
#Pero si queremos que, por ejemplo, prevalezca el dataframe1 y le añada los valores del dataframe2:

In [13]:
dataframe5 = pd.DataFrame.merge(dataframe1,dataframe2,on='clave',how='left')

In [14]:
dataframe5

Unnamed: 0,c1,clave,c2
0,1,a,
1,2,b,5.0
2,3,c,4.0


In [15]:
dataframe6 = pd.DataFrame.merge(dataframe1,dataframe2,on='clave',how='right')
dataframe6

Unnamed: 0,c1,clave,c2
0,3.0,c,4
1,2.0,b,5
2,,e,6


In [16]:
dataframe7 = pd.DataFrame.merge(dataframe1,dataframe2,on='clave',how='outer') #Unión completa, junta todo
dataframe7

Unnamed: 0,c1,clave,c2
0,1.0,a,
1,2.0,b,5.0
2,3.0,c,4.0
3,,e,6.0


### Concatenación de datos

   ##### Arrays

In [17]:
import pandas as pd
import numpy as np

In [18]:
array1 = np.arange(9).reshape(3,3)

In [21]:
np.concatenate([array1,array1]) #Concatenamos el mismo porque solo tenemos uno

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8],
       [0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [22]:
np.concatenate([array1,array1],axis=1) #Para concatenarlo hacia la derecha

array([[0, 1, 2, 0, 1, 2],
       [3, 4, 5, 3, 4, 5],
       [6, 7, 8, 6, 7, 8]])

In [23]:
#OJO! las dimensiones de los array deben coincidir

   ##### Series

In [24]:
serie1 = pd.Series([1,2,3],index=['a','b','c'])

In [25]:
serie2 = pd.Series([4,5,6],index=['d','e','f'])

In [26]:
serie3 = pd.concat([serie1,serie2])

In [27]:
serie3

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [28]:
serie1

a    1
b    2
c    3
dtype: int64

In [29]:
serie2

d    4
e    5
f    6
dtype: int64

In [30]:
serie4 = pd.concat([serie1,serie2],axis=1)

In [31]:
serie4

Unnamed: 0,0,1
a,1.0,
b,2.0,
c,3.0,
d,,4.0
e,,5.0
f,,6.0


In [34]:
serie5 = pd.Series([4,5,6],index=['c','e','f'])

In [35]:
pd.concat([serie1,serie5],axis=1)

Unnamed: 0,0,1
a,1.0,
b,2.0,
c,3.0,4.0
e,,5.0
f,,6.0


In [36]:
serie6 = pd.concat([serie1,serie2],keys=['serie1','serie2'])

In [37]:
serie6

serie1  a    1
        b    2
        c    3
serie2  d    4
        e    5
        f    6
dtype: int64

   ##### DataFrames

In [39]:
dataframe1 = pd.DataFrame(np.random.rand(3,3), columns=['a','b','c'])
dataframe1

Unnamed: 0,a,b,c
0,0.088247,0.581947,0.901061
1,0.94448,0.715128,0.062383
2,0.969457,0.558633,0.979228


In [40]:
dataframe2 = pd.DataFrame(np.random.rand(3,3), columns=['a','b','c'])
dataframe2

Unnamed: 0,a,b,c
0,0.181473,0.463423,0.751472
1,0.640612,0.281877,0.251266
2,0.814603,0.780845,0.463946


In [41]:
dataframe3 = pd.concat([dataframe1,dataframe2])

In [42]:
dataframe3

Unnamed: 0,a,b,c
0,0.088247,0.581947,0.901061
1,0.94448,0.715128,0.062383
2,0.969457,0.558633,0.979228
0,0.181473,0.463423,0.751472
1,0.640612,0.281877,0.251266
2,0.814603,0.780845,0.463946


In [None]:
#Vemos que ha mantenido los índices de los DataFrame originales, lo modificamos:

In [43]:
dataframe3 = pd.concat([dataframe1,dataframe2],ignore_index=True)
dataframe3

Unnamed: 0,a,b,c
0,0.088247,0.581947,0.901061
1,0.94448,0.715128,0.062383
2,0.969457,0.558633,0.979228
3,0.181473,0.463423,0.751472
4,0.640612,0.281877,0.251266
5,0.814603,0.780845,0.463946


### Combinar Series y DataFrames

In [44]:
serie1 = pd.Series([1,2,np.nan])
serie1

0    1.0
1    2.0
2    NaN
dtype: float64

In [45]:
serie2 = pd.Series([4,5,6])

In [46]:
#Combinamos las series 1 y 3
serie3 = serie1.combine_first(serie2)
serie3

0    1.0
1    2.0
2    6.0
dtype: float64

In [47]:
#Así lo que hace es tomar la serie1 pero cambiando los valores nulos por los correspondiente en la 2ª serie

In [48]:
#Repetimos para Dataframes:

In [49]:
dataframe1 = pd.DataFrame([1,2,np.nan])

In [50]:
dataframe1

Unnamed: 0,0
0,1.0
1,2.0
2,


In [51]:
dataframe2 = pd.DataFrame([4,5,6])

In [52]:
dataframe3 =dataframe1.combine_first(dataframe2)

In [53]:
dataframe3

Unnamed: 0,0
0,1.0
1,2.0
2,6.0


### Eliminar duplicados en DataFrames

In [55]:
valores = [[1,2],[1,2],[5,6],[5,8]]

In [56]:
indices = list('mnop')

In [57]:
columnas = ['valor1','valor2']

In [58]:
dataframe = pd.DataFrame(valores, index=indices, columns = columnas)
dataframe

Unnamed: 0,valor1,valor2
m,1,2
n,1,2
o,5,6
p,5,8


In [61]:
#Utilizaremos el metodo Drop_duplicates

In [64]:
dataframe2 = dataframe.drop_duplicates()
dataframe2

Unnamed: 0,valor1,valor2
m,1,2
o,5,6
p,5,8


In [65]:
dataframe2.drop_duplicates('valor1')

Unnamed: 0,valor1,valor2
m,1,2
o,5,6


In [66]:
#Vemos que se queda con la primera fila que ve, si queremos coger el último en vez del primero:

In [67]:
dataframe2.drop_duplicates('valor1',keep='last')

Unnamed: 0,valor1,valor2
m,1,2
p,5,8


### Reemplazar datos en series

In [68]:
serie = pd.Series([1,2,3,4,5],index=list('abcde'))

In [69]:
serie

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [70]:
#Si queremos cambiar el valor "1" por un "8"

In [71]:
serie.replace(1,8)

a    8
b    2
c    3
d    4
e    5
dtype: int64

In [73]:
#También se puede hacer mediante un diccionario (cuando hay más valores)
serie = serie.replace({2:7,3:9})
serie

a    1
b    7
c    9
d    4
e    5
dtype: int64

### Renombrar índices

In [74]:
valores = np.arange(9).reshape(3,3)

In [75]:
indices = list('abc')

In [76]:
dataframe = pd.DataFrame(valores, index=indices)

In [77]:
dataframe

Unnamed: 0,0,1,2
a,0,1,2
b,3,4,5
c,6,7,8


In [78]:
#Queremos cambiar el índice a letras en mayusculas:

In [81]:
nuevos_indices = dataframe.index.map(str.upper)
nuevos_indices

Index(['A', 'B', 'C'], dtype='object')

In [82]:
dataframe.index = nuevos_indices
dataframe

Unnamed: 0,0,1,2
A,0,1,2
B,3,4,5
C,6,7,8


In [83]:
#Otra forma de hacerlo es con la función "rename"

In [84]:
dataframe = dataframe.rename(index=str.lower)

In [85]:
dataframe

Unnamed: 0,0,1,2
a,0,1,2
b,3,4,5
c,6,7,8


In [87]:
dataframe = dataframe.rename(index={'a':'z','b':'w','c':'y'})
dataframe

Unnamed: 0,0,1,2
z,0,1,2
w,3,4,5
y,6,7,8


### Agrupar datos en categorias

In [95]:
precios = [42,55,48,22,5,21,89,32,26]
rango = np.arange(10,101,10)

In [96]:
rango

array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100])

In [97]:
precios_rango = pd.cut(precios,rango)
precios_rango

[(40.0, 50.0], (50.0, 60.0], (40.0, 50.0], (20.0, 30.0], NaN, (20.0, 30.0], (80.0, 90.0], (30.0, 40.0], (20.0, 30.0]]
Categories (9, interval[int64]): [(10, 20] < (20, 30] < (30, 40] < (40, 50] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]

In [98]:
pd.value_counts(precios_rango)

(20, 30]     3
(40, 50]     2
(80, 90]     1
(50, 60]     1
(30, 40]     1
(90, 100]    0
(70, 80]     0
(60, 70]     0
(10, 20]     0
dtype: int64

In [100]:
pd.value_counts(precios_rango).sort_values

<bound method Series.sort_values of (20, 30]     3
(40, 50]     2
(80, 90]     1
(50, 60]     1
(30, 40]     1
(90, 100]    0
(70, 80]     0
(60, 70]     0
(10, 20]     0
dtype: int64>

### Filtrar datos en DataFrames

In [110]:
import math as mt
valores = 100*np.random.rand(10,3)
valores

array([[21.82914971, 45.39913222, 84.86445779],
       [14.90577264, 56.50145294, 77.89761819],
       [19.93843293, 77.00790958, 57.40551849],
       [47.81659651,  9.56174829, 27.95177517],
       [77.07553822, 93.72495283,  1.03555174],
       [88.48682751, 29.98822211, 54.67501689],
       [36.0502357 , 82.54051648, 20.80262728],
       [83.60055317,  3.20172861, 28.88935886],
       [82.55405439, 37.77559689, 95.93799936],
       [22.89553918, 53.77273024,  6.68987202]])

In [113]:
dataframe = pd.DataFrame(valores)
dataframe

Unnamed: 0,0,1,2
0,21.82915,45.399132,84.864458
1,14.905773,56.501453,77.897618
2,19.938433,77.00791,57.405518
3,47.816597,9.561748,27.951775
4,77.075538,93.724953,1.035552
5,88.486828,29.988222,54.675017
6,36.050236,82.540516,20.802627
7,83.600553,3.201729,28.889359
8,82.554054,37.775597,95.937999
9,22.895539,53.77273,6.689872


In [114]:
columna = dataframe[0]

In [115]:
columna

0    21.829150
1    14.905773
2    19.938433
3    47.816597
4    77.075538
5    88.486828
6    36.050236
7    83.600553
8    82.554054
9    22.895539
Name: 0, dtype: float64

In [116]:
columna[columna>40]

3    47.816597
4    77.075538
5    88.486828
7    83.600553
8    82.554054
Name: 0, dtype: float64

In [117]:
dataframe[dataframe[0]>40]

Unnamed: 0,0,1,2
3,47.816597,9.561748,27.951775
4,77.075538,93.724953,1.035552
5,88.486828,29.988222,54.675017
7,83.600553,3.201729,28.889359
8,82.554054,37.775597,95.937999


In [118]:
dataframe[dataframe>40]

Unnamed: 0,0,1,2
0,,45.399132,84.864458
1,,56.501453,77.897618
2,,77.00791,57.405518
3,47.816597,,
4,77.075538,93.724953,
5,88.486828,,54.675017
6,,82.540516,
7,83.600553,,
8,82.554054,,95.937999
9,,53.77273,


### Permutaciones de elementos

In [119]:
valores = np.arange(25).reshape(5,5)
valores

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [120]:
dataframe = pd.DataFrame(valores)

In [121]:
dataframe

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [124]:
combinacion_aleatoria = np.random.permutation(5)  #Genera un array de 5 elementos comenzando en 0 pero ordenados de forma aleatoria
combinacion_aleatoria

array([2, 0, 4, 1, 3])

In [125]:
dataframe.take(combinacion_aleatoria) #Ordena siguien el índice del array que le hemos dado

Unnamed: 0,0,1,2,3,4
2,10,11,12,13,14
0,0,1,2,3,4
4,20,21,22,23,24
1,5,6,7,8,9
3,15,16,17,18,19


### Agrupación en DataFrames

In [126]:
valores = {'clave1':['x','x','y','y','z'],'clave2':['a','b','a','b','a'],
          'datos1':np.random.rand(5),'datos2':np.random.rand(5)}

In [127]:
valores

{'clave1': ['x', 'x', 'y', 'y', 'z'],
 'clave2': ['a', 'b', 'a', 'b', 'a'],
 'datos1': array([0.59785196, 0.39827171, 0.75616954, 0.28249429, 0.77035271]),
 'datos2': array([0.81478282, 0.27777806, 0.17412396, 0.93508016, 0.47580743])}

In [128]:
dataframe = pd.DataFrame(valores)

In [129]:
dataframe

Unnamed: 0,clave1,clave2,datos1,datos2
0,x,a,0.597852,0.814783
1,x,b,0.398272,0.277778
2,y,a,0.75617,0.174124
3,y,b,0.282494,0.93508
4,z,a,0.770353,0.475807


In [130]:
#Imaginemos que queremos agrupar datos1 por la clave1:

In [134]:
grupo1 = dataframe['datos1'].groupby(dataframe['clave1'])

In [135]:
grupo1

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fd23ab3d370>

In [136]:
grupo1.mean()

clave1
x    0.498062
y    0.519332
z    0.770353
Name: datos1, dtype: float64

In [137]:
valores = [[1,2,3],[4,5,6],[7,8,9],[np.nan,np.nan,np.nan]]
valores

[[1, 2, 3], [4, 5, 6], [7, 8, 9], [nan, nan, nan]]

In [138]:
columnas = list('abc')

In [139]:
dataframe = pd.DataFrame(valores, columns = columnas)

In [140]:
dataframe

Unnamed: 0,a,b,c
0,1.0,2.0,3.0
1,4.0,5.0,6.0
2,7.0,8.0,9.0
3,,,


In [141]:
#Vamos a agrupar para obtener un valor:

In [143]:
dataframe.agg(['sum','min','mean']) #Omite la fila de NaN

Unnamed: 0,a,b,c
sum,12.0,15.0,18.0
min,1.0,2.0,3.0
mean,4.0,5.0,6.0


In [144]:
dataframe.agg(['sum','min','mean'],axis=1) #Lo hacemos por el otro eje

Unnamed: 0,sum,min,mean
0,6.0,1.0,2.0
1,15.0,4.0,5.0
2,24.0,7.0,8.0
3,0.0,,
