# Programación imprescindible para tratamiento de tablas

Este notebook es una recopilación de los comandos más útiles y más usados de pandas para el tratamiento de tablas.

In [1]:
import pandas as pd
import numpy as np
import requests

### Importamos una tabla para trabajar con ella

Descargamos la tabla `honeyproduction` de Kaggle:

https://www.kaggle.com/jessicali9530/honey-production/version/2

Los campos son:

* `numcol`: Number of honey producing colonies. Honey producing colonies are the maximum number of colonies from which honey was taken during the year. It is possible to take honey from colonies which did not survive the entire year
* `yieldpercol`: Honey yield per colony. Unit is pounds
* `totalprod`: Total production (numcol x yieldpercol). Unit is pounds
* `stocks`: Refers to stocks held by producers. Unit is pounds
* `priceperlb`: Refers to average price per pound based on expanded sales. Unit is dollars.
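* `prodvalue`: Value of production (totalprod x priceperlb). Unit is dollars.
* `state`: State the record refers to, as a two-letter abbreviation (e.g. `AL`, `AZ`).
* `year`: Year the record refers to (1998 to 2012 in this table).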

In [2]:
# Load the honey-production CSV into the DataFrame used throughout the notebook.
# The path is relative, so it is resolved against the kernel's working directory.
path = 'honeyproduction.csv'
df = pd.read_csv(path)

## 1. Exploración de la tabla

In [3]:
# Header: preview the first three rows of the table

df.head(3)

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year
0,AL,16000.0,71,1136000.0,159000.0,0.72,818000.0,1998
1,AZ,55000.0,60,3300000.0,1485000.0,0.64,2112000.0,1998
2,AR,53000.0,65,3445000.0,1688000.0,0.59,2033000.0,1998


In [4]:
# Number of records (rows) in the table

len(df)

626

In [5]:
# Column labels, followed by how many columns there are
print(df.columns)
print(len(df.columns))

Index(['state', 'numcol', 'yieldpercol', 'totalprod', 'stocks', 'priceperlb',
       'prodvalue', 'year'],
      dtype='object')
8


In [6]:
# Summary statistics (count, mean, std, quartiles, ...).
# Only numeric columns are included, so 'state' does not appear in the result.

df.describe()

Unnamed: 0,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year
count,626.0,626.0,626.0,626.0,626.0,626.0,626.0
mean,60284.345048,62.009585,4169086.0,1318859.0,1.409569,4715741.0,2004.864217
std,91077.087231,19.458754,6883847.0,2272964.0,0.638599,7976110.0,4.317306
min,2000.0,19.0,84000.0,8000.0,0.49,162000.0,1998.0
25%,9000.0,48.0,475000.0,143000.0,0.9325,759250.0,2001.0
50%,26000.0,60.0,1533000.0,439500.0,1.36,1841500.0,2005.0
75%,63750.0,74.0,4175250.0,1489500.0,1.68,4703250.0,2009.0
max,510000.0,136.0,46410000.0,13800000.0,4.15,69615000.0,2012.0


In [7]:
# Transpose: the original column labels become the row index and each
# original row becomes a column. The result is stored in df2 and previewed.

df2 = df.T
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,616,617,618,619,620,621,622,623,624,625
state,AL,AZ,AR,CA,CO,FL,GA,HI,ID,IL,...,SD,TN,TX,UT,VT,VA,WA,WV,WI,WY
numcol,16000,55000,53000,450000,27000,230000,75000,8000,120000,9000,...,260000,6000,92000,25000,4000,4000,62000,6000,60000,50000
yieldpercol,71,60,65,83,72,98,56,118,50,71,...,63,61,52,38,60,41,41,48,69,51
totalprod,1.136e+06,3.3e+06,3.445e+06,3.735e+07,1.944e+06,2.254e+07,4.2e+06,944000,6e+06,639000,...,1.638e+07,366000,4.784e+06,950000,240000,164000,2.542e+06,288000,4.14e+06,2.55e+06
stocks,159000,1.485e+06,1.688e+06,1.2326e+07,1.594e+06,4.508e+06,307000,66000,2.22e+06,204000,...,3.604e+06,59000,718000,209000,53000,23000,1.017e+06,95000,1.863e+06,459000


## 2. Filtrado y selección

In [8]:
# Access a single column (or only some of its elements).
# df['numcol'] on its own returns the whole column as a Series;
# slicing it with [:5] keeps just the first five values.
df['numcol'][:5]

0     16000.0
1     55000.0
2     53000.0
3    450000.0
4     27000.0
Name: numcol, dtype: float64

In [9]:
# Access several columns at once: pass a list of labels (note the double
# brackets); the result is a DataFrame containing only those columns.

df[['numcol','yieldpercol']].head(3)

Unnamed: 0,numcol,yieldpercol
0,16000.0,71
1,55000.0,60
2,53000.0,65


In [10]:
# Create a new column and initialise it to a chosen value:
# assigning a scalar fills every row with that value.

df['new_col'] = 0

df.head(2)

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,new_col
0,AL,16000.0,71,1136000.0,159000.0,0.72,818000.0,1998,0
1,AZ,55000.0,60,3300000.0,1485000.0,0.64,2112000.0,1998,0


In [17]:
# Filter rows with a condition: the comparison builds a boolean mask and
# indexing df with it keeps only the rows where numcol equals 16000.

df[df['numcol'] == 16000].head(2)

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,new_col
0,AL,16000.0,71,1136000.0,159000.0,0.72,818000.0,1998,0
12,KS,16000.0,46,736000.0,390000.0,0.87,640000.0,1998,0


In [18]:
# Filter on a compound condition: combine masks with & (element-wise AND).
# Each comparison needs its own parentheses because & binds more tightly
# than > or ==. The test on 'state' is a lexicographic string comparison.

df[(df['state'] > 'AL') & (df['totalprod'] > 1000)].head(3)

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,new_col
1,AZ,55000.0,60,3300000.0,1485000.0,0.64,2112000.0,1998,0
2,AR,53000.0,65,3445000.0,1688000.0,0.59,2033000.0,1998,0
3,CA,450000.0,83,37350000.0,12326000.0,0.62,23157000.0,1998,0


In [22]:
# Filter on a compound condition (II): give each boolean mask a name first,
# then combine the named masks; this reads better than one long expression.

state_AL = (df['state'] == 'AL')
big = df['numcol'] > 100000

df[state_AL & big].head(3)

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,new_col
3,AL,450000.0,83,37350000.0,12326000.0,0.62,23157000.0,1998,0
5,AL,230000.0,98,22540000.0,4508000.0,0.64,14426000.0,1998,0
8,AL,120000.0,50,6000000.0,2220000.0,0.65,3900000.0,1998,0


In [12]:
# Select rows by index label with .loc. Unlike a regular Python slice, a
# label slice includes its end point, so 0:2 returns the rows labelled 0, 1 and 2.

df2 = df.loc[0:2]
df2

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,new_col
0,AL,16000.0,71,1136000.0,159000.0,0.72,818000.0,1998,0
1,AZ,55000.0,60,3300000.0,1485000.0,0.64,2112000.0,1998,0
2,AR,53000.0,65,3445000.0,1688000.0,0.59,2033000.0,1998,0


In [13]:
# reset_index() can come in handy after selecting rows: it gives the result a
# fresh 0..n-1 index and keeps the previous index as a regular column 'index'.

df3=df2.reset_index()
df3

Unnamed: 0,index,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,new_col
0,0,AL,16000.0,71,1136000.0,159000.0,0.72,818000.0,1998,0
1,1,AZ,55000.0,60,3300000.0,1485000.0,0.64,2112000.0,1998,0
2,2,AR,53000.0,65,3445000.0,1688000.0,0.59,2033000.0,1998,0


In [14]:
# Select rows by integer position with .iloc (negative positions count from the end)

df.iloc[-3:]  # the last three rows

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,new_col
623,WV,6000.0,48,288000.0,95000.0,2.91,838000.0,2012,0
624,WI,60000.0,69,4140000.0,1863000.0,2.05,8487000.0,2012,0
625,WY,50000.0,51,2550000.0,459000.0,1.87,4769000.0,2012,0


In [15]:
# Drop a column (axis = 1 targets columns). drop() returns a new DataFrame and
# leaves the original df unchanged unless inplace = True is passed.

df2 = df.drop('state', axis = 1)
df2.head(2)

Unnamed: 0,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,new_col
0,16000.0,71,1136000.0,159000.0,0.72,818000.0,1998,0
1,55000.0,60,3300000.0,1485000.0,0.64,2112000.0,1998,0


In [16]:
# Drop a row by its index label (here the row labelled 2); the result is not
# assigned, so df itself keeps all of its rows.
df.drop([2]).head(3)

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,new_col
0,AL,16000.0,71,1136000.0,159000.0,0.72,818000.0,1998,0
1,AZ,55000.0,60,3300000.0,1485000.0,0.64,2112000.0,1998,0
3,CA,450000.0,83,37350000.0,12326000.0,0.62,23157000.0,1998,0


## 3. Aplicación de funciones

In [45]:
# Whole-column operation on one or more columns (apply): the function is called
# once per column, receives that column as a Series, and the per-column results
# are collected into a Series indexed by column label.

df4 = df[['numcol', 'totalprod']]

df4.apply(lambda series: series.mean())

numcol       6.028435e+04
totalprod    4.169086e+06
dtype: float64

In [46]:
# apply() also accepts an ordinary named function instead of a lambda

def suma(series):
    """Return the sum of all the values in ``series``."""
    total = series.sum()
    return total

# Pass the function object itself (no parentheses); apply calls it once per column.
df4.apply(suma)

numcol       3.773800e+07
totalprod    2.609848e+09
dtype: float64

In [48]:
# Element-wise operation over a whole table (applymap): the function is called
# on every individual cell, here squaring each value.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases
# (2.1+) in favour of DataFrame.map -- confirm against the pandas version in use.

df4.applymap(lambda element: element**2).head(2)


Unnamed: 0,numcol,totalprod
0,256000000.0,1290496000000.0
1,3025000000.0,10890000000000.0


In [59]:
# Element-wise operation over a single column (Series.map):
# derive the colony count expressed in thousands and store it as 'numcol_K'.

df['numcol_K'] = df['numcol'].map(lambda colonies: colonies / 1000)
df.head(2)

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,new_col,numcol_K
0,AL,16000.0,71.0,1136000.0,159000.0,0.72,818000.0,1998,0,16.0
1,AL,55000.0,60.0,3300000.0,1485000.0,0.64,2112000.0,1998,0,55.0


In [67]:
# Change a column's type: number to text, by formatting every element as a string.
# Note this overwrites df['yieldpercol'] in place, so from this cell onwards the
# column holds strings rather than numbers.

df['yieldpercol'] = df['yieldpercol'].map(lambda element: '%s' % element)

# Check the Python type of the first element after the conversion
type(df['yieldpercol'][0])

str

## 4. Ordenación

In [70]:
# Sort the columns by label: axis = 1 sorts along the columns instead of the rows

df.sort_index(axis = 1).head(2)

Unnamed: 0,new_col,numcol,numcol_K,priceperlb,prodvalue,state,stocks,totalprod,year,yieldpercol
0,0,16000.0,16.0,0.72,818000.0,AL,159000.0,1136000.0,1998,71.0
1,0,55000.0,55.0,0.64,2112000.0,AL,1485000.0,3300000.0,1998,60.0


In [75]:
# Sort rows by the values of one column, largest first (ascending = False)

df.sort_values(by = 'numcol', ascending = False).head(3)

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,new_col,numcol_K
532,AL,510000.0,91.0,46410000.0,12995000.0,1.5,69615000.0,2010,0,510.0
612,AL,480000.0,69.0,33120000.0,5962000.0,1.92,63590000.0,2012,0,480.0
220,AL,480000.0,67.0,32160000.0,6432000.0,1.39,44702000.0,2003,0,480.0


In [79]:
# Sort by more than one field: rows are ordered by 'numcol' first and ties
# are broken by 'year', both in ascending order.

df.sort_values(by = ['numcol', 'year'], ascending = True).head(3)

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,new_col,numcol_K
233,AL,2000.0,42.0,84000.0,21000.0,1.93,162000.0,2003,0,2.0
13,AL,3000.0,50.0,150000.0,51000.0,1.4,210000.0,1998,0,3.0
56,AL,3000.0,50.0,150000.0,12000.0,1.24,186000.0,1999,0,3.0


## 5. Agrupación

# WORK IN PROGRESS