## Pandas (Numpy) vs Pandas 2.0 (Pyarrow) 🐼🐍


In [2]:
#Librerias
import pandas as pd 
import numpy as np 
import pyarrow as pa
import time
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO
from numpy import dtype

In [3]:
#Instalamos nueva version de pandas
#pip install --upgrade --pre pandas==2.0.0rc0
pd.__version__


'2.0.0rc0'

In [7]:
#Descargamos datos

url = "https://www.inegi.org.mx/contenidos/programas/enoe/15ymas/microdatos/enoe_n_2022_trim4_csv.zip"
resp_enoe = urlopen(url)
ZipFile(BytesIO(resp_enoe.read())).extractall()

### Pandas(numpy): read_csv

In [4]:
#Pandas read_csv 
t0 = time.time()
df_enoe1 = pd.read_csv('ENOEN_SDEMT422.csv', 
                      encoding= 'latin-1',
                      usecols= ['sex','ingocup','emp_ppal'] )

t1 = time.time()

performance_pd1= t1-t0
performance_pd1

  df_enoe1 = pd.read_csv('ENOEN_SDEMT422.csv',


2.3867928981781006

In [9]:
#Pandas read_csv (pyarrow)
pd.options.mode.dtype_backend = 'pyarrow'

t0 = time.time()
df_enoe2 = pd.read_csv('ENOEN_SDEMT422.csv', 
                      encoding= 'latin-1',
                      usecols= ['sex','ingocup','emp_ppal'],
                      engine='pyarrow')

t1 = time.time()

performance_pd2= t1-t0
performance_pd2

0.33231568336486816

In [10]:
abs((performance_pd2/performance_pd1) -1 )*100

-86.07689491541

## Transformación de variable, filtro, agrupamiento y cálculo de media de ingresos.

In [11]:
#Ingreso laboral promedio de mujeres y hombres
t0 = time.time()
df_enoe1['sexo'] = df_enoe1['sex'].map({1:'Hombre', 2:'Mujer', 
                                   '1':'Hombre', '2':'Mujer'})
df_enoe1\
    .query('ingocup > 0 & sexo.notna()' )\
    .groupby('sexo')['ingocup']\
    .mean()
t1 = time.time()
performance_pd_group1= t1-t0
performance_pd_group1


0.5472359657287598

In [12]:
#Ingreso laboral promedio de mujeres y hombres (arrow)
t0 = time.time()
df_enoe2['sexo'] = df_enoe2['sex'].map({1:'Hombre', 2:'Mujer', 
                                   '1':'Hombre', '2':'Mujer'})
df_enoe2\
    .query('ingocup > 0 & sexo.notna()' )\
    .groupby('sexo')['ingocup']\
    .mean()
t1 = time.time()
performance_pd_group2= t1-t0
performance_pd_group2


0.07199859619140625

In [13]:
#Comparación 
abs(((performance_pd_group2/performance_pd_group1) -1 )*100)

86.8432265603148

## Creación de dataframes y definición de tipo de variable de forma "manual"

In [14]:
#Ingresos float64
df_enoe_numpy = pd.DataFrame({
    'Ingreso' :  pd.Series(df_enoe1['ingocup'], dtype="float64")})
print(df_enoe_numpy.head(10))

   Ingreso
0      0.0
1      0.0
2  16000.0
3      0.0
4      0.0
5      0.0
6      0.0
7      0.0
8      0.0
9      0.0


In [15]:
#Ingresos float64[pyarrow]
df_enoe_arrow= pd.DataFrame({
    'Ingreso' :  pd.Series(df_enoe1['ingocup'], dtype ='float64[pyarrow]')})
print(df_enoe_arrow.head(10))

   Ingreso
0      0.0
1      0.0
2  16000.0
3      0.0
4      0.0
5      0.0
6      0.0
7      0.0
8      0.0
9      0.0


In [19]:
#Media numpy
t0 = time.time()
df_enoe_numpy.Ingreso.mean()
t1 = time.time()
performance_mean1 = t1 - t0
print("{:.10f}".format(performance_mean1))

0.0079810619


In [23]:
#Media  arrow
t0 = time.time()
df_enoe_arrow.Ingreso.mean()
t1 = time.time()
performance_mean2 = t1 - t0
print("{:.10f}".format(performance_mean2))


0.0080070496


In [24]:
#Comparación 
abs(((performance_mean2/performance_mean1) - 1 )*100)

0.32561613144137525

In [None]:
#Tamaño de dataframes en megabytes
# Sumar los tamaños de todas las columnas
total_size = df_enoe_numpy.memory_usage(deep = True).sum()
# Convertir a megabytes
total_size_mb = total_size / 1024 / 1024

print(f"El tamaño del dataframe es de {total_size_mb:.2f} MB")