# Intro a Pandas
## Estructuras básicas

Pandas nos provee de dos tipos de clases para manipular datos:
* Series: Arreglos de una dimensión que pueden contener cualquier tipo de datos
* DataFrame: Arreglos de dos dimensiones, manejan datos como si tuviesemos una tabla

In [1]:
# Importamos pandas
import pandas as pd
import numpy as np

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [3]:
dates = pd.date_range("20250101", periods=6)
print(dates)
df = pd.DataFrame(np.random.randn(6,4), index = dates, columns = list("ABCD"))
print(df)

DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04',
               '2025-01-05', '2025-01-06'],
              dtype='datetime64[ns]', freq='D')
                   A         B         C         D
2025-01-01 -0.951191  0.606906 -0.761468  0.286661
2025-01-02  0.634464 -1.443576  0.376403 -1.012387
2025-01-03  0.271688  0.932401  0.089977 -0.509898
2025-01-04  0.416721 -0.382602  0.877150  1.575109
2025-01-05  0.489411 -1.079524  0.814877  1.348473
2025-01-06  0.200562  0.319317 -0.651103 -0.427490


In [4]:
df2 = pd.DataFrame({
    "A": 1.0,
    "B": pd.Timestamp("20250206"),
    "C": pd.Series(1,index = list(range(4)), dtype ="float32"),
    "D": np.array([3]*4, dtype = "int32"),
    "E": pd.Categorical(["test","train","test","traimn"]),
    "F": "foo",
})
print(df2)
print(df2.dtypes)

     A          B    C  D       E    F
0  1.0 2025-02-06  1.0  3    test  foo
1  1.0 2025-02-06  1.0  3   train  foo
2  1.0 2025-02-06  1.0  3    test  foo
3  1.0 2025-02-06  1.0  3  traimn  foo
A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object


## Visualizando los datos

In [5]:
df.head(2)

Unnamed: 0,A,B,C,D
2025-01-01,-0.951191,0.606906,-0.761468,0.286661
2025-01-02,0.634464,-1.443576,0.376403,-1.012387


In [6]:
df.tail(2)

Unnamed: 0,A,B,C,D
2025-01-05,0.489411,-1.079524,0.814877,1.348473
2025-01-06,0.200562,0.319317,-0.651103,-0.42749


In [7]:
df.index

DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04',
               '2025-01-05', '2025-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [9]:
df.to_numpy()

array([[-0.9511914 ,  0.60690604, -0.76146823,  0.28666118],
       [ 0.63446426, -1.44357612,  0.37640291, -1.01238734],
       [ 0.27168793,  0.932401  ,  0.08997729, -0.50989789],
       [ 0.41672074, -0.38260206,  0.87715015,  1.57510877],
       [ 0.4894112 , -1.07952412,  0.81487702,  1.34847261],
       [ 0.20056189,  0.31931665, -0.65110325, -0.42749016]])

In [10]:
df.describe() # Resumen estadístico

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.176942,-0.174513,0.124306,0.210078
std,0.573878,0.954213,0.706293,1.056825
min,-0.951191,-1.443576,-0.761468,-1.012387
25%,0.218343,-0.905294,-0.465833,-0.489296
50%,0.344204,-0.031643,0.23319,-0.070414
75%,0.471239,0.535009,0.705258,1.08302
max,0.634464,0.932401,0.87715,1.575109


In [11]:
df.T # Transpuesta

Unnamed: 0,2025-01-01,2025-01-02,2025-01-03,2025-01-04,2025-01-05,2025-01-06
A,-0.951191,0.634464,0.271688,0.416721,0.489411,0.200562
B,0.606906,-1.443576,0.932401,-0.382602,-1.079524,0.319317
C,-0.761468,0.376403,0.089977,0.87715,0.814877,-0.651103
D,0.286661,-1.012387,-0.509898,1.575109,1.348473,-0.42749


In [12]:
# Ordenar df por un eje
df.sort_index(axis =1, ascending=False)

Unnamed: 0,D,C,B,A
2025-01-01,0.286661,-0.761468,0.606906,-0.951191
2025-01-02,-1.012387,0.376403,-1.443576,0.634464
2025-01-03,-0.509898,0.089977,0.932401,0.271688
2025-01-04,1.575109,0.87715,-0.382602,0.416721
2025-01-05,1.348473,0.814877,-1.079524,0.489411
2025-01-06,-0.42749,-0.651103,0.319317,0.200562


In [13]:
df.sort_values(by="B") # Ordenar valores por 

Unnamed: 0,A,B,C,D
2025-01-02,0.634464,-1.443576,0.376403,-1.012387
2025-01-05,0.489411,-1.079524,0.814877,1.348473
2025-01-04,0.416721,-0.382602,0.87715,1.575109
2025-01-06,0.200562,0.319317,-0.651103,-0.42749
2025-01-01,-0.951191,0.606906,-0.761468,0.286661
2025-01-03,0.271688,0.932401,0.089977,-0.509898


## Seleccionando datos

In [14]:
df["A"]

2025-01-01   -0.951191
2025-01-02    0.634464
2025-01-03    0.271688
2025-01-04    0.416721
2025-01-05    0.489411
2025-01-06    0.200562
Freq: D, Name: A, dtype: float64

In [15]:
df[0:2]

Unnamed: 0,A,B,C,D
2025-01-01,-0.951191,0.606906,-0.761468,0.286661
2025-01-02,0.634464,-1.443576,0.376403,-1.012387


In [16]:
df[2:]

Unnamed: 0,A,B,C,D
2025-01-03,0.271688,0.932401,0.089977,-0.509898
2025-01-04,0.416721,-0.382602,0.87715,1.575109
2025-01-05,0.489411,-1.079524,0.814877,1.348473
2025-01-06,0.200562,0.319317,-0.651103,-0.42749


In [17]:
df["20250103":"20250105"]

Unnamed: 0,A,B,C,D
2025-01-03,0.271688,0.932401,0.089977,-0.509898
2025-01-04,0.416721,-0.382602,0.87715,1.575109
2025-01-05,0.489411,-1.079524,0.814877,1.348473


In [18]:
# .loc nos sirve para seleccionar por etiquetas
df.loc[dates[0]] # Coincide con 2025-01-01

A   -0.951191
B    0.606906
C   -0.761468
D    0.286661
Name: 2025-01-01 00:00:00, dtype: float64

In [19]:
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2025-01-01,-0.951191,0.606906
2025-01-02,0.634464,-1.443576
2025-01-03,0.271688,0.932401
2025-01-04,0.416721,-0.382602
2025-01-05,0.489411,-1.079524
2025-01-06,0.200562,0.319317


In [20]:
df.loc[dates[0],"A"]

-0.95119140290691

In [21]:
# .iloc permite acceso por posición
df.iloc[0]

A   -0.951191
B    0.606906
C   -0.761468
D    0.286661
Name: 2025-01-01 00:00:00, dtype: float64

In [22]:
df.iloc[0,0]

-0.95119140290691

In [23]:
df.iloc[:3,2:]

Unnamed: 0,C,D
2025-01-01,-0.761468,0.286661
2025-01-02,0.376403,-1.012387
2025-01-03,0.089977,-0.509898


## Indexación Booleana

In [24]:
#Seleccionar filas donde df.A sea mayor que 0
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2025-01-02,0.634464,-1.443576,0.376403,-1.012387
2025-01-03,0.271688,0.932401,0.089977,-0.509898
2025-01-04,0.416721,-0.382602,0.87715,1.575109
2025-01-05,0.489411,-1.079524,0.814877,1.348473
2025-01-06,0.200562,0.319317,-0.651103,-0.42749


In [25]:
#Filtro de elementos que sea mayores a 0
df[df>0]

Unnamed: 0,A,B,C,D
2025-01-01,,0.606906,,0.286661
2025-01-02,0.634464,,0.376403,
2025-01-03,0.271688,0.932401,0.089977,
2025-01-04,0.416721,,0.87715,1.575109
2025-01-05,0.489411,,0.814877,1.348473
2025-01-06,0.200562,0.319317,,


In [26]:
#Copiar df
df2 = df.copy()
df2["E"] = ["uno", "uno", "dos", "tres", "cuatro", "tres"]
df2

Unnamed: 0,A,B,C,D,E
2025-01-01,-0.951191,0.606906,-0.761468,0.286661,uno
2025-01-02,0.634464,-1.443576,0.376403,-1.012387,uno
2025-01-03,0.271688,0.932401,0.089977,-0.509898,dos
2025-01-04,0.416721,-0.382602,0.87715,1.575109,tres
2025-01-05,0.489411,-1.079524,0.814877,1.348473,cuatro
2025-01-06,0.200562,0.319317,-0.651103,-0.42749,tres


In [27]:
#Método de filtrado isin()
df2[df2["E"].isin(["dos", "cuatro"])]

Unnamed: 0,A,B,C,D,E
2025-01-03,0.271688,0.932401,0.089977,-0.509898,dos
2025-01-05,0.489411,-1.079524,0.814877,1.348473,cuatro


 ## Establecer

In [28]:
s1 = pd.Series([1,2,3,4,5,6], index = pd.date_range("20250101", periods=6))
df["F"] = s1

In [29]:
#Estableciendo por etiqueta
df.at[dates[0],"A"] = 0

In [30]:
df

Unnamed: 0,A,B,C,D,F
2025-01-01,0.0,0.606906,-0.761468,0.286661,1
2025-01-02,0.634464,-1.443576,0.376403,-1.012387,2
2025-01-03,0.271688,0.932401,0.089977,-0.509898,3
2025-01-04,0.416721,-0.382602,0.87715,1.575109,4
2025-01-05,0.489411,-1.079524,0.814877,1.348473,5
2025-01-06,0.200562,0.319317,-0.651103,-0.42749,6


In [31]:
#Estableciendo por posición
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,F
2025-01-01,0.0,0.0,-0.761468,0.286661,1
2025-01-02,0.634464,-1.443576,0.376403,-1.012387,2
2025-01-03,0.271688,0.932401,0.089977,-0.509898,3
2025-01-04,0.416721,-0.382602,0.87715,1.575109,4
2025-01-05,0.489411,-1.079524,0.814877,1.348473,5
2025-01-06,0.200562,0.319317,-0.651103,-0.42749,6


In [32]:
#Estableciendo valores con numpy
df.loc[:, "D"] = np.array([5]*len(df))
df

Unnamed: 0,A,B,C,D,F
2025-01-01,0.0,0.0,-0.761468,5.0,1
2025-01-02,0.634464,-1.443576,0.376403,5.0,2
2025-01-03,0.271688,0.932401,0.089977,5.0,3
2025-01-04,0.416721,-0.382602,0.87715,5.0,4
2025-01-05,0.489411,-1.079524,0.814877,5.0,5
2025-01-06,0.200562,0.319317,-0.651103,5.0,6


In [33]:
#Establecer donde se cumple la condición
df2 = df.copy()
df[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2025-01-01,0.0,0.0,-0.761468,5.0,1
2025-01-02,0.634464,-1.443576,0.376403,5.0,2
2025-01-03,0.271688,0.932401,0.089977,5.0,3
2025-01-04,0.416721,-0.382602,0.87715,5.0,4
2025-01-05,0.489411,-1.079524,0.814877,5.0,5
2025-01-06,0.200562,0.319317,-0.651103,5.0,6


## Datos faltantes

En el caso de numpy nos encontraremos los NaN

In [34]:
#Reindex nos permite editar/eliminar/agregar el indíce en un eje específico
df1 = df.reindex(index = dates[0:4], columns = list(df.columns) + ["E"])
df1.loc[dates[0]:dates[1], "E"] = 1 #Rellenamos E en 0 y 1 con 1
df1

Unnamed: 0,A,B,C,D,F,E
2025-01-01,0.0,0.0,-0.761468,-5.0,-1,1.0
2025-01-02,-0.634464,-1.443576,-0.376403,-5.0,-2,1.0
2025-01-03,-0.271688,-0.932401,-0.089977,-5.0,-3,
2025-01-04,-0.416721,-0.382602,-0.87715,-5.0,-4,


In [35]:
#Retirar cualquier fila donde hay datos faltantes
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,F,E
2025-01-01,0.0,0.0,-0.761468,-5.0,-1,1.0
2025-01-02,-0.634464,-1.443576,-0.376403,-5.0,-2,1.0


In [36]:
# Rellenar donde hay valores faltantes
df1.fillna(value = 5)

Unnamed: 0,A,B,C,D,F,E
2025-01-01,0.0,0.0,-0.761468,-5.0,-1,1.0
2025-01-02,-0.634464,-1.443576,-0.376403,-5.0,-2,1.0
2025-01-03,-0.271688,-0.932401,-0.089977,-5.0,-3,5.0
2025-01-04,-0.416721,-0.382602,-0.87715,-5.0,-4,5.0


In [37]:
# Generar máscara donde los valores sean NaN
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2025-01-01,False,False,False,False,False,False
2025-01-02,False,False,False,False,False,False
2025-01-03,False,False,False,False,False,True
2025-01-04,False,False,False,False,False,True


## Operaciones

In [38]:
df.mean()

A   -0.335474
B   -0.692903
C   -0.595163
D   -5.000000
F   -3.500000
dtype: float64

In [39]:
df.mean(axis = 1)

2025-01-01   -1.352294
2025-01-02   -1.890889
2025-01-03   -1.858813
2025-01-04   -2.135295
2025-01-05   -2.476762
2025-01-06   -2.434196
Freq: D, dtype: float64

In [40]:
s = pd.Series([1,3,5,np.nan,6,8], index = dates).shift(2)
print(df.sub(s, axis="index")) # Realizamos la sustracción
print(s)

                   A         B         C     D     F
2025-01-01       NaN       NaN       NaN   NaN   NaN
2025-01-02       NaN       NaN       NaN   NaN   NaN
2025-01-03 -1.271688 -1.932401 -1.089977  -6.0  -4.0
2025-01-04 -3.416721 -3.382602 -3.877150  -8.0  -7.0
2025-01-05 -5.489411 -6.079524 -5.814877 -10.0 -10.0
2025-01-06       NaN       NaN       NaN   NaN   NaN
2025-01-01    NaN
2025-01-02    NaN
2025-01-03    1.0
2025-01-04    3.0
2025-01-05    5.0
2025-01-06    NaN
Freq: D, dtype: float64


## Aplicamos funciones definidas por usuario

In [41]:
df.agg(lambda x: np.mean(x) * 5.6) #Aplica la función sobre un eje

A    -1.878656
B    -3.880259
C    -3.332914
D   -28.000000
F   -19.600000
dtype: float64

In [42]:
df.agg(lambda x: np.mean(x) * 5.6, axis = 1)

2025-01-01    -7.572844
2025-01-02   -10.588976
2025-01-03   -10.409354
2025-01-04   -11.957650
2025-01-05   -13.869870
2025-01-06   -13.631500
Freq: D, dtype: float64

In [43]:
df.transform(lambda x: x*101.2) # Aplica la función con la misma forma

Unnamed: 0,A,B,C,D,F
2025-01-01,0.0,0.0,-77.060584,-506.0,-101.2
2025-01-02,-64.207783,-146.089903,-38.091974,-506.0,-202.4
2025-01-03,-27.494819,-94.358981,-9.105702,-506.0,-303.6
2025-01-04,-42.172139,-38.719328,-88.767595,-506.0,-404.8
2025-01-05,-49.528413,-109.247841,-82.465554,-506.0,-506.0
2025-01-06,-20.296864,-32.314845,-65.891649,-506.0,-607.2


## Contar valores específicos

In [44]:
s = pd.Series(np.random.randint(0,7,size=10))
print(s, s.value_counts())

0    6
1    5
2    0
3    5
4    0
5    1
6    6
7    1
8    5
9    0
dtype: int32 5    3
0    3
6    2
1    2
Name: count, dtype: int64


## Unión de DataFrames

In [45]:
# Concatenar
piezas = [df[:2], df[2:4], df[4:]] #Cortamos el dataframe
print(piezas)
pd.concat(piezas) #Concatenar o unir

[                   A         B         C    D  F
2025-01-01  0.000000  0.000000 -0.761468 -5.0 -1
2025-01-02 -0.634464 -1.443576 -0.376403 -5.0 -2,                    A         B         C    D  F
2025-01-03 -0.271688 -0.932401 -0.089977 -5.0 -3
2025-01-04 -0.416721 -0.382602 -0.877150 -5.0 -4,                    A         B         C    D  F
2025-01-05 -0.489411 -1.079524 -0.814877 -5.0 -5
2025-01-06 -0.200562 -0.319317 -0.651103 -5.0 -6]


Unnamed: 0,A,B,C,D,F
2025-01-01,0.0,0.0,-0.761468,-5.0,-1
2025-01-02,-0.634464,-1.443576,-0.376403,-5.0,-2
2025-01-03,-0.271688,-0.932401,-0.089977,-5.0,-3
2025-01-04,-0.416721,-0.382602,-0.87715,-5.0,-4
2025-01-05,-0.489411,-1.079524,-0.814877,-5.0,-5
2025-01-06,-0.200562,-0.319317,-0.651103,-5.0,-6


In [46]:
izq = pd.DataFrame({"key": ["foo", "foo","foo", "foo"], "lval":[1,2,6,7]})
der = pd.DataFrame({"key": ["foo", "foo","foo", "foo"], "rval":[4,5,8,9]})
pd.merge(izq, der, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,1,8
3,foo,1,9
4,foo,2,4
5,foo,2,5
6,foo,2,8
7,foo,2,9
8,foo,6,4
9,foo,6,5


In [47]:
izq = pd.DataFrame({"key": ["foo", "bar","sar", "goo"], "lval":[1,2,6,7]})
der = pd.DataFrame({"key": ["foo", "bar","sar", "goo"], "rval":[4,5,8,9]})
pd.merge(izq, der, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5
2,sar,6,8
3,goo,7,9


In [48]:
izq = pd.DataFrame({"key": ["foo", "bar","sar", "goo"], "lval":[1,2,6,7]})
der = pd.DataFrame({"key": ["foo", "bar","sar", "goo"], "rval":[4,5,8,9]})
df3 = pd.merge(izq, der, on="key")

In [49]:
df3.groupby("key")[["lval"]].sum() # Agrupamiento por etiquetas

Unnamed: 0_level_0,lval
key,Unnamed: 1_level_1
bar,2
foo,1
goo,7
sar,6


In [50]:
df4 = df3[:3]
df4

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5
2,sar,6,8


In [51]:
stacked = df4.stack(future_stack=True)
stacked

0  key     foo
   lval      1
   rval      4
1  key     bar
   lval      2
   rval      5
2  key     sar
   lval      6
   rval      8
dtype: object

## Series de tiempo

In [52]:
rng = pd.date_range("1/1/2025", periods=100, freq="s")
ts = pd.Series(np.random.randint(0,500,len(rng)), index=rng)
ts.resample("5Min").sum()

2025-01-01    23966
Freq: 5min, dtype: int32

In [53]:
ts

2025-01-01 00:00:00    211
2025-01-01 00:00:01     98
2025-01-01 00:00:02     68
2025-01-01 00:00:03    120
2025-01-01 00:00:04    433
                      ... 
2025-01-01 00:01:35    197
2025-01-01 00:01:36    453
2025-01-01 00:01:37    390
2025-01-01 00:01:38    408
2025-01-01 00:01:39     22
Freq: s, Length: 100, dtype: int32

In [54]:
rng = pd.date_range("1/1/2025 00:00", periods=50, freq="D")
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts

2025-01-01    0.112502
2025-01-02   -0.307773
2025-01-03   -0.778046
2025-01-04    0.586082
2025-01-05   -1.191671
2025-01-06   -0.233312
2025-01-07    0.469025
2025-01-08    1.489502
2025-01-09    0.610390
2025-01-10   -0.726589
2025-01-11    0.292178
2025-01-12   -0.217682
2025-01-13    0.262434
2025-01-14    0.429306
2025-01-15    0.089714
2025-01-16   -0.426547
2025-01-17    1.443737
2025-01-18   -1.694599
2025-01-19    0.868797
2025-01-20   -0.282676
2025-01-21   -0.948079
2025-01-22   -0.092846
2025-01-23   -0.334034
2025-01-24   -1.266125
2025-01-25    0.980255
2025-01-26   -0.832952
2025-01-27    0.545766
2025-01-28    0.125765
2025-01-29   -0.480791
2025-01-30   -0.971413
2025-01-31    1.524220
2025-02-01    2.053462
2025-02-02    0.661636
2025-02-03    0.817378
2025-02-04   -0.609004
2025-02-05   -1.835525
2025-02-06   -0.020633
2025-02-07   -1.742487
2025-02-08   -1.066580
2025-02-09   -0.088348
2025-02-10   -2.175701
2025-02-11    0.440765
2025-02-12   -0.972465
2025-02-13 

In [55]:
ts_utc = ts.tz_localize("Europe/Warsaw")
ts_utc

2025-01-01 00:00:00+01:00    0.112502
2025-01-02 00:00:00+01:00   -0.307773
2025-01-03 00:00:00+01:00   -0.778046
2025-01-04 00:00:00+01:00    0.586082
2025-01-05 00:00:00+01:00   -1.191671
2025-01-06 00:00:00+01:00   -0.233312
2025-01-07 00:00:00+01:00    0.469025
2025-01-08 00:00:00+01:00    1.489502
2025-01-09 00:00:00+01:00    0.610390
2025-01-10 00:00:00+01:00   -0.726589
2025-01-11 00:00:00+01:00    0.292178
2025-01-12 00:00:00+01:00   -0.217682
2025-01-13 00:00:00+01:00    0.262434
2025-01-14 00:00:00+01:00    0.429306
2025-01-15 00:00:00+01:00    0.089714
2025-01-16 00:00:00+01:00   -0.426547
2025-01-17 00:00:00+01:00    1.443737
2025-01-18 00:00:00+01:00   -1.694599
2025-01-19 00:00:00+01:00    0.868797
2025-01-20 00:00:00+01:00   -0.282676
2025-01-21 00:00:00+01:00   -0.948079
2025-01-22 00:00:00+01:00   -0.092846
2025-01-23 00:00:00+01:00   -0.334034
2025-01-24 00:00:00+01:00   -1.266125
2025-01-25 00:00:00+01:00    0.980255
2025-01-26 00:00:00+01:00   -0.832952
2025-01-27 0

In [56]:
ts_utc = ts.tz_localize("US/Eastern")
ts_utc

2025-01-01 00:00:00-05:00    0.112502
2025-01-02 00:00:00-05:00   -0.307773
2025-01-03 00:00:00-05:00   -0.778046
2025-01-04 00:00:00-05:00    0.586082
2025-01-05 00:00:00-05:00   -1.191671
2025-01-06 00:00:00-05:00   -0.233312
2025-01-07 00:00:00-05:00    0.469025
2025-01-08 00:00:00-05:00    1.489502
2025-01-09 00:00:00-05:00    0.610390
2025-01-10 00:00:00-05:00   -0.726589
2025-01-11 00:00:00-05:00    0.292178
2025-01-12 00:00:00-05:00   -0.217682
2025-01-13 00:00:00-05:00    0.262434
2025-01-14 00:00:00-05:00    0.429306
2025-01-15 00:00:00-05:00    0.089714
2025-01-16 00:00:00-05:00   -0.426547
2025-01-17 00:00:00-05:00    1.443737
2025-01-18 00:00:00-05:00   -1.694599
2025-01-19 00:00:00-05:00    0.868797
2025-01-20 00:00:00-05:00   -0.282676
2025-01-21 00:00:00-05:00   -0.948079
2025-01-22 00:00:00-05:00   -0.092846
2025-01-23 00:00:00-05:00   -0.334034
2025-01-24 00:00:00-05:00   -1.266125
2025-01-25 00:00:00-05:00    0.980255
2025-01-26 00:00:00-05:00   -0.832952
2025-01-27 0

In [57]:
rng + pd.offsets.BusinessDay(5)

DatetimeIndex(['2025-01-08', '2025-01-09', '2025-01-10', '2025-01-10',
               '2025-01-10', '2025-01-13', '2025-01-14', '2025-01-15',
               '2025-01-16', '2025-01-17', '2025-01-17', '2025-01-17',
               '2025-01-20', '2025-01-21', '2025-01-22', '2025-01-23',
               '2025-01-24', '2025-01-24', '2025-01-24', '2025-01-27',
               '2025-01-28', '2025-01-29', '2025-01-30', '2025-01-31',
               '2025-01-31', '2025-01-31', '2025-02-03', '2025-02-04',
               '2025-02-05', '2025-02-06', '2025-02-07', '2025-02-07',
               '2025-02-07', '2025-02-10', '2025-02-11', '2025-02-12',
               '2025-02-13', '2025-02-14', '2025-02-14', '2025-02-14',
               '2025-02-17', '2025-02-18', '2025-02-19', '2025-02-20',
               '2025-02-21', '2025-02-21', '2025-02-21', '2025-02-24',
               '2025-02-25', '2025-02-26'],
              dtype='datetime64[ns]', freq=None)

## Categóricos

In [58]:
df = pd.DataFrame({"id": [1,2,3,4,5,6], "calificacion_crudo": ["a","b","f","b","b","a"]})
df

Unnamed: 0,id,calificacion_crudo
0,1,a
1,2,b
2,3,f
3,4,b
4,5,b
5,6,a


In [59]:
df["calificacion"] = df["calificacion_crudo"].astype("category")
df["calificacion"]

0    a
1    b
2    f
3    b
4    b
5    a
Name: calificacion, dtype: category
Categories (3, object): ['a', 'b', 'f']

In [60]:
nuevas_categorias = ["muy bueno", "bueno", "muy malo"]
df["calificacion"] = df["calificacion"].cat.rename_categories(nuevas_categorias)
df

Unnamed: 0,id,calificacion_crudo,calificacion
0,1,a,muy bueno
1,2,b,bueno
2,3,f,muy malo
3,4,b,bueno
4,5,b,bueno
5,6,a,muy bueno


In [61]:
df.sort_values(by="calificacion")

Unnamed: 0,id,calificacion_crudo,calificacion
0,1,a,muy bueno
5,6,a,muy bueno
1,2,b,bueno
3,4,b,bueno
4,5,b,bueno
2,3,f,muy malo


In [62]:
df.groupby("calificacion", observed=False).size()

calificacion
muy bueno    2
bueno        3
muy malo     1
dtype: int64

## CSV

In [63]:
df.to_csv("califs.csv")

In [64]:
pd.read_csv("califs.csv")

Unnamed: 0.1,Unnamed: 0,id,calificacion_crudo,calificacion
0,0,1,a,muy bueno
1,1,2,b,bueno
2,2,3,f,muy malo
3,3,4,b,bueno
4,4,5,b,bueno
5,5,6,a,muy bueno
