# Preprocesamiento y preparación de datos: limpieza y formatos

**SI3015 - Fundamentos de Aprendizaje Automático**

Los códigos que se describen a continuación están basados en el documento *semana3_prepLimpiezaDatos.pdf*.

In [None]:
import pandas as pd

In [None]:
# Sample data: one list per column, aligned by position.
# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, 3.286, 9.597, 1.221],
    "population": [200.4, 143.5, 1252, 1357, 52.98],
}

df = pd.DataFrame(data)

df

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,3.286,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,1.221,52.98


**Inspección rápida**

In [None]:
# Show the first 5 rows (5 is the default for head()).
df.head()

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,3.286,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,1.221,52.98


In [None]:
# Show the last 5 rows (5 is the default for tail()).
df.tail()

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,3.286,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,1.221,52.98


In [None]:
# Dimensions as a (rows, columns) tuple.
df.shape

(5, 4)

In [None]:
# Print a summary: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     5 non-null      object 
 1   capital     5 non-null      object 
 2   area        5 non-null      float64
 3   population  5 non-null      float64
dtypes: float64(2), object(2)
memory usage: 292.0+ bytes


In [None]:
# Data types, non-null counts and memory used by the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     5 non-null      object 
 1   capital     5 non-null      object 
 2   area        5 non-null      float64
 3   population  5 non-null      float64
dtypes: float64(2), object(2)
memory usage: 292.0+ bytes


In [None]:
# Data type of each column, returned as a Series indexed by column name.
df.dtypes

Unnamed: 0,0
country,object
capital,object
area,float64
population,float64


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max).
# Only the numeric columns are summarized by default.
df.describe()

Unnamed: 0,area,population
count,5.0,5.0
mean,7.944,601.176
std,6.200557,645.261454
min,1.221,52.98
25%,3.286,143.5
50%,8.516,200.4
75%,9.597,1252.0
max,17.1,1357.0


In [None]:
# Number of distinct values in each column.
df.nunique()

Unnamed: 0,0
country,5
capital,5
area,5
population,5


In [None]:
# Number of distinct values in a single column ("country"); returns one integer.
df["country"].nunique()

5

**Manejo de Valores Faltantes (NaNs)**

In [None]:
from numpy import nan

# Same sample data, now with missing values (NaN) in "area" and "population".
# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

df

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,,


In [None]:
# Count missing values per column (isnull() gives booleans; sum() counts the True entries).
df.isnull().sum()

Unnamed: 0,0
country,0
capital,0
area,2
population,1


In [None]:
# Total number of missing values in the whole DataFrame
# (first sum() is per column, second sum() adds the column totals).
df.isnull().sum().sum()

np.int64(3)

In [None]:
# Percentage of missing values per column: null count / number of rows * 100.
df.isnull().sum() / len(df) * 100

Unnamed: 0,0
country,0.0
capital,0.0
area,40.0
population,20.0


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Drop every row that has AT LEAST one missing value.
# dropna() returns a new DataFrame; `df` itself is left unchanged.
df_limpio = df.dropna()

df_limpio

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
3,China,Beijing,9.597,1357.0


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Drop every COLUMN (axis=1) that has at least one missing value.
df_limpio = df.dropna(axis=1)

df_limpio

Unnamed: 0,country,capital
0,Brazil,Brasilia
1,Russia,Moscow
2,India,New Delhi
3,China,Beijing
4,South Africa,Pretoria


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Keep only the rows that have at least `thresh` non-null values.
# With thresh=2 every row here qualifies (country and capital are always
# present), so no row is dropped.
df_limpio = df.dropna(thresh=2)

df_limpio

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,,


In [None]:
from numpy import nan

# The last row is missing in every column.
# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", nan],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", nan],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Drop only the rows in which ALL values are missing.
df_limpio = df.dropna(how='all')

df_limpio

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,,1252.0
3,China,Beijing,9.597,1357.0


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Replace every missing value in the DataFrame with a constant (0),
# modifying `df` in place.
df.fillna(0, inplace=True)

df

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,0.0,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,0.0,0.0


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Fill missing values with a constant in one specific column only:
# the mapping {column: value} leaves the other columns untouched.
df.fillna({"area": 0}, inplace=True)

df

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,0.0,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,0.0,


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Mean imputation: fill the missing "area" values with the column mean.
# mean() skips NaN, so it is computed from the three known areas only.
df.fillna({"area": df["area"].mean()}, inplace=True)

df

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,11.737667,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,11.737667,


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Backward fill: each missing value takes the NEXT valid value below it.
# The last row stays NaN because no valid value follows it.
df["area"] = df["area"].bfill()

df

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,9.597,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,,


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Forward fill: each missing value takes the PREVIOUS valid value above it.
df["area"] = df["area"].ffill()

df

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,17.1,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,9.597,


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Fill by interpolation (linear by default): an interior gap gets the value
# halfway between its neighbours, (17.10 + 9.597) / 2 = 13.3485.
# The trailing gap has no later neighbour and receives the last valid value.
df["area"] = df["area"].interpolate()

df

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,13.3485,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Pretoria,9.597,


In [None]:
from numpy import nan

# Here the text column "capital" also has a missing value.
# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", nan],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Fill missing text values with a placeholder label.
df['capital'] = df['capital'].fillna('Desconocido')

df



Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,Desconocido,,


**Manipulación de Filas y Columnas**

In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", nan],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Drop specific columns by name (axis=1), modifying `df` in place.
df.drop(['area', 'capital'], axis=1, inplace=True)

df

Unnamed: 0,country,population
0,Brazil,200.4
1,Russia,143.5
2,India,1252.0
3,China,1357.0
4,South Africa,


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", nan],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Drop rows by index label (axis=0). Without inplace=True this returns a new
# DataFrame, which is what gets displayed; `df` itself keeps all five rows.
df.drop([0, 1, 2], axis=0)

Unnamed: 0,country,capital,area,population
3,China,Beijing,9.597,1357.0
4,South Africa,,,


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", nan],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Rename selected columns with an {old_name: new_name} mapping;
# columns not listed keep their names.
df.rename(columns={'capital': 'city'}, inplace=True)

df

Unnamed: 0,country,city,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,,,


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", nan],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Rename ALL columns at once: the list must have one name per existing
# column, given in the current column order.
df.columns = ['pais', 'ciudad', 'region', 'poblacion']

df

Unnamed: 0,pais,ciudad,region,poblacion
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,,1252.0
3,China,Beijing,9.597,1357.0
4,South Africa,,,


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", nan],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Reorder columns by selecting them in the desired order.
df = df[['capital', 'country', 'population', 'area']]

df

Unnamed: 0,capital,country,population,area
0,Brasilia,Brazil,200.4,8.516
1,Moscow,Russia,143.5,17.1
2,New Delhi,India,1252.0,
3,Beijing,China,1357.0,9.597
4,,South Africa,,


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", nan],
    "area": [8.516, 17.10, nan, 9.597, nan],
    "population": [200.4, 143.5, 1252, 1357, nan],
}

df = pd.DataFrame(data)

# Create a new column computed row by row from existing ones.
# A missing operand propagates: rows with NaN in either input get NaN.
df['total'] = df['area'] * df['population']

df

Unnamed: 0,country,capital,area,population,total
0,Brazil,Brasilia,8.516,200.4,1706.6064
1,Russia,Moscow,17.1,143.5,2453.85
2,India,New Delhi,,1252.0,
3,China,Beijing,9.597,1357.0,13023.129
4,South Africa,,,,


**Limpieza de Texto y Duplicados**

In [None]:
from numpy import nan

# Rows 2 and 3 are identical (NaN values in the same positions are treated
# as equal when looking for duplicates).
# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "India", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "New Delhi", nan],
    "area": [8.516, 17.10, nan, nan, nan],
    "population": [200.4, 143.5, 1252, 1252, nan],
}

df = pd.DataFrame(data)

# Boolean Series flagging duplicated rows (the first occurrence is not flagged).
# NOTE: only the last expression of the cell is displayed, so this result is
# computed but not shown.
df.duplicated()

# Total number of duplicated rows.
df.duplicated().sum()


np.int64(1)

In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, 3.286, 9597, 9597],
    "population": [200.4, 143.5, 1252, 1357, 1357],
}

df = pd.DataFrame(data)

# Look for duplicates considering only a subset of the columns:
# the last row repeats the (area, population) pair of the row above it.
df.duplicated(subset=['area', 'population'])

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,True


In [None]:
from numpy import nan

# The first country and capital carry a leading space on purpose.
# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": [" Brazil", "Russia", "India", "China", "South Africa"],
    "capital": [" Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, 3.286, 9597, 9597],
    "population": [200.4, 143.5, 1252, 1357, 1357],
}

df = pd.DataFrame(data)

# Normalize text: lowercase and strip leading/trailing whitespace.
# Only "country" is cleaned here; "capital" keeps its leading space.
df['country'] = df['country'].str.lower().str.strip()

df



Unnamed: 0,country,capital,area,population
0,brazil,Brasilia,8.516,200.4
1,russia,Moscow,17.1,143.5
2,india,New Delhi,3.286,1252.0
3,china,Beijing,9597.0,1357.0
4,south africa,Pretoria,9597.0,1357.0


In [None]:
from numpy import nan

# "Brazzil" is a deliberate typo to be corrected below.
# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazzil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, 3.286, 9597, 9597],
    "population": [200.4, 143.5, 1252, 1357, 1357],
}

df = pd.DataFrame(data)

# Replace a text value: cells equal to 'Brazzil' become 'Brazil'.
df['country'] = df['country'].replace('Brazzil', 'Brazil')

df

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,3.286,1252.0
3,China,Beijing,9597.0,1357.0
4,South Africa,Pretoria,9597.0,1357.0


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazil", "Russia", "India", "India", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "New Delhi", nan],
    "area": [8.516, 17.10, nan, nan, nan],
    "population": [200.4, 143.5, 1252, 1252, nan],
}

df = pd.DataFrame(data)

# Remove fully duplicated rows in place, keeping the first occurrence.
# The original index labels are preserved, so label 3 disappears.
df.drop_duplicates(inplace=True)

df

Unnamed: 0,country,capital,area,population
0,Brazil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,,1252.0
4,South Africa,,,


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazzil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, 3.286, 9597, 9597],
    "population": [200.4, 143.5, 1252, 1357, 1357],
}

df = pd.DataFrame(data)

# Remove duplicates judged on a subset of columns. With the default
# keep='first' the FIRST occurrence is kept (China stays, South Africa goes);
# pass keep='last' to keep the last occurrence instead.
df = df.drop_duplicates(subset=['area', 'population'])

df

Unnamed: 0,country,capital,area,population
0,Brazzil,Brasilia,8.516,200.4
1,Russia,Moscow,17.1,143.5
2,India,New Delhi,3.286,1252.0
3,China,Beijing,9597.0,1357.0


**Consistencia y Validación Lógica**

In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazzil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, 3.286, 9597, 9597],
    "population": [200.4, 143.5, 1252, 1357, 1357],
}

df = pd.DataFrame(data)

# Keep the rows where area is greater than 17 AND population is below 5000.
# Each condition is parenthesized and the two are combined element-wise with &.
# NOTE(review): the upper bound applies to "population", not "area", so rows
# with area 9597 pass the filter - confirm this is the intended check.
df = df[(df['area'] > 17) & (df['population'] < 5000)]

df

Unnamed: 0,country,capital,area,population
1,Russia,Moscow,17.1,143.5
3,China,Beijing,9597.0,1357.0
4,South Africa,Pretoria,9597.0,1357.0


In [None]:
from numpy import nan

# "area" holds the same value in every row.
# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazzil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 8.516, 8.516, 8.516, 8.516],
    "population": [200.4, 143.5, 1252, 1357, 1357],
}

df = pd.DataFrame(data)

# Drop constant columns (a single distinct value across all rows), since
# they carry no information: keep only columns with more than one unique value.
df = df.loc[:, df.nunique() > 1]

df

Unnamed: 0,country,capital,population
0,Brazzil,Brasilia,200.4
1,Russia,Moscow,143.5
2,India,New Delhi,1252.0
3,China,Beijing,1357.0
4,South Africa,Pretoria,1357.0


**Transformación de Tipos y Filtrado**

In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazzil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, 3.286, 9597, 9597],
    "population": [200.4, 143.5, 1252, 1357, 1357],
}

df = pd.DataFrame(data)

# Change a column's data type (float -> int).
# The conversion truncates the fractional part (8.516 becomes 8), so
# precision is lost.
df['area'] = df['area'].astype(int)

df.dtypes

Unnamed: 0,0
country,object
capital,object
area,int64
population,float64


In [None]:
from numpy import nan

# "fecha" holds dates as day/month/year text.
# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazzil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "fecha": ["15/05/2024", "15/05/2024", "15/05/2024", "15/05/2024", "15/05/2024"],
    "population": [200.4, 143.5, 1252, 1357, 1357],
}

df = pd.DataFrame(data)

# Convert the text column to datetime. The format is given explicitly because
# the strings are day-first: without it pandas has to guess, emits a parsing
# warning, and an ambiguous date such as "05/06/2024" would be read month-first.
df['fecha'] = pd.to_datetime(df['fecha'], format="%d/%m/%Y")

df


  df['fecha'] = pd.to_datetime(df['fecha'])


Unnamed: 0,country,capital,fecha,population
0,Brazzil,Brasilia,2024-05-15,200.4
1,Russia,Moscow,2024-05-15,143.5
2,India,New Delhi,2024-05-15,1252.0
3,China,Beijing,2024-05-15,1357.0
4,South Africa,Pretoria,2024-05-15,1357.0


In [None]:
from numpy import nan

# Named `data` (not `dict`) so the builtin `dict` type is not shadowed.
data = {
    "country": ["Brazzil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
    "area": [8.516, 8.516, 8.516, 8.516, 8.516],
    "population": [200.4, 143.5, 1252, 1357, 1357],
}

df = pd.DataFrame(data)

# Filter rows with a boolean condition: keep those with population below 1000.
df = df[df['population'] < 1000]

df



Unnamed: 0,country,capital,area,population
0,Brazzil,Brasilia,8.516,200.4
1,Russia,Moscow,8.516,143.5


**Agregación y Agrupamiento**

In [None]:
# Step 1: raw records, stored as one list per column.
datos = {
    'id': [1, 2, 3, 4, 5],
    'categoria': ['Electrónica', 'Hogar', 'Electrónica', 'Hogar', 'Ropa'],
    'ventas': [800, 150, 1200, 350, 100],
    'producto': ['Laptop', 'Lámpara', 'Smartphone', 'Silla', 'Camiseta'],
}

# Step 2: build the pandas DataFrame from the dictionary.
df = pd.DataFrame(data=datos)

# Step 3: average sales per category - group the "ventas" values by the
# matching "categoria" label and take the mean of each group.
promedio_ventas = df['ventas'].groupby(df['categoria']).mean()

promedio_ventas

Unnamed: 0,id,categoria,ventas,producto
0,1,Electrónica,800,Laptop
1,2,Hogar,150,Lámpara
2,3,Electrónica,1200,Smartphone
3,4,Hogar,350,Silla
4,5,Ropa,100,Camiseta


In [None]:
import pandas as pd

# Sales records: one row per (region, month, product) combination.
data = {
    'Region': ['Norte', 'Norte', 'Sur', 'Sur', 'Norte', 'Norte', 'Sur', 'Sur'],
    'Mes': ['Enero', 'Febrero', 'Enero', 'Febrero', 'Enero', 'Febrero', 'Enero', 'Febrero'],
    'Producto': ['Televisor', 'Televisor', 'Televisor', 'Televisor', 'Radio', 'Radio', 'Radio', 'Radio'],
    'Ventas': [1500, 1200, 900, 1100, 400, 450, 300, 320],
}
df = pd.DataFrame(data)

# Pivot table: regions as rows, months as columns, and each cell holding the
# sum of "Ventas" over all records (products) that fall into it.
pivote_ventas = pd.pivot_table(
    df,
    values='Ventas',
    index='Region',
    columns='Mes',
    aggfunc='sum',
)

# Show the raw table, a separator line, then the pivoted table.
print(df, " ", pivote_ventas, sep="\n")

  Region      Mes   Producto  Ventas
0  Norte    Enero  Televisor    1500
1  Norte  Febrero  Televisor    1200
2    Sur    Enero  Televisor     900
3    Sur  Febrero  Televisor    1100
4  Norte    Enero      Radio     400
5  Norte  Febrero      Radio     450
6    Sur    Enero      Radio     300
7    Sur  Febrero      Radio     320
 
Mes     Enero  Febrero
Region                
Norte    1900     1650
Sur      1200     1420


In [None]:
# Example dataset with one categorical column ("Color").
data = {
    'ID': [1, 2, 3, 4],
    'Color': ['Rojo', 'Verde', 'Azul', 'Rojo'],
}
df = pd.DataFrame(data)

# One-Hot Encoding: replace "Color" with one indicator column per distinct
# value (Color_Azul, Color_Rojo, Color_Verde).
df_encoded = pd.get_dummies(data=df, columns=['Color'])

print(df_encoded)

   ID  Color_Azul  Color_Rojo  Color_Verde
0   1       False        True        False
1   2       False       False         True
2   3        True       False        False
3   4       False        True        False
