In [None]:
# Attach the user's Google Drive to the Colab runtime so files under it can be read by path.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [3]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
# Show scikit-learn estimators as diagrams when they are displayed as cell output.
set_config(display='diagram')

In [None]:
# Path of the sales-prediction CSV inside the mounted Google Drive.
filename = '/content/drive/MyDrive/CargaData/PandasParaManipulacionDeDatos/sales_predictions.csv'

# Read the file into a DataFrame, then print its columns, dtypes and non-null counts.
ventas_df = pd.read_csv(filepath_or_buffer=filename)
ventas_df.info()

In [None]:
# Fill the gaps in Item_Weight and Outlet_Size by copying the value of the previous row.
# Series.ffill() is used because fillna(method='ffill') is deprecated since pandas 2.1.
# A missing value in the very first row has no previous row to copy from and stays NaN.
# NOTE(review): forward-fill depends on row order and runs before the train/test split;
# confirm this is intended, since the pipeline further down also imputes missing values.
ventas_df['Item_Weight'] = ventas_df['Item_Weight'].ffill()
ventas_df['Outlet_Size'] = ventas_df['Outlet_Size'].ffill()

# Re-check the non-null counts of the two columns that were filled.
ventas_df.info()

In [6]:
# Remove duplicates: of each group of fully identical rows, only the first one is kept.
ventas_df = ventas_df.drop_duplicates(subset=None, keep='first')

In [21]:
# Preview the first five rows of the cleaned data.
ventas_df.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [11]:
# Target: the column to be predicted.
y = ventas_df['Item_Outlet_Sales']
# Feature matrix: every remaining column.
X = ventas_df.drop('Item_Outlet_Sales', axis='columns')

# Split both X and y into train and test parts; random_state=42 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [12]:
# Selectors: pick columns by dtype (object columns vs. numeric columns).
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

# Imputers: most frequent value and mean.
freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')

# Scaler
scaler = StandardScaler()

# One-hot encoder that returns a dense array and ignores categories not seen during fit.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old name, so
# the new keyword is tried first and the old one is used only when it is not recognised.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [17]:
# Numeric pipeline: impute missing values with the mean, then standardise.
numeric_pipe = make_pipeline(
    mean_imputer,  # step 1: fill missing values
    scaler,        # step 2: scale the imputed values
)
numeric_pipe

In [18]:
# Categorical pipeline: impute missing values with the most frequent one, then one-hot encode.
categorical_pipe = make_pipeline(
    freq_imputer,  # step 1: fill missing values
    ohe,           # step 2: encode categories as indicator columns
)
categorical_pipe

In [19]:
# (transformer, column selector) pairs consumed by the column transformer.
category_tuple = (categorical_pipe, cat_selector)
number_tuple = (numeric_pipe, num_selector)

# Column transformer: numeric pair first, categorical pair second;
# columns matched by neither selector are passed through unchanged.
preprocessor = make_column_transformer(
    number_tuple,
    category_tuple,
    remainder='passthrough',
)
preprocessor

In [20]:
# Fit the preprocessor on the training split only.
preprocessor.fit(X_train)

# Apply the fitted preprocessor to the train and test splits, in that order.
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)