In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [15]:
#Lectura
data = pd.read_csv("customer_shopping_limpio.csv")

In [16]:
print(data.shape)
y_array = data["gasto_total"].values 
data = data.drop(columns=["quantity", "price","gasto_total"])
data.head(10)


(98553, 8)


Unnamed: 0,gender,age,category,payment_method,month
0,Female,28,Clothing,Credit Card,8
1,Male,21,Shoes,Debit Card,12
2,Male,20,Clothing,Cash,11
3,Female,66,Shoes,Credit Card,5
4,Female,53,Books,Cash,10
5,Female,28,Clothing,Credit Card,5
6,Female,49,Cosmetics,Cash,3
7,Female,32,Clothing,Credit Card,1
8,Male,69,Clothing,Credit Card,11
9,Female,60,Clothing,Credit Card,8


In [17]:
# Label Encoding solo para 'gender'
le = LabelEncoder()
data['gender'] = le.fit_transform(data['gender']) 

# Definir las columnas categóricas
columnas_categoricas = ['category', 'payment_method', 'month']

# Aplicar One-Hot Encoding
transformer = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(drop='first'), columnas_categoricas)],
    remainder='passthrough'  # Mantiene las columnas numéricas sin cambiar
)

# Aplicar la transformación
X = transformer.fit_transform(data)
# Verificar la forma de X
print("Shape of X after transformation:", X.shape)

# Convertir X de vuelta a una matriz densa si es un CSR (Compressed Sparse Row)
X_dense = X.toarray()

# Convertir X de vuelta a un DataFrame con los nombres adecuados de las columnas
ohe_columns = transformer.transformers_[0][1].get_feature_names_out(columnas_categoricas)

# Verificar cuántas columnas tenemos para One-Hot Encoding
print("Number of One-Hot encoded columns:", len(ohe_columns))

# Combinar las columnas One-Hot y las columnas numéricas
numerical_columns = ['gender', 'age']
print("Number of numerical columns:", len(numerical_columns))

# Asegúrate de que las columnas numéricas se añaden en el orden correcto
final_columns = list(ohe_columns) + numerical_columns
# Verificar el número total de columnas que estamos tratando de crear
print("Total number of columns in final_columns:", len(final_columns))
print(X)

X_df = pd.DataFrame(X_dense, columns=final_columns)

X_df.head()

Shape of X after transformation: (98553, 22)
Number of One-Hot encoded columns: 20
Number of numerical columns: 2
Total number of columns in final_columns: 22
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 373493 stored elements and shape (98553, 22)>
  Coords	Values
  (0, 0)	1.0
  (0, 7)	1.0
  (0, 15)	1.0
  (0, 21)	28.0
  (1, 3)	1.0
  (1, 8)	1.0
  (1, 19)	1.0
  (1, 20)	1.0
  (1, 21)	21.0
  (2, 0)	1.0
  (2, 18)	1.0
  (2, 20)	1.0
  (2, 21)	20.0
  (3, 3)	1.0
  (3, 7)	1.0
  (3, 12)	1.0
  (3, 21)	66.0
  (4, 17)	1.0
  (4, 21)	53.0
  (5, 0)	1.0
  (5, 7)	1.0
  (5, 12)	1.0
  (5, 21)	28.0
  (6, 1)	1.0
  (6, 10)	1.0
  :	:
  (98547, 6)	1.0
  (98547, 17)	1.0
  (98547, 20)	1.0
  (98547, 21)	50.0
  (98548, 4)	1.0
  (98548, 7)	1.0
  (98548, 16)	1.0
  (98548, 21)	45.0
  (98549, 2)	1.0
  (98549, 16)	1.0
  (98549, 20)	1.0
  (98549, 21)	27.0
  (98550, 2)	1.0
  (98550, 8)	1.0
  (98550, 10)	1.0
  (98550, 20)	1.0
  (98550, 21)	63.0
  (98551, 5)	1.0
  (98551, 10)	1.0
  (98551, 20)	1.0
  (98551

Unnamed: 0,category_Clothing,category_Cosmetics,category_Food & Beverage,category_Shoes,category_Souvenir,category_Technology,category_Toys,payment_method_Credit Card,payment_method_Debit Card,month_2,...,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,gender,age
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,28.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,21.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,20.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,53.0


In [None]:
X_array = X_df.values
print(type(X_array))  # Muestra <class 'numpy.ndarray'>
print(X_array.shape)  # Verifica las dimensiones
print(type(y_array))  # Muestra <class 'numpy.ndarray'>
print(y_array.shape)  # Verifica las dimensiones
y_array = y_array.reshape(-1,1)#Modificamos la dimensión del array para poder escalarlo ya que en un modelo de regresión es mejor escalar el target
print(y_array.shape)


<class 'numpy.ndarray'>
(98553, 22)
<class 'numpy.ndarray'>
(98553,)
(98553, 1)


In [21]:
from sklearn.model_selection import train_test_split
#Realizamos la partición en entrenamiento (80%), validación (10%) y prueba(10%)
x_tr, x_resto, y_tr, y_resto = train_test_split(X_array, y_array, test_size=0.2, random_state=23)

x_vl, x_test, y_vl, y_test = train_test_split(x_resto, y_resto, test_size=0.5, random_state=23) 

print(x_tr.shape,y_tr.shape)
print(x_vl.shape, y_vl.shape)
print(x_test.shape,y_test.shape)

(78842, 22) (78842, 1)
(9855, 22) (9855, 1)
(9856, 22) (9856, 1)


In [22]:
from sklearn.preprocessing import MinMaxScaler

#Minmaxscaler, si no declaro el feature range seria entre 0 y 1 pero aqui le doy el rango porque la longitud es negativa
x_scaler = MinMaxScaler()

#Fit_transform para cálcular máximos y minimos y escalar el set de entrenamiento
x_tr_s = x_scaler.fit_transform(x_tr)

#Algunas características del escalador
print('Características del escalador ajustado:')
print(x_scaler.data_min_, x_scaler.data_max_)

#Resultado
print('Resultado del escalamiento sobre "x_tr":')
print(f' Mínimos: {x_tr_s.min(axis=0)}')
print(f' Máximos: {x_tr_s.max(axis=0)}')

y_scaler = MinMaxScaler()
y_tr_s= y_scaler.fit_transform(y_tr)

#Algunas características del escalador
print('Características del escalador ajustado:')
print(y_scaler.data_min_, y_scaler.data_max_)

#Resultado
print('Resultado del escalamiento sobre "x_tr":')
print(f' Mínimos: {y_tr_s.min(axis=0)}')
print(f' Máximos: {y_tr_s.max(axis=0)}')

#Ahora escalamos los set de validacion y prueba pero con transform porque usa de referencia los valores escalados del set de entrenamiento
x_vl_s = x_scaler.transform(x_vl)
x_ts_s = x_scaler.transform(x_test)

print('Set de validacion:')
print(f'{x_vl_s.min(axis=0)},{x_vl_s.max(axis=0)}')
print('Set de prueba')
print(f'{x_ts_s.min(axis=0)},{x_ts_s.max(axis=0)}')




Características del escalador ajustado:
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0. 18.] [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1. 69.]
Resultado del escalamiento sobre "x_tr":
 Mínimos: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 Máximos: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Características del escalador ajustado:
[0.13598] [682.5]
Resultado del escalamiento sobre "x_tr":
 Mínimos: [0.]
 Máximos: [1.]
Set de validacion:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.],[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Set de prueba
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.],[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [23]:
#Ahora construimos el modelo de regresion lineal
from sklearn.linear_model import LinearRegression
model = LinearRegression()

#Entrenamos el modelo
model.fit(x_tr_s, y_tr_s)

In [24]:
#Hacemos las predicciones con los sets de validacion y prueba, pero escalados

y_vl_pred_s = model.predict(x_vl_s)
y_ts_pred_s = model.predict(x_ts_s)

print(y_vl_pred_s.min(), y_vl_pred_s.max())
print(y_ts_pred_s.min(), y_ts_pred_s.max())

-0.0010984969929706215 0.44189160610887895
-0.0011536464206180876 0.44204426597768526


In [26]:
# Limitar predicciones al rango [0, 1] porque no puede haber una prediccion negativa
y_vl_pred_s_clipped = np.clip(y_vl_pred_s, 0,1)
y_ts_pred_s_clipped = np.clip(y_ts_pred_s, 0,1)

#Debemos usar inverse_transform para realizar el escalamiento inverso

y_vl_pred = y_scaler.inverse_transform(y_vl_pred_s_clipped)
y_ts_pred = y_scaler.inverse_transform(y_ts_pred_s_clipped)

print(y_vl_pred.min(), y_vl_pred.max())
print(y_ts_pred.min(), y_ts_pred.max())

0.13598 301.6669127487112
0.13598 301.7710823504825
