# Regresion logistica: Ventas

En este ejercicio se muestran los fundamentos de la regresion logistica planteando uno de los primeros problemas que fueron solucionados mediante el uso de técnicas de Machine Learning: Predicción de categoría de producto

## Enunciado del ejercicio 

Se propone la construcción de un sistema de aprendizaje automatico capaz de predecir la categoria de un producto en ventas de un DataSet 

In [2]:
# Importar las bibliotecas necesarias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [9]:
# Cargar el dataset
file_path = "/home/abril/anaconda3/envs/Simulacion/datasets/datasets/sales_dataset_50000.csv" 
df = pd.read_csv(file_path)

In [14]:
features = ["Quantity_Sold", "Unit_Price", "Discount", "Region", "Payment_Method"]


In [15]:
print(df.columns)

Index(['Transaction_ID', 'Date', 'Product', 'Category', 'Quantity_Sold',
       'Unit_Price', 'Discount', 'Region', 'Payment_Method', 'Total'],
      dtype='object')


In [20]:
print(features)


['Quantity_Sold', 'Unit_Price', 'Discount']


In [22]:
features = ['Region', 'Payment_Method']  # Asegúrate de incluir todas las columnas necesarias


In [23]:
print(df.columns.tolist())


['Transaction_ID', 'Date', 'Product', 'Category', 'Quantity_Sold', 'Unit_Price', 'Discount', 'Region', 'Payment_Method', 'Total']


In [26]:
# Definir las características (columnas relevantes)
features = ["Quantity_Sold", "Unit_Price", "Discount", "Region", "Payment_Method"]

# Codificar variables categóricas (One-Hot Encoding)
df_encoded = pd.get_dummies(df[features], columns=["Region", "Payment_Method"], drop_first=True)


In [27]:
features = ["Quantity_Sold", "Unit_Price", "Discount"]  # Si faltan "Region" o "Payment_Method"


In [30]:
print(df[['Region', 'Payment_Method']].head())


  Region Payment_Method
0  South           Cash
1  North    Credit Card
2   West    Credit Card
3   West    Credit Card
4   East           Cash


In [31]:
df_encoded = pd.get_dummies(df, columns=["Region", "Payment_Method"], drop_first=True)


In [33]:
target = 'NombreDeLaColumnaDeEtiquetas'  # Reemplaza con el nombre correcto de la columna


In [34]:
print(df.columns)


Index(['Transaction_ID', 'Date', 'Product', 'Category', 'Quantity_Sold',
       'Unit_Price', 'Discount', 'Region', 'Payment_Method', 'Total'],
      dtype='object')


In [37]:
print(df.head())

                         Transaction_ID        Date     Product     Category  \
0  4c98f095-7aa9-4e73-a94b-a79e84c83cc7  2024-12-07  Smartphone  Electronics   
1  68ff3320-6da2-4747-8851-b6aa687445ab  2023-05-10     Monitor  Peripherals   
2  b48d6d6a-fdf3-4f0d-9da5-6c511bf016d5  2024-03-02      Laptop  Electronics   
3  ac017db5-a08e-471c-a685-bec091e29345  2023-12-01  Headphones  Accessories   
4  571c3ded-3a36-451b-940a-dd489d1c303b  2024-12-31      Tablet  Electronics   

   Quantity_Sold  Unit_Price  Discount Region Payment_Method    Total  
0              9      958.94      0.15  South           Cash  7335.89  
1              4     1045.36      0.39  North    Credit Card  2550.68  
2              7     1929.63      0.49   West    Credit Card  6888.78  
3              9      710.64      0.41   West    Credit Card  3773.50  
4              2       14.76      0.01   East           Cash    29.22  


In [40]:
target = "Total"  # Reemplaza con el nombre exacto de la columna

In [41]:
print(df[target].isnull().sum())  # Esto mostrará cuántos valores nulos hay


0


In [42]:
print(df_encoded.head())
print(df.head())


                         Transaction_ID        Date     Product     Category  \
0  4c98f095-7aa9-4e73-a94b-a79e84c83cc7  2024-12-07  Smartphone  Electronics   
1  68ff3320-6da2-4747-8851-b6aa687445ab  2023-05-10     Monitor  Peripherals   
2  b48d6d6a-fdf3-4f0d-9da5-6c511bf016d5  2024-03-02      Laptop  Electronics   
3  ac017db5-a08e-471c-a685-bec091e29345  2023-12-01  Headphones  Accessories   
4  571c3ded-3a36-451b-940a-dd489d1c303b  2024-12-31      Tablet  Electronics   

   Quantity_Sold  Unit_Price  Discount    Total  Region_North  Region_South  \
0              9      958.94      0.15  7335.89         False          True   
1              4     1045.36      0.39  2550.68          True         False   
2              7     1929.63      0.49  6888.78         False         False   
3              9      710.64      0.41  3773.50         False         False   
4              2       14.76      0.01    29.22         False         False   

   Region_West  Payment_Method_Cash  Payment

In [43]:
# Separar características (X) y etiquetas (y)
X = df_encoded
y = df[target]

In [44]:
 #Dividir los datos en conjuntos de entrenamiento y prueba (80%-20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [46]:
print(X_train.head())
print(X_train.dtypes)

                             Transaction_ID        Date     Product  \
39087  d4919b5c-0e08-4d9c-abef-90b48ca26df0  2023-05-04      Tablet   
30893  40e1c876-bb21-4bb4-aba6-0db244b29976  2023-04-08     Monitor   
45278  33d17903-a264-4893-86ab-9087e794160c  2023-10-13  Smartphone   
16398  791a5e43-fcf0-43c0-93d9-d2ab88cc701d  2024-03-22      Tablet   
13653  dc61f326-21a5-4b2c-a618-c903aaf9e744  2024-03-26     Monitor   

          Category  Quantity_Sold  Unit_Price  Discount    Total  \
39087  Electronics              2      757.13      0.20  1211.41   
30893  Electronics              1     1158.47      0.08  1065.79   
45278  Accessories              6     1437.02      0.10  7759.91   
16398  Peripherals              6     1225.54      0.23  5661.99   
13653  Peripherals              3      528.60      0.45   872.19   

       Region_North  Region_South  Region_West  Payment_Method_Cash  \
39087         False         False         True                False   
30893         False   

In [47]:
from sklearn.compose import make_column_selector

num_columns = X_train.select_dtypes(include=['float64', 'int64']).columns
X_train_numeric = X_train[num_columns]
X_test_numeric = X_test[num_columns]


In [51]:
for col in X_train.columns:
    try:
        X_train[col].astype(float)
    except ValueError:
        print(f"Columna con valores no convertibles: {col}")
        print(X_train[col].unique())


Columna con valores no convertibles: Transaction_ID
['d4919b5c-0e08-4d9c-abef-90b48ca26df0'
 '40e1c876-bb21-4bb4-aba6-0db244b29976'
 '33d17903-a264-4893-86ab-9087e794160c' ...
 '182eb2e2-32af-409f-9b29-a0912c9c8e5e'
 'ed9f9d5e-ce10-4daf-a367-a50bb8dab049'
 '23647335-54ef-440e-8163-f82f45f950f0']
Columna con valores no convertibles: Date
['2023-05-04' '2023-04-08' '2023-10-13' '2024-03-22' '2024-03-26'
 '2023-05-21' '2024-12-04' '2024-03-29' '2023-02-12' '2024-10-05'
 '2024-05-16' '2024-12-31' '2024-05-03' '2023-10-21' '2023-04-16'
 '2024-05-26' '2024-07-22' '2024-05-19' '2023-06-15' '2023-05-03'
 '2023-09-10' '2024-08-05' '2023-06-01' '2024-11-12' '2023-09-05'
 '2023-12-05' '2023-09-22' '2023-07-28' '2024-02-27' '2023-11-30'
 '2023-11-16' '2023-05-27' '2024-01-25' '2023-05-10' '2023-09-11'
 '2023-05-02' '2024-06-25' '2023-02-23' '2024-07-21' '2023-12-20'
 '2024-06-17' '2023-11-22' '2024-04-16' '2024-07-06' '2024-09-11'
 '2024-02-03' '2023-04-10' '2024-01-02' '2024-01-03' '2024-01-29'
 

In [53]:
print(X_train.dtypes)


Transaction_ID                    object
Date                              object
Product                           object
Category                          object
Quantity_Sold                      int64
Unit_Price                       float64
Discount                         float64
Total                            float64
Region_North                        bool
Region_South                        bool
Region_West                         bool
Payment_Method_Cash                 bool
Payment_Method_Credit Card          bool
Payment_Method_Digital Wallet       bool
dtype: object


In [54]:
numeric_features = X_train.select_dtypes(include=["int64", "float64"])
X_train_scaled = scaler.fit_transform(numeric_features)


In [55]:
print(X_train.isnull().sum())  # Verifica valores nulos
print(X_train.head())  # Inspecciona visualmente los datos


Transaction_ID                   0
Date                             0
Product                          0
Category                         0
Quantity_Sold                    0
Unit_Price                       0
Discount                         0
Total                            0
Region_North                     0
Region_South                     0
Region_West                      0
Payment_Method_Cash              0
Payment_Method_Credit Card       0
Payment_Method_Digital Wallet    0
dtype: int64
                             Transaction_ID        Date     Product  \
39087  d4919b5c-0e08-4d9c-abef-90b48ca26df0  2023-05-04      Tablet   
30893  40e1c876-bb21-4bb4-aba6-0db244b29976  2023-04-08     Monitor   
45278  33d17903-a264-4893-86ab-9087e794160c  2023-10-13  Smartphone   
16398  791a5e43-fcf0-43c0-93d9-d2ab88cc701d  2024-03-22      Tablet   
13653  dc61f326-21a5-4b2c-a618-c903aaf9e744  2024-03-26     Monitor   

          Category  Quantity_Sold  Unit_Price  Discount    Total  \
39

In [58]:
print(X_train.dtypes)


Transaction_ID                    object
Date                              object
Product                           object
Category                          object
Quantity_Sold                      int64
Unit_Price                       float64
Discount                         float64
Total                            float64
Region_North                        bool
Region_South                        bool
Region_West                         bool
Payment_Method_Cash                 bool
Payment_Method_Credit Card          bool
Payment_Method_Digital Wallet       bool
dtype: object


In [59]:
X_train_numeric = X_train.select_dtypes(include=['float64', 'int64'])
X_test_numeric = X_test.select_dtypes(include=['float64', 'int64'])


In [60]:
X_train_scaled = scaler.fit_transform(X_train_numeric)
X_test_scaled = scaler.transform(X_test_numeric)


In [64]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
X_train['Unit_Price'] = label_encoder.fit_transform(X_train['Unit_Price'])
X_test['Unit_Price'] = label_encoder.transform(X_test['Unit_Price'])


In [69]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
X_train['Discount'] = label_encoder.fit_transform(X_train['Discount'])
X_test['Discount'] = label_encoder.transform(X_test['Discount'])

In [74]:
if 'columna_categorica' in X_train.columns:
    X_train['Unit_Price'] = label_encoder.fit_transform(X_train['Unit_Price'])
    X_test['Unit_Price'] = label_encoder.transform(X_test['Unit_Price'])
else:
    print("La columna 'columna_categorica' no existe.")


La columna 'columna_categorica' no existe.


In [76]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
X_train_encoded = encoder.fit_transform(X_train[['Unit_Price']])
X_test_encoded = encoder.transform(X_test[['Unit_Price']])


In [78]:
# Escalar las características numéricas
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled = scaler.transform(X_test)

In [80]:
print(y_train.unique())  # Para ver los valores únicos en y_train


[ 1211.41  1065.79  7759.91 ... 14907.59  5570.38 13799.34]


In [83]:
import pandas as pd
y_train = pd.cut(y_train, bins=[-float('inf'), 0, 10, 20, float('inf')], labels=[0, 1, 2, 3])


In [84]:
from sklearn.linear_model import LinearRegression

# Crear el modelo de regresión lineal
model = LinearRegression()

# Entrenar el modelo
model.fit(X_train_scaled, y_train)


In [85]:
# Entrenar el modelo de regresión logística
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)

In [86]:
# Realizar predicciones
y_pred = model.predict(X_test_scaled)


In [88]:
from sklearn.metrics import mean_squared_error, r2_score

# Predicciones del modelo
y_pred = model.predict(X_test_scaled)

# Evaluar el modelo
print("=== Regresión ===")
print("Error cuadrático medio:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))


=== Regresión ===
Error cuadrático medio: 29992040.75489026
R²: -1.3355160084490798


In [90]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)


In [92]:
pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [94]:
from sklearn.metrics import mean_squared_error, r2_score

print("MSE:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))


MSE: 29992040.75489026
R2 Score: -1.3355160084490798
