<a href="https://colab.research.google.com/github/BifoldTide/-Core-Proyecto-1.Parte-final/blob/main/(Core)_Proyecto_1_Parte_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Carga de datos

In [2]:
# Importación de bibliotecas
import pandas as pd
import numpy as np

In [3]:
# Cargar los datos
path = "/content/drive/MyDrive/BBDD SONDA/Retail_Sales_Dates_Clean.csv"
df = pd.read_csv(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0.1      1000 non-null   int64  
 1   Unnamed: 0        1000 non-null   int64  
 2   Transaction ID    1000 non-null   int64  
 3   Date              1000 non-null   object 
 4   Customer ID       1000 non-null   object 
 5   Gender            1000 non-null   object 
 6   Age               1000 non-null   int64  
 7   Product Category  1000 non-null   object 
 8   Quantity          1000 non-null   int64  
 9   Price per Unit    1000 non-null   int64  
 10  Total Amount      1000 non-null   int64  
 11  Normalized Sales  1000 non-null   float64
 12  Category          903 non-null    object 
 13  Month             1000 non-null   int64  
 14  Year              1000 non-null   int64  
dtypes: float64(1), int64(9), object(5)
memory usage: 117.3+ KB


In [4]:
# Eliminar columnas innecesarias para el modelo
df = df.drop(columns = ["Unnamed: 0.1", "Unnamed: 0", "Transaction ID", "Date", "Customer ID", "Normalized Sales", "Category"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Gender            1000 non-null   object
 1   Age               1000 non-null   int64 
 2   Product Category  1000 non-null   object
 3   Quantity          1000 non-null   int64 
 4   Price per Unit    1000 non-null   int64 
 5   Total Amount      1000 non-null   int64 
 6   Month             1000 non-null   int64 
 7   Year              1000 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 62.6+ KB


### Entrenamiento de modelos

In [5]:
# Orden de datos
X = df.drop(columns = "Total Amount")
y = df["Total Amount"]

# Clasificación de columnas
col_num = ["Age", "Quantity", "Price per Unit", "Month", "Year"]
col_nom = ["Gender", "Product Category"]

In [15]:
# Importar bibliotecas de modelos
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score

In [13]:
# entrenamiento de modelos
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 96)

# Preprocesador
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), col_num),
    ("nom", OneHotEncoder(handle_unknown='ignore'), col_nom),
])

# Modelos
pipeline_line = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LinearRegression())
])

pipeline_knn = Pipeline([
    ("preprocessing", preprocessor),
    ("model", KNeighborsRegressor(n_neighbors = 5))
])

pipeline_tree = Pipeline([
    ("preprocessing", preprocessor),
    ("model", DecisionTreeRegressor(max_depth=5, random_state=96))
])

pipeline_forest = Pipeline([
    ("preprocessing", preprocessor),
    ("model", RandomForestRegressor(n_estimators = 100, random_state = 96))
])

pipeline_xgb = Pipeline([
    ("pp", preprocessor),
    ("model", XGBRegressor(random_state=96))
  ])

mode_LGBM = Pipeline([
    ("pp", preprocessor),
    ("model", LGBMRegressor(random_state=96))
  ])

In [None]:
# Entrenar modelos
pipeline_line.fit(X_train, y_train)
pipeline_knn.fit(X_train, y_train)
pipeline_tree.fit(X_train, y_train)
pipeline_forest.fit(X_train, y_train)
pipeline_xgb.fit(X_train, y_train)
mode_LGBM.fit(X_train, y_train)

In [19]:
# Predicciones
line_pred = pipeline_line.predict(X_test)
knn_pred = pipeline_knn.predict(X_test)
tree_pred = pipeline_tree.predict(X_test)
forest_pred = pipeline_forest.predict(X_test)
xgb_pred = pipeline_xgb.predict(X_test)
lgbm_pred = mode_LGBM.predict(X_test)



In [24]:
print(f"Datos lineales: R2 - {r2_score(y_test, line_pred)}, MAE - {mean_absolute_error(y_test, line_pred)}, MSE - {mean_squared_error(y_test, line_pred)}")
print(f"Datos KNN: R2 - {r2_score(y_test, knn_pred)}, MAE - {mean_absolute_error(y_test, knn_pred)}, MSE - {mean_squared_error(y_test, knn_pred)}")
print(f"Datos de árbol: R2 - {r2_score(y_test, tree_pred)}, MAE - {mean_absolute_error(y_test, line_pred)}, MSE - {mean_squared_error(y_test, tree_pred)}")
print(f"Datos bosque: R2 - {r2_score(y_test, forest_pred)}, MAE - {mean_absolute_error(y_test, forest_pred)}, MSE - {mean_squared_error(y_test, forest_pred)}")
print(f"Datos xgb: R2 - {r2_score(y_test, xgb_pred)}, MAE - {mean_absolute_error(y_test, xgb_pred)}, MSE - {mean_squared_error(y_test, xgb_pred)}")
print(f"Datos lgbm: R2 - {r2_score(y_test, lgbm_pred)}, MAE - {mean_absolute_error(y_test, lgbm_pred)}, MSE - {mean_squared_error(y_test, lgbm_pred)}")


Datos lineales: R2 - 0.863740320220074, MAE - 176.01666656022155, MSE - 45786.50606919278
Datos KNN: R2 - 0.9421041620832622, MAE - 84.96, MSE - 19454.384
Datos de árbol: R2 - 1.0, MAE - 176.01666656022155, MSE - 0.0
Datos bosque: R2 - 1.0, MAE - 0.0, MSE - 0.0
Datos xgb: R2 - 1.0, MAE - 7.009887485764921e-05, MSE - 8.716480870418764e-09
Datos lgbm: R2 - 0.9999999992858779, MAE - 0.012625084894778269, MSE - 0.00023996208657656252


En este caso, el mejor modelo es RandomForest: 1, 0, 0