### José Antonio Tapia Godínez 
#### CU 214553
##### MAT‐34710‐001 Métodos de Gran Escala (Arquitectura de Productos de Datos)

In [1]:


# Importar las librerías necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV

# Load the training data (path is relative to the current working directory)
df_train = pd.read_csv('../data/train.csv')

# Feature engineering: numeric predictors fed to the model
features = [
    'LotArea', 'OverallQual', 'OverallCond', '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr',
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'KitchenAbvGr', 'Fireplaces', 'GarageCars',
    'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
]

# Log-transform the skewed target; predictions are mapped back with np.exp
# before they are written out.
df_train['SalePrice'] = np.log(df_train['SalePrice'])

# Standardize the predictors. This overwrites the raw feature columns of
# df_train with their standardized values.
scaler = StandardScaler()
df_train[features] = scaler.fit_transform(df_train[features])

# Dependent and independent variables for training
X_train = df_train[features]
y_train = df_train['SalePrice']

# Imputer that replaces missing values with the column mean
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the training features and transform them
X_train_imputed = imputer.fit_transform(X_train)

# Automatic feature selection: recursive feature elimination with 5-fold
# cross-validation, using a linear regression to rank the features.
selector = RFECV(LinearRegression(), step=1, cv=5)
X_train_selected = selector.fit_transform(X_train_imputed, y_train)

# XGBoost regressor whose hyperparameters are tuned below
model = XGBRegressor()

# Hyperparameter search: 3 x 3 x 3 grid, each candidate scored with 5-fold CV
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5]
}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_selected, y_train)

# Best model found by the grid search
best_model = grid_search.best_estimator_

# Load the data to predict on
predict_path = '../data/test.csv'
df_predict = pd.read_csv(predict_path)

# Prepare the prediction data with the SAME fitted transformers, in the same
# order as training: scale -> impute -> select. The model was trained on
# standardized features, so the prediction features must be standardized with
# the already-fitted scaler as well; skipping this step feeds raw-scale values
# to a model whose split thresholds were learned on standardized values.
# The scaled copy is kept separate so df_predict (and the CSV written below)
# still contains the original, unscaled feature columns.
X_predict = pd.DataFrame(
    scaler.transform(df_predict[features]),
    columns=features,
    index=df_predict.index,
)
X_predict_imputed = imputer.transform(X_predict)
X_predict_selected = selector.transform(X_predict_imputed)

# Predict log-prices with the tuned model
predictions = best_model.predict(X_predict_selected)

# Add a 'SalePrice' column with the predictions, undoing the log transform
df_predict['SalePrice'] = np.exp(predictions)

# Save the DataFrame as a CSV file
output_path = '../tests/Approved_test_modeled.csv'
df_predict.to_csv(output_path, index=False)

# Helper to compute the MAPE (mean absolute percentage error)
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100`` element-wise.

    Args:
        y_true: Observed values (array, Series, or plain sequence of numbers).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The MAPE as a float percentage.

    Note:
        Entries where ``y_true`` is zero are divided by zero and follow
        NumPy's floating-point semantics (inf/nan); they are not filtered.
    """
    # Coerce to float arrays so plain lists/tuples work, not only array-likes
    # that already support element-wise arithmetic.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

# In-sample evaluation: score the tuned model on the same rows it was trained
# on. y_train holds log(SalePrice), so every metric below is on the log scale
# and reflects training fit rather than held-out performance.
train_predictions = best_model.predict(X_train_selected)

# Error metrics between the log-price targets and the model's fitted values
mae_model = mean_absolute_error(y_train, train_predictions)
mape_model = mean_absolute_percentage_error(y_train, train_predictions)
r2_model = r2_score(y_train, train_predictions)
mse_model = mean_squared_error(y_train, train_predictions)
rmse_model = np.sqrt(mse_model)  # RMSE derived from the MSE above

# Single-row summary table, one column per metric
df_metrics = pd.DataFrame([
    {
        'Model': 'XGBoost',
        'MSE': mse_model,
        'MAPE': mape_model,
        'MAE': mae_model,
        'RMSE': rmse_model,
        'R2': r2_model,
    }
])

# Show the metrics table
print(df_metrics)


     Model       MSE     MAPE       MAE      RMSE        R2
0  XGBoost  0.014471  0.73261  0.087564  0.120297  0.909243
