In [149]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
from pycaret.regression import *
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [150]:
# 1. Carga de Datos
data = pd.read_csv('train.csv')  


In [151]:
# Visualizar las primeras filas del DataFrame
print(data.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [152]:
# 2. Exploración inicial de datos
print("Información general del conjunto de datos:")
print(data.info())

Información general del conjunto de datos:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQu

In [153]:
# División de datos en 80% entrenamiento y 20% prueba
train_data = data.sample(frac=0.8, random_state=42)
test_data = data.drop(train_data.index)


In [154]:
# Imprimir información sobre las formas de los conjuntos de entrenamiento y prueba
print("Forma del conjunto de entrenamiento:", train_data.shape)
print("Forma del conjunto de prueba:", test_data.shape)


Forma del conjunto de entrenamiento: (1168, 81)
Forma del conjunto de prueba: (292, 81)


In [155]:
# Estadísticas descriptivas
print("Estadísticas descriptivas:")
print(data.describe())

Estadísticas descriptivas:
                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000  ...   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726  ...   
std       1.112799    30.202904     20.645407   

In [156]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
categorical_data = data.select_dtypes(include=['object'])

encoded_data = categorical_data.apply(le.fit_transform)
numeric_data = data.select_dtypes(include=[np.number])

data_encoded = pd.concat([numeric_data, encoded_data], axis=1)
correlation_matrix = data_encoded.corr()

In [157]:
# Pipeline para la ingeniería de características
X_train = train_data.drop('SalePrice', axis=1)
y_train = train_data['SalePrice']

In [158]:
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns


In [159]:
# Imputación de variables numéricas
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [160]:
# Imputación de variables categóricas y codificación
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [161]:
# Combinar transformadores
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [162]:
# Modelo para selección de características basado en Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
feature_selector = SelectFromModel(model)

In [163]:
# Eliminación de outliers con Isolation Forest
outlier_detector = IsolationForest(contamination=0.05)

# Crear pipeline completo
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('outlier_detector', outlier_detector),
    ('feature_selector', feature_selector)
])

In [164]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Supongamos que 'X_train' es tu conjunto de datos de características y 'y_train' es tu conjunto de datos objetivo

# Crear un transformador para codificar las columnas categóricas usando OneHotEncoder
categorical_features = ['OpenPorchSF', 'OverallQual']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combinar los transformadores en un preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

# Crear el pipeline con el preprocesador y el estimador final
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('estimator', LinearRegression())])

# Ajustar el pipeline a los datos
pipeline.fit(X_train, y_train)

In [165]:
# Configuración de PyCaret
exp1 = setup(data, target='SalePrice', session_id=42, numeric_features=numeric_features.tolist(), ordinal_features={})


Unnamed: 0,Description,Value
0,Session id,42
1,Target,SalePrice
2,Target type,Regression
3,Original data shape,"(1460, 81)"
4,Transformed data shape,"(1460, 280)"
5,Transformed train set shape,"(1021, 280)"
6,Transformed test set shape,"(439, 280)"
7,Ordinal features,4
8,Numeric features,37
9,Categorical features,43


In [166]:
# Comparación de modelos
best_models = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,17090.9279,981932534.3781,30153.6025,0.8405,0.1389,0.0979,0.387
lightgbm,Light Gradient Boosting Machine,17548.0422,988254053.2139,30322.7254,0.8382,0.1389,0.0983,0.36
rf,Random Forest Regressor,18450.8605,1055867698.9889,31526.6534,0.8256,0.1508,0.1078,0.768
llar,Lasso Least Angle Regression,17717.9752,1176214277.9389,31279.1126,0.8121,0.1516,0.1052,0.205
et,Extra Trees Regressor,19563.4355,1234132700.1198,33718.4989,0.802,0.1558,0.1124,0.617
ridge,Ridge Regression,19090.8429,1267966543.2878,33275.5069,0.7959,0.1653,0.1126,0.201
en,Elastic Net,19929.157,1519646843.7745,35949.3565,0.7634,0.1634,0.1131,0.268
omp,Orthogonal Matching Pursuit,21285.9734,1526956058.6433,36302.6817,0.7591,0.1892,0.1244,0.214
ada,AdaBoost Regressor,25315.3198,1570041249.2822,38849.1981,0.7403,0.2057,0.1627,0.323
br,Bayesian Ridge,24901.6471,2011260231.9853,42261.098,0.6813,0.2023,0.1421,0.22


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

In [167]:
# Seleccionar los tres mejores modelos
top3_models = best_models[:3]

In [168]:
X_train_encoded = pd.get_dummies(X_train)

In [169]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
X_train_encoded = X_train.apply(label_encoder.fit_transform)

In [170]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [171]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

In [173]:
from pycaret.regression import setup, create_model, plot_model

# Importar datos y configurar el entorno
# setup(...)

# Crear el modelo
model = create_model('rf')

# Evaluación del modelo
evaluate_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,19008.1377,1428279713.865,37792.5881,0.8256,0.149,0.1041
1,16710.8031,489011834.5917,22113.612,0.9106,0.1269,0.0991
2,22510.1676,1878413848.8779,43340.672,0.6426,0.1868,0.1346
3,23412.2927,2092119751.7486,45739.6956,0.723,0.1902,0.1308
4,20149.1013,1179899216.0203,34349.6611,0.838,0.1871,0.1316
5,15474.7482,662973708.4364,25748.2758,0.8785,0.1417,0.0969
6,18901.2115,822847660.2249,28685.3213,0.8281,0.1447,0.1076
7,16159.3879,631470512.1386,25129.077,0.8921,0.1357,0.0968
8,16553.9796,745101879.9366,27296.5544,0.8232,0.141,0.0953
9,15628.7749,628558864.0489,25071.0762,0.8939,0.1052,0.0811


<Figure size 2200x600 with 0 Axes>

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [174]:
# Obtener datos de prueba
X_test = test_data.drop('SalePrice', axis=1)
y_test = test_data['SalePrice']


In [177]:
print(type(model))

<class 'numpy.ndarray'>


In [191]:
from sklearn.linear_model import LinearRegression

In [192]:
# Crear una instancia del modelo de regresión lineal
model = LinearRegression()

In [198]:
# Crear una instancia de LabelEncoder
encoder = LabelEncoder()

In [199]:
# Convertir las variables categóricas en numéricas
X_train_encoded = X_train.copy()  # Crear una copia de X_train
for column in X_train_encoded.columns:
    if X_train_encoded[column].dtype == 'object':
        X_train_encoded[column] = encoder.fit_transform(X_train_encoded[column])

In [201]:
from sklearn.impute import SimpleImputer

# Crear el imputador
imputer = SimpleImputer(strategy='mean')

# Aplicar el imputador a los datos de entrada
X_train_imputed = imputer.fit_transform(X_train_encoded)

# Entrenar el modelo con los datos imputados
model.fit(X_train_imputed, y_train)

In [243]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [244]:
# Crear un modelo de regresión lineal
model = LinearRegression()

In [248]:
y_pred = model.predict(X_test)

ValueError: could not convert string to float: 'RL'

In [250]:
# Entrenar el modelo con los datos de entrenamiento (X_train, y_train)
model.fit(X_train, y_train)

# Hacer predicciones en los datos de prueba (X_test)
y_pred = model.predict(X_test)

# Calcular el coeficiente de determinación R-squared
r2 = r2_score(y_test, y_pred)

ValueError: could not convert string to float: 'RL'

In [251]:
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

ValueError: could not convert string to float: 'RL'

In [225]:
# RMSE (Error cuadrático medio)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

NameError: name 'y_pred' is not defined

In [226]:
# MSE (Error cuadrático medio)
mse = mean_squared_error(y_test, y_pred)


NameError: name 'y_pred' is not defined

In [227]:
# MAPE (Error porcentual absoluto medio)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

NameError: name 'y_pred' is not defined

In [254]:
# Imprimir las métricas
print(f'R2: {r2}')
print(f'RMSE: {rmse}')
print(f'MSE: {mse}')
print(f'MAPE: {mape}%')

NameError: name 'r2' is not defined