# Laboratorio 1

In [43]:
import pandas as pd
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, f1_score, accuracy_score
import numpy as np
import yaml

## Exploracion y Visualizacion de Datos

In [28]:
with open('data/params.yaml', 'r') as file:
    params = yaml.safe_load(file)

NameError: name '__file__' is not defined

In [3]:
data = pd.read_csv('data/data.csv')

print(data.head())

missing_values = data.isnull().sum()
print("Valores faltantes por columna:")
print(missing_values)

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
Valores faltantes por columna:
price               0
area                0
bedrooms            0
ba

## Procesamiento de datos

In [9]:
X = data[params['preprocessing']['features']]
y = data[params['preprocessing']['target']]

In [10]:
num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object', 'category']).columns

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(), cat_features)
    ]
)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=params['train']['test_size'], 
    random_state=params['train']['random_state']
)

In [13]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [14]:
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [15]:
print("Tamaño de X_train:", X_train.shape)
print("Tamaño de X_test:", X_test.shape)

Tamaño de X_train: (436, 5)
Tamaño de X_test: (109, 5)


In [21]:
models = {
    'linear_regression': LinearRegression(**params['train']['models']['linear_regression']),
    'random_forest': RandomForestRegressor(**params['train']['models']['random_forest']),
    'gradient_boosting': GradientBoostingRegressor(**params['train']['models']['gradient_boosting'])
}

In [24]:
results = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    joblib.dump(model, f"models/{model_name}.pkl")  # Cambia la ruta si es necesario
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    results[model_name] = mse
    print(f"{model_name} entrenado. MSE: {mse}")


linear_regression entrenado. MSE: 2292721545725.366
random_forest entrenado. MSE: 2624181958775.779
gradient_boosting entrenado. MSE: 2400360173639.197


In [23]:
best_model = min(results, key=results.get)
print("El mejor modelo es:", best_model, "con un MSE de:", results[best_model])

El mejor modelo es: linear_regression con un MSE de: 2292721545725.366


## Optimizacion y validacion cruzada

In [29]:
param_grid = {
    'linear_regression': {
        'fit_intercept': [True, False],
        'n_jobs': [None, -1]
    },
    'random_forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'gradient_boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
}

In [30]:
models = {
    'linear_regression': LinearRegression(),
    'random_forest': RandomForestRegressor(random_state=params['train']['random_state']),
    'gradient_boosting': GradientBoostingRegressor(random_state=params['train']['random_state'])
}

In [31]:
best_models = {}
results = {}

In [37]:
for model_name, model in models.items():
    print(f"Optimizando {model_name}...")
    
    if model_name == 'linear_regression':
        search = GridSearchCV(
            model, param_grid[model_name], cv=5, scoring='neg_mean_squared_error', n_jobs=-1
        )
    else:
        search = RandomizedSearchCV(
            model, param_grid[model_name], n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=params['train']['random_state'], n_jobs=-1
        )
    
    search.fit(X_train, y_train)
    best_models[model_name] = search.best_estimator_
    cv_score = cross_val_score(search.best_estimator_, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    results[model_name] = {
        'best_params': search.best_params_,
        'cv_score': -np.mean(cv_score)
    }
    print(f"Mejores hiperparámetros para {model_name}: {search.best_params_}")
    print(f"Error de validación cruzada (MSE): {-np.mean(cv_score)}")

Optimizando linear_regression...
Mejores hiperparámetros para linear_regression: {'fit_intercept': True, 'n_jobs': None}
Error de validación cruzada (MSE): 1414425048943.1414
Optimizando random_forest...
Mejores hiperparámetros para random_forest: {'n_estimators': 50, 'min_samples_split': 10, 'max_depth': 20}
Error de validación cruzada (MSE): 1487186680444.4905
Optimizando gradient_boosting...
Mejores hiperparámetros para gradient_boosting: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}
Error de validación cruzada (MSE): 1530581551298.8262


In [42]:
# Identificar el mejor modelo basado en el resultado de la validación cruzada
best_model_name = min(results, key=lambda k: results[k]['cv_score'])
mejor_modelo = best_models[best_model_name]

print("\nEl mejor modelo es:", best_model_name)
print("Con MSE de validación cruzada:", results[best_model_name]['cv_score'])
print("Mejores hiperparámetros:", results[best_model_name]['best_params'])

# Evaluar el mejor modelo y calcular las métricas
metrics, predictions = evaluate_model(mejor_modelo, X_test, y_test, is_classification=False)

# Mostrar las métricas de rendimiento
print("\nMétricas de rendimiento del mejor modelo:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

# Obtener la importancia de características si es aplicable
if hasattr(mejor_modelo, 'feature_importances_'):
    feature_importances = mejor_modelo.feature_importances_
    features_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importances
    }).sort_values(by='Importance', ascending=False)
    print("\nImportancia de características:")
    print(features_df)
else:
    print("\nEl modelo seleccionado no proporciona información de importancia de características.")

# Exportar los resultados a CSV y Markdown
results_df = pd.DataFrame({
    'Metric': list(metrics.keys()),
    'Value': list(metrics.values())
})
results_csv_path = 'results/model_metrics.csv'
results_md_path = 'results/model_metrics.md'

# Crear la carpeta 'results' si no existe
if not os.path.exists('results'):
    os.makedirs('results')

# Exportar a CSV
results_df.to_csv(results_csv_path, index=False)
print(f"\nResultados exportados a {results_csv_path}")

# Exportar a Markdown
with open(results_md_path, 'w') as f:
    f.write(results_df.to_markdown(index=False))
print(f"Resultados exportados a {results_md_path}")

# Exportar la importancia de características si existe
if hasattr(mejor_modelo, 'feature_importances_'):
    feature_importances_csv_path = 'results/feature_importances.csv'
    features_df.to_csv(feature_importances_csv_path, index=False)
    print(f"Importancia de características exportada a {feature_importances_csv_path}")



El mejor modelo es: linear_regression
Con MSE de validación cruzada: 1414425048943.1414
Mejores hiperparámetros: {'fit_intercept': True, 'n_jobs': None}


NameError: name 'r2_score' is not defined