In [None]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
test_data = pd.read_csv("test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [None]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=1
)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission2.csv', index=False)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Dividir el conjunto de datos en entrenamiento y prueba para evaluar el modelo
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

# Entrenar el modelo con los datos de entrenamiento
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X_train, y_train)

# Predicciones en el conjunto de validación
y_pred = model.predict(X_val)

# Calcular métricas de evaluación
accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
class_report = classification_report(y_val, y_pred)

# Mostrar resultados
print(f"Precisión del modelo: {accuracy:.4f}")
print("\nMatriz de confusión:")
print(conf_matrix)
print("\nInforme de clasificación:")
print(class_report)


Precisión del modelo: 0.7598

Matriz de confusión:
[[90 16]
 [27 46]]

Informe de clasificación:
              precision    recall  f1-score   support

           0       0.77      0.85      0.81       106
           1       0.74      0.63      0.68        73

    accuracy                           0.76       179
   macro avg       0.76      0.74      0.74       179
weighted avg       0.76      0.76      0.76       179



In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X, y)

print("Mejores hiperparámetros:", grid_search.best_params_)


Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tota

In [None]:
# ==============================================================================
# ESTA ES LA CONTINUACIÓN DE TU SCRIPT DESPUÉS DE LA SECCIÓN DE PREPROCESAMIENTO
# Y LA ELIMINACIÓN DE LA COLUMNA 'Id' TANTO EN EL CONJUNTO DE ENTRENAMIENTO
# COMO EN EL DE PRUEBA.
# ==============================================================================

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import pandas as pd
# Asegúrate de que las variables 'df', 'target_variable_name', 'scaler',
# 'X', 'y', 'columns_to_drop', 'non_numeric_cols', 'features_to_normalize'
# y 'columns_to_drop_low_corr' estén definidas de las celdas anteriores.

# Si no están definidas, necesitarías pasar los resultados de tu preprocesamiento
# anterior (como df_scaled) a estas secciones, o ejecutar todo el script.

print("\n--- Iniciando entrenamiento y evaluación del modelo ---")

# --- División de datos de entrenamiento y validación ---
# X e y ya deberían estar definidos de tu preprocesamiento anterior,
# donde X es df_scaled sin 'SalePrice' y y es 'SalePrice'.
# Asegúrate que X no contenga 'Id' en este punto.
print(f"\nDimensiones de X antes del split: {X.shape}")
print(f"Dimensiones de y antes del split: {y.shape}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print(f"\nDimensiones de X_train: {X_train.shape}")
print(f"Dimensiones de X_test: {X_test.shape}")
print(f"Dimensiones de y_train: {y_train.shape}")
print(f"Dimensiones de y_test: {y_test.shape}")


# --- Inicialización y Entrenamiento del Modelo Random Forest (Modelo Base) ---
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

print("\n--- Modelo Random Forest Regressor inicializado (Base) ---")
print(model)

print("\n--- Entrenando el modelo base ---")
model.fit(X_train, y_train)

# --- Evaluación del Modelo Base ---
print("\n--- Evaluando el modelo base en el conjunto de prueba ---")
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse) # Calcula la raíz cuadrada del MSE
r2 = r2_score(y_test, y_pred)

print("\n--- Resultados de la Evaluación del Modelo Base ---")
print(f"Error Absoluto Medio (MAE): {mae:.2f}")
print(f"Error Cuadrático Medio (MSE): {mse:.2f}")
print(f"Raíz del Error Cuadrático Medio (RMSE): {rmse:.2f}")
print(f"Coeficiente de Determinación (R2 Score): {r2:.2f}")


# --- Optimización de Hiperparámetros con GridSearchCV ---
print("\n--- Iniciando búsqueda de hiperparámetros con GridSearchCV ---")

param_grid = {
    'n_estimators': [100, 200], # Probar 100 y 200 árboles
    'max_features': ['sqrt', 'log2', 0.8], # Probar sqrt(n_features), log2(n_features), 80% de features
    'max_depth': [10, 20], # Probar profundidad 10, 20, o sin límite (None)
    'min_samples_split': [2, 5] # Probar mínimo 2 y 5 muestras para dividir
}

base_model = RandomForestRegressor(random_state=42, n_jobs=-1) # Se usa el modelo base para GridSearchCV

grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    cv=5, # Validación cruzada de 5 pliegues
    scoring='neg_mean_squared_error', # Optimizar para minimizar el MSE (negativo porque GridSearchCV maximiza)
    verbose=2, # Muestra detalles del progreso
    n_jobs=-1 # Usa todos los núcleos de CPU para paralelizar la búsqueda
)

print("--- Ejecutando GridSearchCV. Esto puede tomar un tiempo... ---")
grid_search.fit(X_train, y_train) # GridSearchCV se ajusta solo al conjunto de entrenamiento

print("\n--- Búsqueda de hiperparámetros completada ---")

print("\nMejores parámetros encontrados por GridSearchCV:")
print(grid_search.best_params_)

print(f"\nMejor MSE (Validación Cruzada): {-grid_search.best_score_:.2f}") # Se invierte el signo para ver el MSE real


# --- Evaluación del Mejor Modelo Encontrado por GridSearchCV ---
best_model = grid_search.best_estimator_ # Este es el modelo con los mejores hiperparámetros
print("\n--- Mejor modelo encontrado ---")
print(best_model)

print("\n--- Evaluando el mejor modelo en el conjunto de prueba ---")
y_pred_tuned = best_model.predict(X_test)

mae_tuned = mean_absolute_error(y_test, y_pred_tuned)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
rmse_tuned = np.sqrt(mse_tuned)
r2_tuned = r2_score(y_test, y_pred_tuned)

print("\n--- Resultados de la Evaluación del Mejor Modelo ---")
print(f"Error Absoluto Medio (MAE) ajustado: {mae_tuned:.2f}")
print(f"Error Cuadrático Medio (MSE) ajustado: {mse_tuned:.2f}")
print(f"Raíz del Error Cuadrático Medio (RMSE) ajustado: {rmse_tuned:.2f}")
print(f"Coeficiente de Determinación (R2 Score) ajustado: {r2_tuned:.2f}")


# ==============================================================================
# PARTE PARA GENERAR PREDICCIONES EN EL DATASET DE TEST DE KAGGLE Y CREAR EL CSV
# ==============================================================================

from google.colab import files # Para descargar el archivo, si es necesario

print("\n--- Preparando el dataset de test de Kaggle para la predicción ---")

# --- IMPORTANT: Make sure 'test.csv' is in your Google Drive ---
# Define the path to your test data
test_file_path = '/content/drive/MyDrive/Colab Notebooks/Housing Prices Kaggle Competition/test.csv'

# Load the test dataset
df_test = pd.read_csv(test_file_path)

print(f"\nOriginal Test DataFrame shape: {df_test.shape}")

# Store the IDs for submission later
test_ids = df_test['Id'] # GOOD: Store IDs here
df_test.drop(columns=['Id'], inplace=True, errors='ignore') # GOOD: Drop 'Id' from df_test early


# --- Aplicar los MISMOS pasos de preprocesamiento que el TRAIN data ---

# 1. Handle missing values for columns dropped (more than 80% NaNs)
# 'columns_to_drop' should be defined from your training data analysis
df_test.drop(columns=columns_to_drop, inplace=True, errors='ignore')
print(f"Test DataFrame after dropping high-missing columns: {df_test.shape}")


# 2. Impute missing numeric values with the MEAN (from TRAINING data)
numeric_cols_train_for_imputation = df.select_dtypes(include=['number']).columns
if 'SalePrice' in numeric_cols_train_for_imputation:
    numeric_cols_train_for_imputation = numeric_cols_train_for_imputation.drop('SalePrice')
# Asumiendo que 'Id' ya fue droppeado del 'df' de entrenamiento
if 'Id' in numeric_cols_train_for_imputation:
    numeric_cols_train_for_imputation = numeric_cols_train_for_imputation.drop('Id')

train_numeric_means = df[numeric_cols_train_for_imputation].mean()

numeric_cols_test = df_test.select_dtypes(include=['number']).columns
for col in numeric_cols_test:
    if df_test[col].isna().any():
        if col in train_numeric_means:
            df_test[col].fillna(train_numeric_means[col], inplace=True)
        else: # Fallback for columns not in training, should be rare if data similar
            df_test[col].fillna(df_test[col].mean(), inplace=True)

# Impute non-numeric NaNs with mode (from TRAINING data)
non_numeric_cols_train = df.select_dtypes(exclude=['number']).columns.tolist() # From original df (train)
for col in non_numeric_cols_train:
    if col in df_test.columns and df_test[col].isna().any():
        mode_value = df[col].mode()[0]
        df_test[col].fillna(mode_value, inplace=True)

print(f"Test DataFrame after imputation (NaNs sum): {df_test.isna().sum().sum()}")


# 3. One-Hot Encoding
categorical_cols_train = df.select_dtypes(include=['object', 'category']).columns.tolist()

df_test_encoded = pd.get_dummies(df_test, columns=categorical_cols_train, drop_first=True)

# Align columns - CRUCIAL for consistency between training and test sets
# 'X.columns' represents the final feature columns used for training the model (from df_scaled after drops)
train_features_cols = X.columns

df_test_aligned = df_test_encoded.reindex(columns=train_features_cols, fill_value=0)

print(f"Test DataFrame after One-Hot Encoding and column alignment: {df_test_aligned.shape}")


# 4. Scaling
# 'scaler' object fitted on training data. 'features_to_normalize' list from training data.
features_to_scale_in_test = [col for col in features_to_normalize if col in df_test_aligned.columns]

df_test_scaled = df_test_aligned.copy()
df_test_scaled[features_to_scale_in_test] = scaler.transform(df_test_aligned[features_to_scale_in_test])

print(f"Test DataFrame after scaling: {df_test_scaled.shape}")


# 5. Drop low correlation columns (consistent with training set)
df_test_scaled.drop(columns=columns_to_drop_low_corr, inplace=True, errors='ignore')

print(f"Test DataFrame after dropping low-correlation columns: {df_test_scaled.shape}")

# --- Asegurar que las columnas del test set final coincidan exactamente con X_train ---
# 'X_train.columns' representa el orden y el conjunto final de columnas que el modelo espera.
# 'df_test_final' se crea seleccionando y reordenando las columnas de 'df_test_scaled'
# para que coincidan con las de 'X_train'. Esto es la validación final antes de la predicción.
df_test_final = df_test_scaled[X_train.columns]

print(f"\nFinal Test DataFrame columns match training features: {list(df_test_final.columns) == list(X_train.columns)}")
print(f"Final Test DataFrame shape for prediction: {df_test_final.shape}")

# --- Generar predicciones en el dataset de test de Kaggle ---
kaggle_predictions = best_model.predict(df_test_final)

print(f"\nNúmero de predicciones generadas para Kaggle test set: {len(kaggle_predictions)}")


# --- Crear el archivo de Submission para Kaggle ---
submission_df = pd.DataFrame({
    'Id': test_ids, # IDs guardados del df_test original
    'SalePrice': kaggle_predictions # Predicciones de tu modelo
})

print("\n--- Final Kaggle Submission DataFrame (primeras 5 filas) ---")
print(submission_df.head())
print(f"Final Kaggle Submission DataFrame shape: {submission_df.shape}")

# Define la ruta de salida para el archivo de submission
submission_file_path = '/content/drive/MyDrive/Colab Notebooks/Housing Prices Kaggle Competition/submission.csv'

# Guarda el DataFrame en un archivo CSV sin el índice
submission_df.to_csv(submission_file_path, index=False)

print(f"\n--- Archivo de submission guardado exitosamente en: {submission_file_path} ---")

# Para descargar el archivo directamente a tu máquina local desde Colab (opcional)
# files.download(submission_file_path)

print("\n--- Script completado. ¡Puedes subir 'submission.csv' a Kaggle! ---")


--- Iniciando entrenamiento y evaluación del modelo ---


NameError: name 'X' is not defined