In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

# Load the dataset
file_path = '/mnt/data/HousingData.csv'
data = pd.read_csv(file_path)

# Preprocessing
# Handle missing values: fill each numeric column with its own mean.
# numeric_only=True restricts the means to numeric columns; without it,
# pandas >= 2.0 raises TypeError as soon as the frame has a non-numeric column.
# NOTE: the means are computed over the whole dataset, i.e. before the
# train/test split further down.
data.fillna(data.mean(numeric_only=True), inplace=True)

# Separate features and target
# Use the "target" column when it exists, otherwise default to the last column
# (replace "target" with the actual target column name if known).
# The same column is removed from X in both cases, so the target can never
# remain among the features.
target_column = "target" if "target" in data.columns else data.columns[-1]
X = data.drop(columns=[target_column])
y = data[target_column]

# Identify numerical and categorical columns
# "number" matches every numeric dtype rather than only int64/float64, so
# columns stored as e.g. int32 or float32 are scaled too. Any column that ends
# up in neither list is not passed on by the ColumnTransformer below
# (its default is to drop unlisted columns).
numerical_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Preprocessing pipelines
# Numeric columns are standardized; categorical columns are one-hot encoded,
# and categories unseen during fit are ignored at transform time instead of
# raising.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Split into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base models
# These instances act as unfitted templates; anything that trains a model
# should work on a clone so the templates stay reusable.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
}

# Train and evaluate models
# results maps model name -> {"r2_score", "mean_squared_error", "model"},
# where "model" is the fitted preprocessing + estimator pipeline.
results = {}
for name, model in models.items():
    # Pipeline.fit fits its steps in place, so each pipeline gets its own
    # copies of the preprocessor and estimator. Otherwise every stored
    # pipeline would share state with `models` and be silently overwritten
    # whenever one of those estimators is fitted again later in the script.
    pipeline = Pipeline(
        steps=[
            ("preprocessor", clone(preprocessor)),
            ("model", clone(model)),
        ]
    )
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results[name] = {
        "r2_score": r2_score(y_test, y_pred),
        "mean_squared_error": mean_squared_error(y_test, y_pred),
        "model": pipeline,
    }

# Feature selection methods
# Keep at most 5 features, capped at the number of available columns so the
# selectors are never asked for more features than exist.
n_selected = min(5, X_train.shape[1])
feature_selection_methods = {
    "SelectKBest (f_regression)": SelectKBest(score_func=f_regression, k=n_selected),
    "SelectKBest (mutual_info)": SelectKBest(score_func=mutual_info_regression, k=n_selected),
    "Random Forest Importance": RandomForestRegressor(random_state=42),
}

# Apply feature selection and evaluate models
# selected_features_results maps (method name, model name) ->
# {"r2_score", "mean_squared_error"}.
selected_features_results = {}
for method_name, selector in feature_selection_methods.items():
    # Fit the selector on the training rows only: choosing features with the
    # held-out rows in view would leak test information into the evaluation.
    selector.fit(X_train, y_train)
    if isinstance(selector, SelectKBest):
        # Boolean mask over the input columns marking the k retained features.
        selected_columns = X_train.columns[selector.get_support()]
    else:  # Random Forest-based selection
        # argsort is ascending, so the tail holds the most important features.
        top_features = np.argsort(selector.feature_importances_)[-n_selected:]
        selected_columns = X_train.columns[top_features]

    # Reuse the existing train/test split, restricted to the chosen columns.
    X_train_sel = X_train[selected_columns]
    X_test_sel = X_test[selected_columns]

    for model_name, model in models.items():
        # Fit a fresh copy so the estimators held in `models` (and any
        # pipeline built around them) are not re-fitted in place.
        fitted_model = clone(model).fit(X_train_sel, y_train)
        y_pred_sel = fitted_model.predict(X_test_sel)
        selected_features_results[(method_name, model_name)] = {
            "r2_score": r2_score(y_test, y_pred_sel),
            "mean_squared_error": mean_squared_error(y_test, y_pred_sel),
        }

# Compare results
# Baseline pipelines first: one line per model with R2 and MSE to 4 decimals.
print("Base Model Results:")
for model_name in results:
    metrics = results[model_name]
    print(f"{model_name}: R2={metrics['r2_score']:.4f}, MSE={metrics['mean_squared_error']:.4f}")

# Then every (selection method, model) combination, in insertion order.
print("\nFeature Selection Results:")
for combo in selected_features_results:
    method, model_name = combo
    metrics = selected_features_results[combo]
    print(f"{method} + {model_name}: R2={metrics['r2_score']:.4f}, MSE={metrics['mean_squared_error']:.4f}")

# Visualize results
# Two side-by-side bar charts of the base-model metrics: R2 on the left,
# MSE on the right. Each panel is described by
# (metric key in `results`, title, y-axis label, bar color).
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
panel_specs = (
    ("r2_score", "R2 Scores (Base Models)", "R2 Score", "blue"),
    ("mean_squared_error", "Mean Squared Error (Base Models)", "MSE", "orange"),
)
for axis, (metric_key, title, y_label, bar_color) in zip(ax, panel_specs):
    heights = [metrics[metric_key] for metrics in results.values()]
    axis.bar(results.keys(), heights, color=bar_color)
    axis.set_title(title)
    axis.set_ylabel(y_label)
    # Pin one tick per model before relabelling so labels line up with bars.
    axis.set_xticks(range(len(results)))
    axis.set_xticklabels(results.keys(), rotation=45)

plt.tight_layout()
plt.show()
