# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset
file_path = 'HousingData.csv'  # Adjust the file path if needed
data = pd.read_csv(file_path)

# Separate features and target ('MEDV'). Rows with a missing target are
# dropped rather than mean-filled: a fabricated label would distort both the
# fitted models and the test-set metrics.
labelled = data.dropna(subset=['MEDV'])
features_raw = labelled.drop('MEDV', axis=1)
y = labelled['MEDV']

# Split BEFORE fitting any preprocessing so that test rows never influence the
# imputation means or the scaling statistics (avoids train/test leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features_raw, y, test_size=0.2, random_state=42
)

# Handle missing feature values by imputing with the training-set mean
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)

# Standardize the features using training-set statistics only
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_imputed),
    columns=features_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_imputed),
    columns=features_raw.columns,
    index=X_test_raw.index,
)

# Whole-dataset frames kept under their original names for later inspection.
# They reuse the train-fitted transformers, so they do not reintroduce leakage
# into the split above.
X_imputed_all = imputer.transform(features_raw)
X = pd.DataFrame(X_imputed_all, columns=features_raw.columns, index=features_raw.index)
data_imputed = X.assign(MEDV=y)[data.columns]
X_scaled = pd.DataFrame(scaler.transform(X_imputed_all), columns=X.columns, index=X.index)

# Baseline: both regressors trained on the full set of standardized features.

# Linear regression: fit, predict on the held-out rows, then score
lr_model = LinearRegression().fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
lr_mse, lr_r2 = (
    mean_squared_error(y_test, lr_predictions),
    r2_score(y_test, lr_predictions),
)

# Decision tree (seeded so the fitted tree is reproducible): same three steps
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)
dt_mse, dt_r2 = (
    mean_squared_error(y_test, dt_predictions),
    r2_score(y_test, dt_predictions),
)

# Report the baseline scores
print("Linear Regression - MSE:", lr_mse, "R2:", lr_r2)
print("Decision Tree - MSE:", dt_mse, "R2:", dt_r2)

# Apply Feature Selection Techniques

# Keep up to 10 features/components. Capping at the number of available
# columns prevents SelectKBest and PCA from raising on narrower datasets.
n_selected = min(10, X_train.shape[1])


def _seeded_mutual_info(X, y):
    """Score features by mutual information with a fixed random seed.

    mutual_info_regression is randomized; without a seed the selected feature
    subset can change between runs, unlike the rest of this script, which is
    seeded with 42.
    """
    return mutual_info_regression(X, y, random_state=42)


# 1. SelectKBest (f_regression)
kbest = SelectKBest(score_func=f_regression, k=n_selected)
X_kbest_train = kbest.fit_transform(X_train, y_train)
X_kbest_test = kbest.transform(X_test)

# 2. SelectKBest (mutual_info_regression)
mutual_info = SelectKBest(score_func=_seeded_mutual_info, k=n_selected)
X_mutual_info_train = mutual_info.fit_transform(X_train, y_train)
X_mutual_info_test = mutual_info.transform(X_test)

# 3. PCA (fitted on the training rows only, then applied to the test rows)
pca = PCA(n_components=n_selected)
X_pca_train = pca.fit_transform(X_train)
X_pca_test = pca.transform(X_test)

# Train and evaluate models after feature selection
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    The model is fitted in place, its test-set predictions are scored with
    mean squared error and R2, and one summary line labelled with ``name`` is
    printed (both scores rounded to two decimals).

    Returns:
        A ``(mse, r2)`` tuple holding the unrounded scores.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    scores = (mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred))
    print(f"{name} - MSE: {scores[0]:.2f}, R2: {scores[1]:.2f}")
    return scores

print("\nPerformance after Feature Selection:")

# (printed heading, results column, train matrix, test matrix) per technique
feature_sets = [
    ("SelectKBest (f_regression)", "KBest (f_regression) MSE",
     X_kbest_train, X_kbest_test),
    ("SelectKBest (mutual_info_regression)", "KBest (mutual_info_regression) MSE",
     X_mutual_info_train, X_mutual_info_test),
    ("PCA", "PCA MSE", X_pca_train, X_pca_test),
]

# Visualization data: one row per model, one MSE column per feature set
results = {
    "Model": ["Linear Regression", "Decision Tree"],
    "Original MSE": [lr_mse, dt_mse],
}

# Evaluate each model once per feature set and reuse the returned MSE for the
# chart, instead of fitting every model a second time just to rebuild it.
for heading, column, X_sel_train, X_sel_test in feature_sets:
    print(f"\nUsing {heading}:")
    lr_sel_mse, _ = evaluate_model(
        "Linear Regression", LinearRegression(),
        X_sel_train, X_sel_test, y_train, y_test,
    )
    dt_sel_mse, _ = evaluate_model(
        "Decision Tree", DecisionTreeRegressor(random_state=42),
        X_sel_train, X_sel_test, y_train, y_test,
    )
    results[column] = [lr_sel_mse, dt_sel_mse]

# Convert results to DataFrame for visualization
results_df = pd.DataFrame(results)
results_df.set_index("Model", inplace=True)

# Plot results: grouped bars, one group per model
results_df.plot(kind="bar", figsize=(10, 6))
plt.title("Comparison of Models and Feature Selection Techniques")
plt.ylabel("Mean Squared Error")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
