In [1]:
# Install required packages if not already installed
!pip install lime shap -q

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import lime
import lime.lime_tabular
import shap

# Function to load and preprocess the data
# Returns preprocessed X_train, X_test, y_train, y_test, preprocessor, feature_names, and numerical df for correlation
def load_and_preprocess_data(file_path='listings.csv'):
    """Load the listings CSV and build train/test matrices for price regression.

    Rows whose ``price`` cannot be parsed as a number are dropped. Missing
    numerical feature values are filled with 0 and missing categorical values
    with the string ``'Unknown'``. The data is split 80/20 (``random_state=42``)
    and the preprocessor (standard scaling + one-hot encoding) is fitted on the
    training split only.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A tuple ``(X_train_pre, X_test_pre, y_train, y_test, preprocessor,
        feature_names, df_num)`` where the ``*_pre`` arrays are the transformed
        feature matrices, ``preprocessor`` is the fitted ``ColumnTransformer``,
        ``feature_names`` are its output column names, and ``df_num`` holds the
        numerical features plus ``price`` for correlation analysis.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` if ``file_path``
            does not exist.
    """
    # Select features
    numerical_features = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews',
                          'reviews_per_month', 'calculated_host_listings_count',
                          'availability_365', 'number_of_reviews_ltm']
    categorical_features = ['neighbourhood_group', 'neighbourhood', 'room_type']

    # Load the dataset
    df = pd.read_csv(file_path)

    # Handle missing or non-numeric prices. Currency-formatted strings such as
    # "$1,234.00" would otherwise be coerced to NaN and silently dropped.
    price = df['price']
    if not pd.api.types.is_numeric_dtype(price):
        price = price.astype(str).str.replace(r'[$,]', '', regex=True)
    df['price'] = pd.to_numeric(price, errors='coerce')
    # .copy() so the column assignments below act on an independent frame.
    df = df.dropna(subset=['price']).copy()

    # Fill missing numerical values with 0
    df[numerical_features] = df[numerical_features].fillna(0)

    # Fill missing categorical values with a string placeholder and force a
    # uniform str dtype: filling with the integer 0 would mix ints and strings
    # in one column, which OneHotEncoder rejects.
    for col in categorical_features:
        column = df[col]
        df[col] = column.astype(object).where(column.notna(), 'Unknown').astype(str)

    # Data for correlation (numerical + target)
    df_num = df[numerical_features + ['price']]

    # Features and target
    X = df[numerical_features + categorical_features]
    y = df['price']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
        ])

    # Fit on the training split only, then apply the same transform to test
    X_train_pre = preprocessor.fit_transform(X_train)
    X_test_pre = preprocessor.transform(X_test)

    # Get feature names after preprocessing
    feature_names = preprocessor.get_feature_names_out()

    return X_train_pre, X_test_pre, y_train, y_test, preprocessor, feature_names, df_num

# Function for A1: Compute correlation matrix
# Returns the correlation matrix
def compute_correlation(df_num):
    """Return the pairwise correlation matrix of the columns in ``df_num``.

    Uses ``DataFrame.corr`` with its default settings.
    """
    return df_num.corr()

# Function for A2/A3: Perform PCA with specified variance threshold
# Returns reduced X_train, reduced X_test, number of components, cumulative explained variance
def perform_pca(X_train_pre, X_test_pre, variance_threshold=0.99):
    """Reduce the data to the fewest principal components reaching a variance target.

    A full PCA is fitted on the training matrix; the smallest number of leading
    components whose cumulative explained-variance ratio is at least
    ``variance_threshold`` is kept.

    Args:
        X_train_pre: Preprocessed training feature matrix (PCA is fitted on it).
        X_test_pre: Preprocessed test feature matrix (only transformed).
        variance_threshold: Target cumulative explained-variance ratio.

    Returns:
        A tuple ``(X_train_reduced, X_test_reduced, n_comp, cum_var)`` where
        ``n_comp`` is the number of components kept and ``cum_var`` is the
        cumulative explained-variance ratio over all components.
    """
    pca = PCA()
    pca.fit(X_train_pre)
    cum_var = np.cumsum(pca.explained_variance_ratio_)

    # Index of the first cumulative value >= threshold. searchsorted returns
    # len(cum_var) when no value reaches the threshold (e.g. threshold 1.0 with
    # float rounding), so clamp to keep every component instead of failing.
    first_hit = int(np.searchsorted(cum_var, variance_threshold, side='left'))
    n_comp = min(first_hit + 1, cum_var.shape[0])

    # The leading columns of the full projection are the projection onto the
    # first n_comp components, so a second fit is not needed.
    X_train_reduced = pca.transform(X_train_pre)[:, :n_comp]
    X_test_reduced = pca.transform(X_test_pre)[:, :n_comp]
    return X_train_reduced, X_test_reduced, n_comp, cum_var

# Function to run regression models and evaluate
# Returns a tuple: (dictionary of model results with MSE and R2, the fitted RandomForestRegressor)
def run_regression_models(X_train, y_train, X_test, y_test):
    """Fit a linear regression and a random forest, then score both on the test set.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target used for scoring.

    Returns:
        A tuple ``(results, rf_model)``: ``results`` maps each model name to a
        dict with keys ``'MSE'`` and ``'R2'``; ``rf_model`` is the fitted
        ``RandomForestRegressor`` (returned so it can be used for explanations).
    """
    forest = RandomForestRegressor(random_state=42, n_jobs=-1)
    candidates = (
        ('LinearRegression', LinearRegression()),
        ('RandomForestRegressor', forest),
    )

    results = {}
    for label, estimator in candidates:
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        results[label] = {
            'MSE': mean_squared_error(y_test, predictions),
            'R2': r2_score(y_test, predictions),
        }
    return results, forest

# Function for A4: Perform sequential feature selection
# Returns reduced X_train, reduced X_test, selected feature mask
def perform_sequential_feature_selection(X_train_pre, y_train, X_test_pre, direction='forward'):
    """Select a feature subset with sequential feature selection.

    The selector wraps a ``LinearRegression`` (chosen over the random forest to
    keep the search fast), scores candidates by R2, and is fitted on the
    training data only.

    Args:
        X_train_pre: Preprocessed training feature matrix.
        y_train: Training target.
        X_test_pre: Preprocessed test feature matrix.
        direction: Search direction passed to ``SequentialFeatureSelector``.

    Returns:
        A tuple ``(X_train_sfs, X_test_sfs, support_mask)`` with both matrices
        restricted to the selected columns and the selector's ``support_`` mask.
    """
    selector = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select='auto',
        direction=direction,
        scoring='r2',
        n_jobs=-1,
    )
    selector.fit(X_train_pre, y_train)
    reduced_train = selector.transform(X_train_pre)
    reduced_test = selector.transform(X_test_pre)
    return reduced_train, reduced_test, selector.support_

# Function to get LIME explainer and explanation for a specific instance
# Returns the LIME explanation object
def get_lime_explanation(model, X_train_pre, X_test_pre, feature_names, instance_idx=0):
    """Build a LIME tabular explainer and explain one test-set prediction.

    Args:
        model: Fitted regressor; its ``predict`` method is handed to LIME.
        X_train_pre: Preprocessed training matrix used to set up the explainer.
        X_test_pre: Preprocessed test matrix the instance is taken from.
        feature_names: Names of the columns of the preprocessed matrices.
        instance_idx: Row index in ``X_test_pre`` of the instance to explain.

    Returns:
        The explanation object produced by ``explain_instance``.
    """
    tabular_explainer = lime.lime_tabular.LimeTabularExplainer(
        X_train_pre, feature_names=feature_names, mode="regression"
    )
    instance = X_test_pre[instance_idx]
    return tabular_explainer.explain_instance(instance, model.predict)

# Function to get SHAP values
# Returns SHAP values for the test set
def get_shap_values(model, X_test_pre):
    """Compute SHAP values for the test set with a ``shap.TreeExplainer``.

    Args:
        model: Fitted tree-based model to explain.
        X_test_pre: Preprocessed test feature matrix.

    Returns:
        The SHAP values returned by ``TreeExplainer.shap_values``.
    """
    return shap.TreeExplainer(model).shap_values(X_test_pre)

# Main program
if __name__ == "__main__":
    def _print_metrics(results):
        # Print one "<model> - MSE, R2" line per fitted model.
        for name, metrics in results.items():
            print(f"{name} - MSE: {metrics['MSE']:.2f}, R2: {metrics['R2']:.2f}")

    # Load and preprocess data
    X_train_pre, X_test_pre, y_train, y_test, preprocessor, feature_names, df_num = load_and_preprocess_data()

    # A1: Feature correlation analysis
    corr_matrix = compute_correlation(df_num)
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Feature Correlation Heatmap")
    plt.show()

    # Baseline model performance (without reduction).
    # Keep the fitted RandomForest: it is reused for the A5 explanations, so
    # the models do not have to be trained a second time on the same data.
    baseline_results, rf_model = run_regression_models(X_train_pre, y_train, X_test_pre, y_test)
    print("Baseline Model Performance (No Reduction):")
    _print_metrics(baseline_results)

    # A2: PCA with 99% variance
    X_train_pca99, X_test_pca99, n_comp99, cum_var99 = perform_pca(X_train_pre, X_test_pre, 0.99)
    print(f"\nA2: PCA with 99% variance retained {n_comp99} components.")
    pca99_results, _ = run_regression_models(X_train_pca99, y_train, X_test_pca99, y_test)
    print("A2 Model Performance (PCA 99%):")
    _print_metrics(pca99_results)

    # A3: PCA with 95% variance
    X_train_pca95, X_test_pca95, n_comp95, cum_var95 = perform_pca(X_train_pre, X_test_pre, 0.95)
    print(f"\nA3: PCA with 95% variance retained {n_comp95} components.")
    pca95_results, _ = run_regression_models(X_train_pca95, y_train, X_test_pca95, y_test)
    print("A3 Model Performance (PCA 95%):")
    _print_metrics(pca95_results)

    # A4: Sequential Feature Selection
    X_train_sfs, X_test_sfs, sfs_mask = perform_sequential_feature_selection(X_train_pre, y_train, X_test_pre)
    num_selected = int(sfs_mask.sum())  # count of True entries in the boolean mask
    print(f"\nA4: Sequential Feature Selection retained {num_selected} features.")
    sfs_results, _ = run_regression_models(X_train_sfs, y_train, X_test_sfs, y_test)
    print("A4 Model Performance (SFS):")
    _print_metrics(sfs_results)

    # Comparison
    print("\nComparison of Results:")
    print("Baseline vs PCA99 vs PCA95 vs SFS")
    comparisons = [
        ('Baseline', baseline_results),
        ('PCA99', pca99_results),
        ('PCA95', pca95_results),
        ('SFS', sfs_results),
    ]
    for model in baseline_results:
        print(f"{model}:")
        for label, results in comparisons:
            print(f"  {label} - MSE: {results[model]['MSE']:.2f}, R2: {results[model]['R2']:.2f}")

    # A5: LIME and SHAP explanations (using the RandomForest fitted in the baseline run)

    # LIME
    print("\nA5: LIME Explanation for first test instance")
    lime_exp = get_lime_explanation(rf_model, X_train_pre, X_test_pre, feature_names, instance_idx=0)
    lime_exp.show_in_notebook(show_table=True)

    # SHAP
    print("A5: SHAP Summary Plot")
    shap_values = get_shap_values(rf_model, X_test_pre)
    shap.summary_plot(shap_values, X_test_pre, feature_names=feature_names, show=True)

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 275.7/275.7 kB 4.7 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
  Building wheel for lime (setup.py) ... done


FileNotFoundError: [Errno 2] No such file or directory: 'listings.csv'