# Download Dataset

In [None]:
!gdown --id 1C5Z4X6T-6j-OIUyifd0WEal_kSTLheGF

In [None]:
# Install the two explanation libraries imported by the cells below
# (`shap` and `lime`).
!pip install shap lime

# Load Dataset, Feature Engineering, Model Training, and SHAP Summary Plots

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import shap

# Load the data
data = pd.read_csv('/content/USA_Housing.csv')

# Separate features and target. 'Price' is the target; 'Address' is dropped
# because it is assumed not to be a numeric feature.
X = data.drop(columns=['Price', 'Address'])
y = data['Price']

# Split BEFORE scaling so that the held-out rows cannot influence the
# scaler's mean/variance. test_size and random_state are unchanged, so the
# same rows end up in each partition as when the scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transform to
# every partition. The outputs are NumPy arrays (not DataFrames); later cells
# index X_test positionally, e.g. X_test[0, :].
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that `X_scaled` is still defined for any cell that reads it.
X_scaled = scaler.transform(X)

# Initialize the XGBRegressor with specified parameters
xgb_regressor = XGBRegressor(
    max_depth=3,
    learning_rate=0.01,
    subsample=0.5,
    n_estimators=1000,
    verbosity=0
)

# Fit the model
xgb_regressor.fit(X_train, y_train)

# Compute SHAP values: the training matrix is supplied as background data and
# the explainer is then evaluated on the test rows.
explainer = shap.Explainer(xgb_regressor, X_train)
shap_values = explainer(X_test)

# Visualizations. Column names are passed explicitly because X_test is a
# plain array and carries no feature names of its own.
shap.summary_plot(shap_values, X_test, plot_type="bar", feature_names=X.columns)  # Summary bar plot
shap.summary_plot(shap_values, X_test, plot_type="dot", feature_names=X.columns)  # Summary bee-swarm plot



In [None]:
# Print pandas' summary of the loaded DataFrame `data` (DataFrame.info()).
data.info()

In [None]:
import numpy as np
# Calculate the mean of the absolute SHAP values for each feature and pick
# the feature with the largest mean as the "most important" one.
mean_shap_values = np.abs(shap_values.values).mean(axis=0)
top_feature_index = np.argmax(mean_shap_values)
top_feature = X.columns[top_feature_index]

# The positional indexing below (`[0, :]`) requires a 2-D NumPy array, which
# is what the scaling/splitting cell produces. np.asarray leaves an ndarray
# untouched and converts a DataFrame to its values, so the indexing also
# works if X_test has been rebound to a DataFrame by another cell.
X_test_values = np.asarray(X_test)

# Dependence plot for the most important feature
shap.dependence_plot(top_feature_index, shap_values.values, X_test_values, feature_names=X.columns)

shap.initjs()
# Force plot for a single prediction (the first test row). This must stay the
# last expression of the cell so the notebook renders the returned plot.
shap.force_plot(explainer.expected_value, shap_values.values[0,:], X_test_values[0,:], feature_names=X.columns)




# Visualize many predictions at once (stacked force plot over the first 500 test rows)

In [None]:
# Build a TreeExplainer for the fitted XGBoost regressor, passing the
# training matrix as its background data ...
e3 = shap.TreeExplainer(xgb_regressor, data=X_train)

# ... and compute the SHAP values of the test rows with it.
t = e3.shap_values(X_test)


In [None]:
shap.initjs()
# Stacked force plot for the first 500 test rows. The baseline
# (`e3.expected_value`) and the attributions (`t`) are both taken from the
# same explainer `e3`; `shap_values` was produced by a different explainer
# object, so pairing it with e3's baseline could mix two different baselines.
shap.force_plot(e3.expected_value, t[0:500,:], X_test[0:500,:], feature_names=X.columns)

# LIME

In [None]:
import lime
import lime.lime_tabular
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# NOTE: this cell rebinds `data`, `X`, `y`, `scaler`, `X_scaled`, `X_train`,
# `X_test`, `y_train`, `y_test` and `explainer`, which the SHAP cells above
# also define. Re-run the SHAP training cell before re-running any SHAP
# plotting cell after this one.

# Load the data
data = pd.read_csv('/content/USA_Housing.csv')

# Separate features and target
raw_features = data.drop(columns=['Price', 'Address'])
y = data['Price']

# Split BEFORE scaling so the held-out rows cannot influence the scaler's
# statistics. test_size and random_state are unchanged, so the same rows end
# up in each partition as when the scaled frame was split.
raw_train, raw_test, y_train, y_test = train_test_split(
    raw_features, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only and wrap the scaled arrays back
# into DataFrames (same columns, same row index as the unscaled partitions).
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(raw_train),
    columns=raw_features.columns,
    index=raw_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(raw_test),
    columns=raw_features.columns,
    index=raw_test.index,
)

# Kept so `X_scaled` and `X` (the fully scaled feature frame) stay defined.
X_scaled = scaler.transform(raw_features)
X = pd.DataFrame(X_scaled, columns=raw_features.columns)

# Train the model
model = XGBRegressor(
    max_depth=1,
    learning_rate=0.01,
    subsample=0.5,
    n_estimators=1000,
    verbosity=0
)
model.fit(X_train, y_train)

lime_feature_names = list(X_train.columns)


def predict_scaled(rows):
    """Predict prices for a 2-D array of already-scaled feature rows.

    LIME hands the prediction function plain NumPy arrays, while `model` was
    fitted on a DataFrame with named columns. Re-attaching the column names
    keeps the input in the same form the model was trained on.
    """
    return model.predict(pd.DataFrame(rows, columns=lime_feature_names))


# Initialize LIME explainer. A fixed random_state makes the sampled
# neighbourhood, and therefore the explanation, reproducible across runs.
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train.to_numpy(),
    feature_names=lime_feature_names,
    class_names=['Price'],
    mode='regression',
    random_state=42
)

# Choose an instance to explain
instance_index = 10  # Change based on the specific prediction you want to explain
instance = X_test.iloc[instance_index].to_numpy()

# Generate explanation (limited to the 5 most influential features)
explanation = explainer.explain_instance(
    data_row=instance,
    predict_fn=predict_scaled,
    num_features=5
)

# Show the explanation
explanation.show_in_notebook(show_table=True)
