In [150]:
import pandas as pd
import shap
import matplotlib.pyplot as plt
import joblib

In [151]:
# Load dataset and model
df = pd.read_csv("/Users/ethanshiu/Documents/GitHub/25-spring-ESHIU/Code/ui_data.csv")  # this includes 'success' column if needed
model = joblib.load("/Users/ethanshiu/Documents/GitHub/25-spring-ESHIU/Code/rf_shap_tuned.pkl")
X = df.drop(columns=["success", "year"])
print(X.sample(5))

     div_class  log_rel_part  violent_flank  camp_backlash     total_part  \
17         1.0      0.002760            2.0            3.0   20831.000000   
146        0.0      0.000571            0.0            3.0    4850.166667   
369        0.0      0.000036            0.0            3.0     625.000000   
161        1.0      0.001911            0.0            3.0   10632.142857   
4          0.0      0.015696            2.0            3.0  171667.000000   

     wdrwl_support  camp_duration  fatalities_range  indiscrim  sec_defect  \
17             0.0              2               4.0        0.0         1.0   
146            0.0              6               4.0        1.0         0.0   
369            0.0              2               4.0        1.0         0.0   
161            0.0             14               5.0        1.0         1.0   
4              0.0              3               2.0        1.0         0.0   

     state_defect  div_ethnicity  sdirect  ab_internat  camp_support

In [152]:
# Load SHAP values (optional, only if precomputed)
shap_values = joblib.load("/Users/ethanshiu/Documents/GitHub/25-spring-ESHIU/Code/shap_values_class1.pkl")
print(shap_values)

[[-0.04917715  0.04917715]
 [-0.06071993  0.06071993]
 [ 0.1041814  -0.1041814 ]
 [-0.02679149  0.02679149]
 [ 0.05689985 -0.05689985]
 [ 0.04102581 -0.04102581]
 [ 0.00963247 -0.00963247]
 [ 0.00376781 -0.00376781]
 [ 0.02459498 -0.02459498]
 [ 0.0337152  -0.0337152 ]
 [-0.00892117  0.00892117]
 [ 0.01980314 -0.01980314]
 [ 0.00971945 -0.00971945]
 [-0.00491607  0.00491607]
 [ 0.0202414  -0.0202414 ]
 [ 0.00291516 -0.00291516]
 [-0.0051826   0.0051826 ]
 [ 0.0012739  -0.0012739 ]
 [ 0.01653507 -0.01653507]
 [-0.00301893  0.00301893]]


In [153]:
# Function to predict probability
def predict_success_probability(user_input_dict):
    input_df = pd.DataFrame([user_input_dict], columns=X.columns)
    prob = model.predict_proba(input_df)[0, 1]
    return float(prob)

In [154]:
# Function to plot SHAP dependence
def shap_dependence_plot(main_feature, interaction_feature=None):
    # Ensure we're only using the features the model saw
    X_features_only = X.drop(columns=["year", "Success"])

    # Create TreeExplainer and get SHAP values
    explainer = shap.TreeExplainer(rf_shap_tuned)
    shap_values = explainer.shap_values(X_features_only)

    # Get SHAP values for class 1 (usually success = 1)
    shap_values_class1 = shap_values[1]

    # Plot dependence
    shap.dependence_plot(
        main_feature,
        shap_values_class1,
        X_features_only,
        interaction_index=interaction_feature,
        show=False
    )
    plt.tight_layout()
    plt.show()


In [155]:
sample_input = {
    'wdrwl_support': 1.250656e+00,
    'div_class': 1.034801e+00,
    'sec_defect': 7.589216e-01,
    'sdirect': 5.781821e-01,
    'div_ethnicity': 5.513224e-01,
    'camp_backlash': 4.608532e-01,
    'state_defect': 3.873066e-01,
    'camp_support': 2.261295e-01,
    'violent_flank': 2.094405e-01,
    'camp_goals': 1.306134e-01,
    'ab_internat': 6.398306e-02,
    'log_rel_part': 7.229054e-03,
    'media_outreach': 1.450923e-03,
    'total_part': 2.062547e-08,
    'camp_orgs': -3.541489e-02,
    'dom_media': -6.930089e-02,
    'camp_duration': -9.874490e-02,
    'camp_structure': -2.694264e-01,
    'fatalities_range': -2.925996e-01,
    'indiscrim': -8.209170e-01,
}

# Create DataFrame and match columns to model
sample_df = pd.DataFrame([sample_input])
sample_df = sample_df[model.feature_names_in_]

# Predict
prob = model.predict_proba(sample_df)[0, 1]
print(f"Predicted Probability of Success: {prob:.2%}")


Predicted Probability of Success: 77.69%


In [156]:
predict_success_probability(sample_input)

0.7768809523809524

In [157]:
print("shap_values shape:", shap_values[1].shape)
print(shap_values)
print("X shape:", X.shape)


shap_values shape: (2,)
[[-0.04917715  0.04917715]
 [-0.06071993  0.06071993]
 [ 0.1041814  -0.1041814 ]
 [-0.02679149  0.02679149]
 [ 0.05689985 -0.05689985]
 [ 0.04102581 -0.04102581]
 [ 0.00963247 -0.00963247]
 [ 0.00376781 -0.00376781]
 [ 0.02459498 -0.02459498]
 [ 0.0337152  -0.0337152 ]
 [-0.00892117  0.00892117]
 [ 0.01980314 -0.01980314]
 [ 0.00971945 -0.00971945]
 [-0.00491607  0.00491607]
 [ 0.0202414  -0.0202414 ]
 [ 0.00291516 -0.00291516]
 [-0.0051826   0.0051826 ]
 [ 0.0012739  -0.0012739 ]
 [ 0.01653507 -0.01653507]
 [-0.00301893  0.00301893]]
X shape: (380, 20)


In [159]:
shap_dependence_plot("div_class")
shap_dependence_plot("div_class", interaction_feature="state_defect")


KeyError: "['year', 'Success'] not found in axis"