<a href="https://colab.research.google.com/github/2303A52268/Explainable-AI-2268/blob/main/EAI_Ass_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Explainable AI Assignment-02

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
import sklearn

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report
)

# Prefer LightGBM. Any exception raised while importing it switches the
# notebook to scikit-learn's RandomForestClassifier instead; USE_LGBM records
# which of the two is in use.
try:
    from lightgbm import LGBMClassifier
except Exception:
    from sklearn.ensemble import RandomForestClassifier
    USE_LGBM = False
else:
    USE_LGBM = True

In [None]:
# 1) Load Dataset
# -----------------------------
# Absolute path: the CSV has to exist at this location before the cell runs
# (presumably uploaded to Colab's /content directory -- adjust when running elsewhere).
CSV_PATH = "/content/fifa_players.csv"

# Read the file and discard exact duplicate rows in one step.
df = pd.read_csv(CSV_PATH).drop_duplicates()
print("Columns in dataset:", df.columns.tolist())

Columns in dataset: ['name', 'full_name', 'birth_date', 'age', 'height_cm', 'weight_kgs', 'positions', 'nationality', 'overall_rating', 'potential', 'value_euro', 'wage_euro', 'preferred_foot', 'international_reputation(1-5)', 'weak_foot(1-5)', 'skill_moves(1-5)', 'body_type', 'release_clause_euro', 'national_team', 'national_rating', 'national_team_position', 'national_jersey_number', 'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve', 'freekick_accuracy', 'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning', 'vision', 'penalties', 'composure', 'marking', 'standing_tackle', 'sliding_tackle']


In [None]:
# 2) Pick Target Column (flexible)
# -----------------------------
# Candidate target columns in priority order; the first one present in the
# data is used.
preferred_targets = [
    "overall_rating", "overall", "national_rating",
    "value_euro", "wage_euro", "potential"
]

target_col = next((name for name in preferred_targets if name in df.columns), None)

# None of the preferred names exist: fall back to the last numeric column.
if target_col is None:
    numeric_cols_all = df.select_dtypes(include=[np.number]).columns.tolist()
    assert numeric_cols_all, "No numeric columns found to use as a target."
    target_col = numeric_cols_all[-1]

print("Using target column:", target_col)

# Rows without a target value are dropped, then the target is binarised:
# values at or above the median become 1 (High), everything below becomes 0 (Low).
# NOTE(review): the median is taken over the whole data set, i.e. before the
# train/test split -- confirm that this is acceptable for the assignment.
df = df.dropna(subset=[target_col]).copy()
threshold = df[target_col].median()
df["Target"] = df[target_col].ge(threshold).astype(int)

# Labels, and the feature frame: every column except the label and the raw
# column the label was derived from.
y = df["Target"]
X = df.drop(columns=["Target", target_col])


Using target column: overall_rating


In [None]:
# 3) Train/Test Split
# -----------------------------
# Hold out 30% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# 4) Preprocessing
# -----------------------------
# Split the feature columns by dtype: numeric ones vs. everything else.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Only the encoder construction is version-sensitive: newer sklearn takes
# "sparse_output", older releases reject it with TypeError and expect "sparse".
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=True)

# Non-numeric columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", onehot_encoder),
])

# Numeric block first, categorical block second -- the transformed columns
# come out in this order.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [None]:
# 5) Model (fast defaults)
# -----------------------------
# Pick the estimator according to what could be imported at the top of the notebook.
if not USE_LGBM:
    # Fallback when LightGBM is unavailable: a depth-limited random forest.
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=12,
        n_jobs=-1,
        random_state=42,
    )
else:
    model = LGBMClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=-1,
        random_state=42,
        n_jobs=-1,
    )

# Chain preprocessing and estimator so both are fitted and applied together.
pipeline_steps = [("preprocessor", preprocessor), ("model", model)]
clf = Pipeline(steps=pipeline_steps)

In [None]:
# 6) Train
# -----------------------------
# Fits the whole pipeline (preprocessing steps, then the model) on the
# training split only; the test split is not touched here.
clf.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 6867, number of negative: 5700
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3473
[LightGBM] [Info] Number of data points in the train set: 12567, number of used features: 196
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.546431 -> initscore=0.186261
[LightGBM] [Info] Start training from score 0.186261


In [None]:
# 7) Evaluation
# -----------------------------
# Hard 0/1 predictions drive the threshold-based metrics
# (accuracy, precision, recall, F1, classification report).
y_pred = clf.predict(X_test)

# ROC AUC is a ranking metric and needs a continuous score per row. Feeding it
# the hard labels would collapse the ROC curve to a single operating point, so
# use the predicted probability of the positive class (column 1) instead.
y_score = clf.predict_proba(X_test)[:, 1]

print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_score))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Model Evaluation:
Accuracy: 0.9922034527566364
Precision: 0.9948840381991815
Recall: 0.990828804347826
F1-score: 0.9928522804629
ROC AUC: 0.992344406267241

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2443
           1       0.99      0.99      0.99      2944

    accuracy                           0.99      5387
   macro avg       0.99      0.99      0.99      5387
weighted avg       0.99      0.99      0.99      5387



In [None]:
# 8) Feature Names after Preprocessing
# -----------------------------
# Reuse the preprocessor that was fitted as part of the pipeline (no refit).
pre = clf.named_steps["preprocessor"]

# The fitted one-hot encoder only exists when there are categorical columns.
if len(categorical_features):
    ohe = pre.named_transformers_["cat"].named_steps["onehot"]
else:
    ohe = None

# Expanded one-hot column names; stays empty when there is no encoder or the
# encoder cannot report its output names.
cat_out = np.array([])
if ohe is not None and hasattr(ohe, "get_feature_names_out"):
    cat_out = ohe.get_feature_names_out(categorical_features)

# Same order as the ColumnTransformer: numeric columns first, then the one-hot columns.
feature_names_transformed = [*numeric_features, *cat_out]

In [None]:
# 9) Transform Test Set (for SHAP)
# -----------------------------
# Run the test split through the fitted preprocessor; the result is sparse
# or dense depending on what the ColumnTransformer produced.
X_test_trans = pre.transform(X_test)

# Keep at most the first 300 rows so the SHAP computation stays fast.
sample_size = min(300, X_test_trans.shape[0])

# SHAP plots need a dense array: densify a sparse slice, otherwise just make
# sure the slice is an ndarray.
X_sample = (
    X_test_trans[:sample_size].toarray()
    if hasattr(X_test_trans, "toarray")
    else np.asarray(X_test_trans[:sample_size])
)

In [None]:
# 10) SHAP Explainer (TreeExplainer)
# -----------------------------
# TreeExplainer is used for whichever tree model the pipeline ended up with
# (LightGBM, or the RandomForest fallback).
est = clf.named_steps["model"]
explainer = shap.TreeExplainer(est, feature_perturbation="tree_path_dependent")

shap_values = explainer.shap_values(X_sample)

# The return layout depends on the SHAP version and the model:
#   * list [class0, class1]               -> take the positive-class entry
#   * 3-D array (samples, features, cls)  -> slice out the positive class
#   * 2-D array (samples, features)       -> already the positive-class values
# Everything below expects a 2-D (samples, features) array.
if isinstance(shap_values, list):
    shap_vals_pos = np.asarray(shap_values[1] if len(shap_values) > 1 else shap_values[0])
else:
    shap_vals_pos = np.asarray(shap_values)
    if shap_vals_pos.ndim == 3:
        positive_class = 1 if shap_vals_pos.shape[2] > 1 else 0
        shap_vals_pos = shap_vals_pos[:, :, positive_class]

# Align shapes just in case the number of SHAP columns and the number of
# collected feature names differ: truncate both to the shorter length.
min_len = min(shap_vals_pos.shape[1], len(feature_names_transformed))
shap_vals_pos = shap_vals_pos[:, :min_len]
feature_names_aligned = feature_names_transformed[:min_len]
X_sample_aligned = X_sample[:, :min_len]

In [None]:
# 11) Save Plots (Summary Beeswarm + Bar)
# -----------------------------
# All figures go into ./outputs (created on first use).
outdir = Path("outputs")
outdir.mkdir(exist_ok=True)

# One entry per summary plot: extra summary_plot arguments, title, file name.
# The beeswarm plot uses summary_plot's default plot type.
summary_plots = (
    ({}, "SHAP Summary (Beeswarm) – Sampled", "shap_summary_beeswarm.png"),
    ({"plot_type": "bar"}, "SHAP Feature Importance (Bar) – Sampled", "shap_summary_bar.png"),
)

for plot_kwargs, plot_title, plot_file in summary_plots:
    plt.figure()
    # show=False keeps the figure open so it can be titled and saved below.
    shap.summary_plot(shap_vals_pos, X_sample_aligned,
                      feature_names=feature_names_aligned, show=False, **plot_kwargs)
    plt.title(plot_title)
    plt.tight_layout()
    plt.savefig(outdir / plot_file, bbox_inches="tight")
    plt.close()

In [None]:
# 12) Optional: Force & Waterfall for one instance (fast-safe)
# -----------------------------
# Use legacy waterfall to avoid Explanation API mismatches across SHAP versions
idx = 0

# expected_value may be a scalar, a list or a NumPy array (one entry per
# class). Flatten it and pick the positive-class entry when there is more than
# one, so both plots always receive a plain scalar base value.
expected_values = np.ravel(explainer.expected_value)
base_value = float(expected_values[1]) if expected_values.size > 1 else float(expected_values[0])

plt.figure()
shap.plots._waterfall.waterfall_legacy(
    base_value, shap_vals_pos[idx], feature_names=feature_names_aligned, show=False, max_display=20
)
plt.title("SHAP Waterfall – One Prediction")
plt.tight_layout()
plt.savefig(outdir / "shap_waterfall.png", bbox_inches="tight")
plt.close()

# No plt.figure() here: with matplotlib=True the force plot opens its own
# figure, so a figure created beforehand would be left behind empty and shown
# as "<Figure ... with 0 Axes>" in the cell output.
shap.force_plot(base_value, shap_vals_pos[idx, :], X_sample_aligned[idx, :],
                feature_names=feature_names_aligned, matplotlib=True, show=False)
plt.title("SHAP Force Plot – One Prediction")
plt.tight_layout()
plt.savefig(outdir / "shap_force.png", bbox_inches="tight")
plt.close()

<Figure size 640x480 with 0 Axes>

In [None]:
# 13) (Optional) Model Feature Importance Table
# -----------------------------
# Best-effort export of the model's own importances (top 20) next to the SHAP
# plots. Failures are reported but do not stop the notebook.
try:
    importances = est.feature_importances_
    fi = (pd.DataFrame({
        "feature": feature_names_aligned,
        # Truncate to the aligned feature names so both columns have equal length.
        "importance": importances[:len(feature_names_aligned)]
    })
    .sort_values("importance", ascending=False)
    .head(20))
    # Keep the file name relative: joining an absolute path onto outdir makes
    # pathlib discard outdir, and the table would land on (and overwrite)
    # whatever file lives at that absolute path.
    fi_path = outdir / "model_feature_importance_top20.csv"
    fi.to_csv(fi_path, index=False)
    print("\nTop features (model importance):")
    print(fi.head(10))
except Exception as exc:
    # Still optional, but say why the table was skipped instead of hiding it.
    print(f"\nSkipping model feature-importance table: {exc!r}")

print("\n✅ Done. Plots saved to ./outputs :")
print(" - shap_summary_beeswarm.png")
print(" - shap_summary_bar.png")
print(" - shap_waterfall.png")
print(" - shap_force.png")
print("And (optional) model_feature_importance_top20.csv")


Top features (model importance):
           feature  importance
4       value_euro        1112
3        potential         782
0              age         697
12        crossing         180
17       dribbling         170
40  sliding_tackle         169
33   interceptions         159
25       reactions         153
34     positioning         141
29         stamina         129

✅ Done. Plots saved to ./outputs :
 - shap_summary_beeswarm.png
 - shap_summary_bar.png
 - shap_waterfall.png
 - shap_force.png
And (optional) model_feature_importance_top20.csv
