# Customer interest classification (SHAP, SMOTE, GridSearchCV)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.feature_selection import RFECV
from sklearn.multiclass import OneVsRestClassifier

import shap
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw dataset from the CSV in the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="ICM520_2024_2025_Report_PartA_DataSet_2000Rows.csv")
# Preview the first rows (rendered as the cell output when run in a notebook).
df.head()

In [None]:
# Integer-encode FEAT_1 and FEAT_15, each with its own freshly fitted encoder.
for encoded_col in ('FEAT_1', 'FEAT_15'):
    df[encoded_col] = LabelEncoder().fit_transform(df[encoded_col])

# Fill missing FEAT_9 values with the column median. The result is assigned
# back to the column instead of calling fillna(inplace=True) on the selected
# Series: the chained in-place form operates on a temporary object under
# pandas Copy-on-Write and would leave `df` unchanged.
# NOTE(review): the median is computed on the full dataset before the split,
# so test rows influence the imputed value -- consider deriving it from the
# training split only.
df['FEAT_9'] = df['FEAT_9'].fillna(df['FEAT_9'].median())

# Separate the predictors from the CATEGORY target.
X = df.drop('CATEGORY', axis=1)
y = df['CATEGORY']

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Oversample the training split with SMOTE (seeded for reproducibility);
# the held-out test split is not touched here.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X=X_train, y=y_train)

# Per-class counts after resampling (rendered as the cell output in a notebook).
pd.Series(data=y_train_res).value_counts()

In [None]:
# Tune a random forest with a cross-validated grid search.
#
# SMOTE is a pipeline step rather than a pre-processing pass over the whole
# training set: an imblearn Pipeline applies samplers only while fitting, so
# inside every CV fold the synthetic samples are generated from that fold's
# training portion alone and each validation fold contains only real rows.
# Searching on data that was resampled *before* cross-validation lets
# synthetic neighbours of validation rows leak into training and inflates
# the CV score.
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
])

# Hyper-parameters searched for the 'rf' step (2 * 3 * 2 * 2 = 24 candidates).
param_grid = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [10, 20, None],
    'rf__min_samples_split': [2, 5],
    'rf__max_features': ['sqrt', 'log2'],
}

# Macro-averaged recall weights every class equally regardless of its size.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='recall_macro',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
# Fit on the original (un-resampled) training split; resampling happens
# per fold inside the pipeline.
# NOTE(review): SMOTE now sees only a fold's training portion, so every class
# needs more samples there than SMOTE's k_neighbors -- confirm for rare classes.
grid_search.fit(X_train, y_train)

# Best pipeline found by the search; the forest is its 'rf' step.
best_rf = grid_search.best_estimator_
print("Best Params:", grid_search.best_params_)

In [None]:
# Score the tuned model on the held-out test split and print per-class metrics.
y_pred_rf = best_rf.predict(X_test)
report_text = classification_report(y_true=y_test, y_pred=y_pred_rf)
print("Classification Report:", report_text, sep="\n")

In [None]:
# SHAP explanation of the tuned random forest: bar summary plot for class 0.
rf_model = best_rf.named_steps['rf']
X_test_array = X_test.values
feature_names = X_test.columns.tolist()

explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test_array)

# Select the SHAP matrix for the first class. Depending on the installed SHAP
# version, a multi-class tree model yields either a list with one
# (n_samples, n_features) array per class, or a single array shaped
# (n_samples, n_features, n_classes). Indexing the latter with [0] would pick
# the first *sample* instead of the first class, so the two layouts are
# handled explicitly rather than patched up with a transpose.
if isinstance(shap_values, list):
    shap_vals_class0 = np.asarray(shap_values[0])
else:
    shap_values_arr = np.asarray(shap_values)
    if shap_values_arr.ndim == 3:
        shap_vals_class0 = shap_values_arr[:, :, 0]
    else:
        # Already a single (n_samples, n_features) matrix.
        shap_vals_class0 = shap_values_arr

# Check shapes
print("Expected shape:", X_test_array.shape)
print("SHAP shape (class 0):", shap_vals_class0.shape)

# Fail loudly on an unexpected layout instead of plotting misaligned values.
if shap_vals_class0.shape != X_test_array.shape:
    raise ValueError(
        f"SHAP values for class 0 have shape {shap_vals_class0.shape}, "
        f"expected {X_test_array.shape}"
    )

# Final plot
shap.summary_plot(
    shap_vals_class0,
    features=X_test_array,
    feature_names=feature_names,
    plot_type="bar",
)