# In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import shap


# Read the user-risk CSV and derive an integer-coded version of the tier label.
df = pd.read_csv('synthetic_user_risk_data.csv')
df['risk_tier_code'] = df['risk_tier'].map({'Low': 0, 'Medium': 1, 'High': 2})

# Target is the integer tier code; features are every remaining column except
# the identifier and the two label columns.
y = df['risk_tier_code']
X = df.drop(columns=['user_id', 'risk_tier', 'risk_tier_code'])

# -------------------------------
# 3. Visualize Feature Distributions
# -------------------------------

# Histogram (50 bins) of the `avg_txn_amount` column with a KDE curve overlaid.
plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='avg_txn_amount', bins=50, kde=True).set_title(
    "Distribution of Average Transaction Amount"
)
plt.show()

# Bar chart of how many rows fall into each risk tier, shown in Low -> High order.
plt.figure(figsize=(10, 5))
sns.countplot(
    x='risk_tier',
    data=df,
    palette="viridis",
    order=['Low', 'Medium', 'High'],
).set_title("User Count per Risk Tier")
plt.show()

# Correlation heatmap
# `df` still contains the string-valued `risk_tier` column at this point.
# pandas >= 2.0 no longer drops non-numeric columns inside `.corr()` and raises
# instead, so keep only numeric/boolean columns before correlating. This matches
# what older pandas versions did implicitly.
numeric_features = df.drop(['user_id'], axis=1).select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(10, 8))
sns.heatmap(numeric_features.corr(), annot=True, cmap='coolwarm')
plt.title("Feature Correlation Heatmap")
plt.show()

# -------------------------------
# 4. Feature Engineering
# -------------------------------

# Encode the tier label as an integer code (Low=0, Medium=1, High=2).
df['risk_tier_code'] = df['risk_tier'].map({'Low': 0, 'Medium': 1, 'High': 2})

# `.map` silently yields NaN for any label missing from the mapping (including
# missing labels). Fail here with a clear message instead of letting NaN
# targets surface later as an obscure error during model fitting.
unmapped_labels = df.loc[df['risk_tier_code'].isna(), 'risk_tier']
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected or missing risk_tier values: "
        f"{unmapped_labels.unique().tolist()}"
    )

# Features: everything except the identifier and the two label columns.
X = df.drop(['user_id', 'risk_tier', 'risk_tier_code'], axis=1)
# Target: the integer tier code.
y = df['risk_tier_code']





# Train-test split (80/20, fixed seed for reproducibility).
# Stratify on the target so each risk tier keeps the same proportion in the
# train and test sets; a plain random split can under-represent a rare tier in
# the held-out data and distort the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train model: random forest with 100 trees and a fixed seed.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluation
# Integer codes and display names for the tiers. These must stay in sync with
# the risk_tier -> risk_tier_code mapping used to build `y`.
tier_codes = [0, 1, 2]
tier_names = ['Low', 'Medium', 'High']

y_pred = clf.predict(X_test)
print("Classification Report:")
# Pin `labels` so every tier gets a row even if it is absent from this split,
# and show tier names rather than bare integer codes.
print(classification_report(y_test, y_pred, labels=tier_codes, target_names=tier_names))

# Confusion Matrix
# Pinning `labels` keeps the matrix 3x3 with a fixed row/column order; the tick
# labels make the axes readable. Rows are true tiers, columns are predictions.
plt.figure()
sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=tier_codes),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tier_names,
    yticklabels=tier_names,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# SHAP analysis
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test)

# For a multiclass model, older SHAP releases return a list with one
# (n_samples, n_features) array per class, while newer releases return a single
# (n_samples, n_features, n_classes) array. `summary_plot` interprets a bare 3-D
# array as interaction values, so split it into the per-class list form that it
# renders as a multiclass bar summary.
# NOTE(review): confirm against the installed SHAP version.
if getattr(shap_values, 'ndim', 0) == 3:
    shap_values_per_class = [
        shap_values[:, :, class_idx] for class_idx in range(shap_values.shape[2])
    ]
else:
    shap_values_per_class = shap_values

# Show tier names in the legend instead of the default "Class N" labels; any
# code outside the known mapping falls back to its string form.
tier_name_by_code = {0: 'Low', 1: 'Medium', 2: 'High'}
shap_class_names = [tier_name_by_code.get(code, str(code)) for code in clf.classes_]

shap.summary_plot(shap_values_per_class, X_test, class_names=shap_class_names)
