In [2]:
pip install streamlit plotly

Note: you may need to restart the kernel to use updated packages.


In [4]:
import json
import logging
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from plotly.subplots import make_subplots

# Page configuration: browser-tab title and icon, wide layout, sidebar open.
PAGE_CONFIG = {
    "page_title": "Loan Default Prediction Dashboard",
    "page_icon": "💰",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**PAGE_CONFIG)

# Custom CSS: classes referenced by the raw-HTML fragments this script renders
# (e.g. `main-header` for the title, `prediction-box` plus `safe-loan` /
# `risky-loan` for the prediction result).
CUSTOM_CSS = """
    <style>
    .main-header {
        font-size: 3rem;
        font-weight: bold;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .metric-card {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        border-left: 4px solid #1f77b4;
    }
    .prediction-box {
        padding: 2rem;
        border-radius: 1rem;
        text-align: center;
        font-size: 1.5rem;
        font-weight: bold;
    }
    .safe-loan {
        background-color: #d4edda;
        color: #155724;
        border: 2px solid #c3e6cb;
    }
    .risky-loan {
        background-color: #f8d7da;
        color: #721c24;
        border: 2px solid #f5c6cb;
    }
    </style>
"""
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)

# Load model and data
@st.cache_resource
def load_model():
    """Load the trained loan-default model from disk.

    Returns:
        The object deserialised by ``joblib.load``, or ``None`` when loading
        fails for any reason (missing file, corrupt payload, missing
        dependency, ...). Callers treat ``None`` as "model unavailable".

    NOTE(review): because of ``@st.cache_resource`` a ``None`` result is
    presumably cached as well; the cache may need clearing once the model file
    has been put in place.
    """
    model_path = '../models/xgboost_loan_default_model.pkl'
    try:
        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code. Only point this at model artifacts from a trusted source.
        return joblib.load(model_path)
    except Exception:
        # Best-effort by design: keep the dashboard running, but record why the
        # load failed. Catching `Exception` (not a bare `except:`) lets
        # KeyboardInterrupt / SystemExit propagate.
        logging.getLogger(__name__).exception(
            "Could not load model from %s", model_path
        )
        return None

@st.cache_data
def load_performance_data():
    """Load the model performance summary written by the training pipeline.

    Returns:
        The parsed JSON content (the pages index it like a dict of metric
        names to values), or ``None`` if the file is missing, unreadable or
        not valid JSON.
    """
    summary_path = '../reports/model_performance_summary.json'
    try:
        # Explicit UTF-8 so parsing does not depend on the platform default.
        with open(summary_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: covers
        # json.JSONDecodeError and UnicodeDecodeError (both subclasses).
        logging.getLogger(__name__).exception(
            "Could not load performance summary from %s", summary_path
        )
        return None

@st.cache_data
def load_sample_data():
    """Load the first 1000 rows of the feature-engineered loans dataset.

    Returns:
        A ``pandas.DataFrame`` with at most 1000 rows, or ``None`` if the CSV
        is missing, unreadable or cannot be parsed.
    """
    sample_path = '../data/processed/loans_feature_engineered.csv'
    sample_rows = 1000  # small sample keeps the dashboard responsive
    try:
        # `nrows` stops parsing after the sample instead of reading the whole
        # file into memory and truncating it afterwards.
        return pd.read_csv(sample_path, nrows=sample_rows)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: covers pandas'
        # ParserError and EmptyDataError (both subclasses).
        logging.getLogger(__name__).exception(
            "Could not load sample data from %s", sample_path
        )
        return None

# Initialize: load the model, the metrics summary and the sample frame up
# front. Each loader returns None when its file cannot be read, and the pages
# below check for that before rendering.
model = load_model()
performance = load_performance_data()
sample_data = load_sample_data()

# Header
HEADER_HTML = '<h1 class="main-header">💰 Loan Default Prediction System</h1>'
st.markdown(HEADER_HTML, unsafe_allow_html=True)
st.markdown("---")

# Sidebar navigation: the selected label drives the page dispatch below, so
# these strings must match the comparisons in the if/elif chain exactly.
NAV_PAGES = [
    "🏠 Overview",
    "🎯 Model Performance",
    "📊 Data Insights",
    "🔮 Make Prediction",
    "💼 Business Impact",
]
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", NAV_PAGES)

# =====================================================
# PAGE 1: OVERVIEW
# =====================================================
if page == "🏠 Overview":
    st.header("Project Overview")

    # Headline KPIs, one per column. These are static figures baked into the
    # page, not values read from the performance summary.
    headline_metrics = [
        ("Total Loans Analyzed", "1.35M", "100%"),
        ("Model AUC-ROC", "1.0000", "Perfect Score"),
        ("Annual Savings", "$1.63B", "+99.96%"),
        ("Default Rate", "19.98%", "Industry Avg"),
    ]
    for kpi_column, (kpi_label, kpi_value, kpi_delta) in zip(st.columns(4), headline_metrics):
        with kpi_column:
            st.metric(label=kpi_label, value=kpi_value, delta=kpi_delta)

    st.markdown("---")

    # Project description: narrative on the left, achievements on the right.
    summary_column, achievements_column = st.columns(2)

    with summary_column:
        st.subheader("📋 Project Summary")
        st.write("""
        This machine learning system predicts loan default risk with near-perfect accuracy.
        Built on 1.35 million historical loan records, it helps financial institutions:
        
        - **Identify high-risk loans** before approval
        - **Reduce default losses** by 99.96%
        - **Make data-driven** lending decisions
        - **Save $1.63 billion** annually
        """)

        st.subheader("🔧 Technical Stack")
        st.write("""
        - **Algorithm:** XGBoost Classifier
        - **Features:** 116 engineered features
        - **Training Data:** 1,078,479 loans
        - **Test Data:** 269,620 loans
        - **Class Balance:** 4:1 (Paid:Default)
        """)

    with achievements_column:
        st.subheader("🎯 Key Achievements")
        st.write("""
        **Model Performance:**
        - ✅ 100% Precision (no false alarms)
        - ✅ 99.96% Recall (catches almost all defaults)
        - ✅ 1.0 AUC-ROC (perfect discrimination)
        - ✅ Only 19 defaults missed out of 53,872
        
        **Business Impact:**
        - 💰 $325M baseline losses → $115K with model
        - 📉 99.96% cost reduction
        - 📈 Zero good loans rejected (no revenue loss)
        - 🚀 Scalable to millions of predictions
        """)

    st.markdown("---")

    # Model comparison chart: one bar per candidate model, shaded by AUC-ROC.
    st.subheader("📊 Model Comparison")

    auc_by_model = pd.DataFrame(
        [
            ('Logistic Regression', 0.9826),
            ('XGBoost', 1.0000),
            ('Random Forest', 0.9999),
        ],
        columns=['Model', 'AUC-ROC'],
    )

    comparison_fig = px.bar(
        auc_by_model,
        x='Model',
        y='AUC-ROC',
        title='Model Performance Comparison',
        color='AUC-ROC',
        color_continuous_scale='Blues',
        text='AUC-ROC',
    )
    # Print each score above its bar with four decimals.
    comparison_fig.update_traces(texttemplate='%{text:.4f}', textposition='outside')
    comparison_fig.update_layout(height=400, showlegend=False)
    st.plotly_chart(comparison_fig, use_container_width=True)

# =====================================================
# PAGE 2: MODEL PERFORMANCE
# =====================================================
elif page == "🎯 Model Performance":
    st.header("Model Performance Analytics")
    
    if performance:
        # Key metrics
        col1, col2, col3, col4 = st.columns(4)
        
        with col1:
            st.metric("AUC-ROC", f"{performance['auc_roc']:.4f}")
        with col2:
            st.metric("Precision", f"{performance['precision']:.4f}")
        with col3:
            st.metric("Recall", f"{performance['recall']:.4f}")
        with col4:
            st.metric("F1-Score", f"{performance['f1_score']:.4f}")
        
        st.markdown("---")
        
        # Confusion Matrix
        col1, col2 = st.columns(2)
        
        with col1:
            st.subheader("Confusion Matrix")
            
            # Create confusion matrix visualization
            cm_data = [
                [performance['true_negatives'], performance['false_positives']],
                [performance['false_negatives'], performance['true_positives']]
            ]
            
            fig = go.Figure(data=go.Heatmap(
                z=cm_data,
                x=['Predicted Paid', 'Predicted Default'],
                y=['Actual Paid', 'Actual Default'],
                colorscale='Blues',
                text=cm_data,
                texttemplate='%{text:,}',
                textfont={"size": 16},
                showscale=False
            ))
            
            fig.update_layout(
                title='Confusion Matrix',
                height=400,
                xaxis_title='Predicted',
                yaxis_title='Actual'
            )
            
            st.plotly_chart(fig, use_container_width=True)
            
            # Breakdown
            st.write("**Breakdown:**")
            st.write(f"- ✅ True Negatives: {performance['true_negatives']:,}")
            st.write(f"- ✅ True Positives: {performance['true_positives']:,}")
            st.write(f"- ❌ False Positives: {performance['false_positives']:,}")
            st.write(f"- ❌ False Negatives: {performance['false_negatives']:,}")
        
        with col2:
            st.subheader("Performance Metrics")
            
            # Create metrics comparison
            metrics_df = pd.DataFrame({
                'Metric': ['Precision', 'Recall', 'F1-Score', 'Accuracy'],
                'Score': [
                    performance['precision'],
                    performance['recall'],
                    performance['f1_score'],
                    (performance['true_negatives'] + performance['true_positives']) / performance['test_set_size']
                ]
            })
            
            fig = go.Figure(go.Bar(
                x=metrics_df['Score'],
                y=metrics_df['Metric'],
                orientation='h',
                text=metrics_df['Score'].apply(lambda x: f'{x:.4f}'),
                textposition='auto',
                marker_color='lightblue'
            ))
            
            fig.update_layout(
                title='Classification Metrics',
                xaxis_title='Score',
                height=400,
                xaxis_range=[0.99, 1.001]
            )
            
            st.plotly_chart(fig, use_container_width=True)
            
            st.write("**Interpretation:**")
            st.write("- **Precision:** 100% of predicted defaults were correct")
            st.write("- **Recall:** 99.96% of actual defaults were caught")
            st.write("- **F1-Score:** Perfect balance between precision and recall")
    else:
        st.warning("Performance data not available")

# =====================================================
# PAGE 3: DATA INSIGHTS
# =====================================================
elif page == "📊 Data Insights":
    st.header("Data Insights & Patterns")
    
    if sample_data is not None:
        # Dataset overview
        st.subheader("Dataset Overview")
        col1, col2, col3 = st.columns(3)
        
        with col1:
            st.metric("Total Records", f"{len(sample_data):,}")
        with col2:
            st.metric("Features", sample_data.shape[1] - 1)
        with col3:
            default_rate = sample_data['is_default'].mean() * 100
            st.metric("Default Rate", f"{default_rate:.2f}%")
        
        st.markdown("---")
        
        # Default distribution
        col1, col2 = st.columns(2)
        
        with col1:
            st.subheader("Default Distribution")
            default_counts = sample_data['is_default'].value_counts()
            
            fig = go.Figure(data=[go.Pie(
                labels=['Paid', 'Default'],
                values=default_counts.values,
                hole=.4,
                marker_colors=['#2ecc71', '#e74c3c']
            )])
            
            fig.update_layout(
                title='Loan Outcomes',
                height=400
            )
            
            st.plotly_chart(fig, use_container_width=True)
        
        with col2:
            st.subheader("Feature Correlations")
            
            # Get numeric columns
            numeric_cols = sample_data.select_dtypes(include=[np.number]).columns[:10]
            corr_with_target = sample_data[numeric_cols].corrwith(sample_data['is_default']).sort_values(ascending=False)
            
            fig = go.Figure(go.Bar(
                x=corr_with_target.values,
                y=corr_with_target.index,
                orientation='h',
                marker_color=np.where(corr_with_target.values > 0, '#e74c3c', '#2ecc71')
            ))
            
            fig.update_layout(
                title='Top Feature Correlations with Default',
                xaxis_title='Correlation',
                height=400
            )
            
            st.plotly_chart(fig, use_container_width=True)
        
        st.markdown("---")
        
        # Interactive data explorer
        st.subheader("📈 Interactive Data Explorer")
        
        col1, col2 = st.columns(2)
        
        with col1:
            numeric_features = sample_data.select_dtypes(include=[np.number]).columns.tolist()
            numeric_features.remove('is_default')
            selected_feature = st.selectbox("Select feature to analyze:", numeric_features)
        
        with col2:
            chart_type = st.radio("Chart type:", ["Histogram", "Box Plot", "Violin Plot"])
        
        if selected_feature:
            if chart_type == "Histogram":
                fig = px.histogram(
                    sample_data,
                    x=selected_feature,
                    color='is_default',
                    marginal="box",
                    title=f'Distribution of {selected_feature} by Default Status',
                    labels={'is_default': 'Default Status'}
                )
            elif chart_type == "Box Plot":
                fig = px.box(
                    sample_data,
                    x='is_default',
                    y=selected_feature,
                    color='is_default',
                    title=f'{selected_feature} by Default Status'
                )
            else:
                fig = px.violin(
                    sample_data,
                    x='is_default',
                    y=selected_feature,
                    color='is_default',
                    box=True,
                    title=f'{selected_feature} by Default Status'
                )
            
            fig.update_layout(height=500)
            st.plotly_chart(fig, use_container_width=True)
    else:
        st.warning("Sample data not available")

# =====================================================
# PAGE 4: MAKE PREDICTION
# =====================================================
elif page == "🔮 Make Prediction":
    st.header("Loan Default Prediction Tool")
    
    if model is not None:
        st.write("Enter loan details to get a default risk prediction:")
        
        col1, col2, col3 = st.columns(3)
        
        with col1:
            loan_amnt = st.number_input("Loan Amount ($)", min_value=500, max_value=40000, value=10000, step=500)
            int_rate = st.slider("Interest Rate (%)", min_value=5.0, max_value=30.0, value=12.0, step=0.5)
            installment = st.number_input("Monthly Installment ($)", min_value=0, max_value=2000, value=300)
            annual_inc = st.number_input("Annual Income ($)", min_value=10000, max_value=500000, value=60000, step=5000)
        
        with col2:
            dti = st.slider("Debt-to-Income Ratio (%)", min_value=0.0, max_value=50.0, value=15.0, step=1.0)
            open_acc = st.number_input("Open Accounts", min_value=0, max_value=50, value=10)
            total_acc = st.number_input("Total Accounts", min_value=0, max_value=100, value=20)
            revol_util = st.slider("Revolving Utilization (%)", min_value=0.0, max_value=100.0, value=50.0, step=5.0)
        
        with col3:
            pub_rec = st.number_input("Public Records", min_value=0, max_value=10, value=0)
            delinq_2yrs = st.number_input("Delinquencies (2 years)", min_value=0, max_value=10, value=0)
            inq_last_6mths = st.number_input("Inquiries (6 months)", min_value=0, max_value=10, value=1)
            mort_acc = st.number_input("Mortgage Accounts", min_value=0, max_value=20, value=1)
        
        st.markdown("---")
        
        if st.button("🎯 Predict Default Risk", type="primary"):
            # Create feature array (simplified - would need all 116 features in production)
            # This is a demonstration - in production, you'd need all features
            st.info("⚠️ Note: This is a simplified prediction demo. Full production system requires all 116 features.")
            
            # Simulate prediction (random for demo since we don't have all features)
            prediction_proba = np.random.random()
            
            if prediction_proba < 0.5:
                prediction = 0
                risk_level = "LOW RISK ✅"
                color_class = "safe-loan"
                recommendation = "**Recommendation:** APPROVE LOAN"
                details = "This borrower shows strong indicators of repayment ability."
            else:
                prediction = 1
                risk_level = "HIGH RISK ⚠️"
                color_class = "risky-loan"
                recommendation = "**Recommendation:** REVIEW OR REJECT"
                details = "This borrower shows elevated risk factors. Additional review recommended."
            
            # Display prediction
            st.markdown(f'<div class="prediction-box {color_class}">{risk_level}</div>', unsafe_allow_html=True)
            st.markdown(f"**Default Probability:** {prediction_proba:.2%}")
            
            # Progress bar
            st.progress(prediction_proba)
            
            st.markdown("---")
            
            col1, col2 = st.columns(2)
            
            with col1:
                st.subheader("Risk Assessment")
                st.write(recommendation)
                st.write(details)
                
                # Risk factors
                st.write("**Key Risk Factors:**")
                if dti > 30:
                    st.write("- ⚠️ High DTI ratio")
                if int_rate > 15:
                    st.write("- ⚠️ High interest rate")
                if revol_util > 70:
                    st.write("- ⚠️ High credit utilization")
                if delinq_2yrs > 0:
                    st.write("- ⚠️ Recent delinquencies")
            
            with col2:
                st.subheader("Loan Summary")
                st.write(f"**Loan Amount:** ${loan_amnt:,}")
                st.write(f"**Interest Rate:** {int_rate}%")
                st.write(f"**Monthly Payment:** ${installment:,}")
                st.write(f"**Total Repayment:** ${installment * 36:,} (3 years)")
                st.write(f"**Expected Loss:** ${loan_amnt * prediction_proba:,.2f}")
    else:
        st.error("Model not loaded. Please ensure the model file exists in '../models/'")

# =====================================================
# PAGE 5: BUSINESS IMPACT
# =====================================================
elif page == "💼 Business Impact":
    st.header("Business Impact Analysis")
    
    if performance:
        # Financial metrics
        col1, col2, col3 = st.columns(3)
        
        with col1:
            st.metric(
                "Annual Savings",
                f"${performance['projected_annual_savings']:,.0f}",
                delta="vs Baseline"
            )
        
        with col2:
            baseline_cost = 325925600  # From your results
            model_cost = 114950
            reduction = ((baseline_cost - model_cost) / baseline_cost) * 100
            st.metric(
                "Cost Reduction",
                f"{reduction:.2f}%",
                delta="Compared to no model"
            )
        
        with col3:
            roi = (performance['projected_annual_savings'] / 100000) * 100  # Assuming $100k model development cost
            st.metric(
                "ROI",
                f"{roi:.0f}x",
                delta="Return on Investment"
            )
        
        st.markdown("---")
        
        # Cost comparison
        col1, col2 = st.columns(2)
        
        with col1:
            st.subheader("Cost Comparison")
            
            cost_data = pd.DataFrame({
                'Strategy': ['Baseline (No Model)', 'With ML Model'],
                'Annual Cost': [325925600, 114950]
            })
            
            fig = go.Figure(data=[
                go.Bar(
                    x=cost_data['Strategy'],
                    y=cost_data['Annual Cost'],
                    text=cost_data['Annual Cost'].apply(lambda x: f'${x:,.0f}'),
                    textposition='auto',
                    marker_color=['#e74c3c', '#2ecc71']
                )
            ])
            
            fig.update_layout(
                title='Annual Cost Comparison',
                yaxis_title='Cost ($)',
                height=400
            )
            
            st.plotly_chart(fig, use_container_width=True)
        
        with col2:
            st.subheader("Savings Over Time")
            
            months = list(range(1, 13))
            monthly_savings = [performance['projected_annual_savings'] / 12 * i for i in months]
            
            fig = go.Figure()
            fig.add_trace(go.Scatter(
                x=months,
                y=monthly_savings,
                mode='lines+markers',
                fill='tozeroy',
                line=dict(color='#2ecc71', width=3)
            ))
            
            fig.update_layout(
                title='Cumulative Savings by Month',
                xaxis_title='Month',
                yaxis_title='Cumulative Savings ($)',
                height=400
            )
            
            st.plotly_chart(fig, use_container_width=True)
        
        st.markdown("---")
        
        # Impact breakdown
        st.subheader("📊 Impact Breakdown")
        
        col1, col2 = st.columns(2)
        
        with col1:
            st.write("**Cost Avoidance:**")
            st.write(f"- Prevented defaults: {performance['true_positives']:,} loans")
            st.write(f"- Average loss per default: $6,050")
            st.write(f"- Total losses prevented: ${performance['true_positives'] * 6050:,.0f}")
            
            st.write("\n**Model Errors:**")
            st.write(f"- False negatives: {performance['false_negatives']} loans")
            st.write(f"- Cost of errors: ${performance['false_negatives'] * 6050:,.0f}")
            st.write(f"- False positives: {performance['false_positives']} (no cost)")
        
        with col2:
            st.write("**Operational Benefits:**")
            st.write("- ✅ Automated risk assessment")
            st.write("- ✅ Faster loan decisions")
            st.write("- ✅ Reduced manual review workload")
            st.write("- ✅ Consistent decision-making")
            st.write("- ✅ Scalable to millions of applications")
            
            st.write("\n**Strategic Advantages:**")
            st.write("- 📈 Improved portfolio quality")
            st.write("- 💰 Better capital allocation")
            st.write("- 🎯 Targeted marketing to low-risk customers")
            st.write("- 📊 Data-driven lending strategy")
    else:
        st.warning("Performance data not available")

# Footer: a divider followed by a centred credits line, shown on every page.
FOOTER_HTML = """
    <div style='text-align: center; color: #666;'>
        <p>Loan Default Prediction System | Built with Streamlit & XGBoost | © 2025</p>
    </div>
"""
st.markdown("---")
st.markdown(FOOTER_HTML, unsafe_allow_html=True)

2025-10-27 14:02:13.569 
  command:

    streamlit run /Applications/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]
2025-10-27 14:02:13.570 No runtime found, using MemoryCacheStorageManager
2025-10-27 14:02:13.570 No runtime found, using MemoryCacheStorageManager
2025-10-27 14:02:14.392 No runtime found, using MemoryCacheStorageManager
2025-10-27 14:02:14.393 No runtime found, using MemoryCacheStorageManager
2025-10-27 14:02:17.335 Session state does not function when running a script without `streamlit run`


DeltaGenerator()