# FIN42100 Machine Learning in Finance: Predicting Loan Default

## Group No : 4

**Group Members**:
<br>
Tejaswi Patil - 24209396 <br>
Shagun Chandok - 24289312 <br>
Nilay Singh - 24289944 <br>
Dhruv Singh - 24234646 <br>
Aditya Suhane - 24212188 <br>
  
**Submission Date**: April 18th  



## Question 1: Exploratory Data Analysis and Insights
Objective: Conduct comprehensive data exploration to uncover patterns and insights relevant to predicting loan defaults.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency


# Load SBAnational_clean.csv into the working DataFrame used by every later cell.
# NOTE(review): this is a machine-specific absolute path; point it at a local
# copy of the CSV before running the notebook elsewhere.
df = pd.read_csv("C:/Users/adity/OneDrive/Desktop/clg/UCD/Sem 2/ML For Finance/Project/SBAnational_clean.csv")
# Bare expression: shows the DataFrame as the cell output when run in a notebook.
df

In [None]:
# Columns examined in the exploratory analysis
selected_columns = ['NAICS', 'Term', 'NoEmp', 'NewExist', 'CreateJob',
                   'RetainedJob', 'UrbanRural', 'RevLineCr', 'LowDoc',
                   'DisbursementGross', 'BalanceGross', 'ChgOffPrinGr',
                   'GrAppv', 'SBA_Appv', 'DaysToDisbursement', 'Industry',
                   'Minority', 'MIS_Status']

# 1. Check data types
print("Data Types of Selected Columns:")
print("-" * 50)
print(df[selected_columns].dtypes)

# 2. Categorize variables
#    int64/float64 columns are split into discrete vs continuous by their
#    number of distinct values; every other dtype is treated as categorical.
numerical_continuous = []
numerical_discrete = []
categorical = []

for column in selected_columns:
    unique_count = df[column].nunique()
    dtype = df[column].dtype

    if dtype in ['int64', 'float64']:
        if unique_count <= 25:  # Threshold for discrete vs continuous
            numerical_discrete.append(column)
        else:
            numerical_continuous.append(column)
    else:
        categorical.append(column)

# Print categorization
print("\nVariable Categorization:")
print("-" * 50)
print("\nNumerical Continuous Variables:")
for var in numerical_continuous:
    print(f"{var}: {df[var].nunique()} unique values")

print("\nNumerical Discrete Variables:")
for var in numerical_discrete:
    print(f"{var}: {df[var].nunique()} unique values")
    # nunique() ignores NaN, so drop NaN here as well: the listed values then
    # match the count above, and sorted() never has to order NaN.
    print(f"Unique values: {sorted(df[var].dropna().unique())}")

print("\nCategorical Variables:")
for var in categorical:
    print(f"{var}: {df[var].nunique()} unique values")
    if df[var].nunique() < 10:  # Show values only if there are few categories
        # Without dropna(), a NaN mixed with strings makes sorted() raise
        # TypeError ('<' is not supported between float and str).
        print(f"Categories: {sorted(df[var].dropna().unique())}")

# 3. Check for missing values (missing data is reported here, not in the lists above)
print("\nMissing Values:")
print("-" * 50)
missing_values = df[selected_columns].isnull().sum()
print(missing_values[missing_values > 0])

# 4. Basic statistics for numerical variables
print("\nNumerical Variables Summary:")
print("-" * 50)
print(df[numerical_continuous + numerical_discrete].describe())


In [None]:
import pandas as pd
import numpy as np

# Parse the month/day/year approval dates into real datetimes
df['ApprovalDate'] = pd.to_datetime(df['ApprovalDate'], format='%m/%d/%Y')

# Keep only loans approved from 1962 through 2014 (both ends inclusive),
# as an independent copy with a fresh 0..n-1 index
approval_year = df['ApprovalDate'].dt.year
df = df[approval_year.between(1962, 2014)].copy().reset_index(drop=True)

# Sanity-check the result of the filter
print("Dataset Information After Filtering:")
print("-" * 50)
print(f"Total number of records: {len(df):,}")
print(f"Date range: {df['ApprovalDate'].dt.year.min()} to {df['ApprovalDate'].dt.year.max()}")
print("\nFirst few records:")
preview_columns = ['ApprovalDate', 'DisbursementGross', 'BalanceGross', 'MIS_Status']
print(df[preview_columns].head())

In [None]:
# Columns examined in the exploratory analysis (re-run on the date-filtered data)
selected_columns = ['NAICS', 'Term', 'NoEmp', 'NewExist', 'CreateJob',
                   'RetainedJob', 'UrbanRural', 'RevLineCr', 'LowDoc',
                   'DisbursementGross', 'BalanceGross', 'ChgOffPrinGr',
                   'GrAppv', 'SBA_Appv', 'DaysToDisbursement', 'Industry',
                   'Minority', 'MIS_Status']

# 1. Check data types
print("Data Types of Selected Columns:")
print("-" * 50)
print(df[selected_columns].dtypes)

# 2. Categorize variables
#    int64/float64 columns are split into discrete vs continuous by their
#    number of distinct values; every other dtype is treated as categorical.
numerical_continuous = []
numerical_discrete = []
categorical = []

for column in selected_columns:
    unique_count = df[column].nunique()
    dtype = df[column].dtype

    if dtype in ['int64', 'float64']:
        if unique_count < 20:  # Threshold for discrete vs continuous
            numerical_discrete.append(column)
        else:
            numerical_continuous.append(column)
    else:
        categorical.append(column)

# Print categorization
print("\nVariable Categorization:")
print("-" * 50)
print("\nNumerical Continuous Variables:")
for var in numerical_continuous:
    print(f"{var}: {df[var].nunique()} unique values")

print("\nNumerical Discrete Variables:")
for var in numerical_discrete:
    print(f"{var}: {df[var].nunique()} unique values")
    # nunique() ignores NaN, so drop NaN here as well: the listed values then
    # match the count above, and sorted() never has to order NaN.
    print(f"Unique values: {sorted(df[var].dropna().unique())}")

print("\nCategorical Variables:")
for var in categorical:
    print(f"{var}: {df[var].nunique()} unique values")
    if df[var].nunique() < 10:  # Show values only if there are few categories
        # Without dropna(), a NaN mixed with strings makes sorted() raise
        # TypeError ('<' is not supported between float and str).
        print(f"Categories: {sorted(df[var].dropna().unique())}")

# Essential Data Quality Checks
print(f"Missing Values in Key Features:")
# Share of missing rows per column, rounded to two decimals
print(df[['DisbursementGross','Term','UrbanRural']].isnull().mean().round(2))

print("\nCritical Variable Ranges:")
print(df[['DisbursementGross','Term','NoEmp']].agg(['min','median','max']))


# 3. Check for missing values (missing data is reported here, not in the lists above)
print("\nMissing Values:")
print("-" * 50)
missing_values = df[selected_columns].isnull().sum()
print(missing_values[missing_values > 0])

# 4. Basic statistics for numerical variables
print("\nNumerical Variables Summary:")
print("-" * 50)
print(df[numerical_continuous + numerical_discrete].describe())


In [None]:

# 1. Target Variable Summary
# The default rate is the complement of the MIS_Status mean, i.e. MIS_Status
# is treated as 1 = repaid and 0 = defaulted throughout this notebook.
default_rate = (1 - df['MIS_Status'].mean()) * 100
print(f"Default Rate: {default_rate:.1f}%")
print(f"Class Distribution:\n{df['MIS_Status'].value_counts(normalize=True)}")

# 2. Key Categorical Analysis (Simplified)
key_vars = ['NewExist', 'RevLineCr', 'UrbanRural']
fig, ax = plt.subplots(1,3, figsize=(18,5))

for i, var in enumerate(key_vars):
    # Plot the default share per group. The raw group mean of MIS_Status is
    # the *non-default* share, so it has to be flipped to match the
    # "Default Rate" title/axis label and the headline figure printed above.
    group_default_rate = (1 - df.groupby(var)['MIS_Status'].mean()) * 100
    group_default_rate.plot.bar(ax=ax[i])
    ax[i].set_title(f'Default Rate by {var}')
    ax[i].set_ylabel('Default Rate (%)')

plt.tight_layout()

# 3. Loan Amount Analysis (Critical Insight)
# Box plots of disbursed amount per NewExist group, split by loan outcome;
# outliers are hidden and the y axis is logarithmic.
plt.figure(figsize=(12,6))
sns.boxplot(x='NewExist', y='DisbursementGross', hue='MIS_Status',
            data=df, showfliers=False)
plt.yscale('log')
plt.title('Loan Amount Distribution by Business Type')
plt.ylabel('Loan Amount (Log Scale)')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency

sns.set_palette("husl")  # Optional

# NOTE: `df` must already be loaded by an earlier cell.

# Feature groups used throughout this cell
numerical_features = ['Term', 'NoEmp', 'CreateJob', 'RetainedJob',
                      'DisbursementGross', 'GrAppv', 'SBA_Appv',
                      'DaysToDisbursement']
categorical_features = ['NewExist', 'UrbanRural', 'RevLineCr', 'LowDoc',
                        'Industry', 'Minority']

# 1. Correlation heatmap of the numerical features together with the target
corr_columns = numerical_features + ['MIS_Status']
corr_matrix = df[corr_columns].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.savefig('correlation_heatmap.png')
plt.show()

# 2. Chi-squared test of each categorical feature against the target
chi2_results = []
for feature in categorical_features:
    contingency = pd.crosstab(df[feature], df['MIS_Status'])
    statistic, p_value, *_ = chi2_contingency(contingency)
    chi2_results.append({'Feature': feature, 'Chi2': statistic, 'P-value': p_value})

chi2_df = pd.DataFrame(chi2_results)
chi2_df = chi2_df.sort_values('Chi2', ascending=False)
print("\nChi-Squared Test for Categorical Features")
print("-" * 50)
print(chi2_df)

# 3. Random-forest feature importance
model_columns = numerical_features + categorical_features + ['MIS_Status']
df_model = df[model_columns].copy()
for col in categorical_features:
    # Each column gets its own encoder; values are stringified first
    encoder = LabelEncoder()
    df_model[col] = encoder.fit_transform(df_model[col].astype(str))
df_model = df_model.dropna()

y = df_model['MIS_Status']
X = df_model.drop(columns=['MIS_Status'])

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance for Predicting Default (Random Forest)')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


In [None]:
import pandas as pd
import numpy as np

# Ensure ApprovalDate is a datetime column (month/day/year input format)
df['ApprovalDate'] = pd.to_datetime(df['ApprovalDate'], format='%m/%d/%Y')

# Restrict to approvals dated 1962-2014 inclusive, working on a copy
year_of_approval = df['ApprovalDate'].dt.year
within_window = (year_of_approval >= 1962) & (year_of_approval <= 2014)
df = df.loc[within_window].copy()

# Renumber the rows 0..n-1 after the filter
df = df.reset_index(drop=True)

# Report what is left
print("Dataset Information After Filtering:")
print("-" * 50)
print(f"Total number of records: {len(df):,}")
print(f"Date range: {df['ApprovalDate'].dt.year.min()} to {df['ApprovalDate'].dt.year.max()}")
print("\nFirst few records:")
print(df[['ApprovalDate', 'DisbursementGross', 'BalanceGross', 'MIS_Status']].head())

## Question 2: Logistic Regression Modeling and Evaluation
Objective: Develop and assess a logistic regression model with varying probability thresholds, including performance metrics and validation techniques.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocess categorical variables
def preprocess_categorical_variables(df):
    """Filter to definite Yes/No rows and add the ``*_Cat`` encoded columns.

    Rows are kept only when ``RevLineCr`` is 16 or 12 and ``LowDoc`` is 7 or 4
    (the original author's comments label these codes Yes/No respectively).
    Four columns are then added:

    * ``NewExist_Cat``   - 1 or 2 copied from ``NewExist``; anything else
      (including NaN) becomes 0.
    * ``UrbanRural_Cat`` - 1 or 2 copied from ``UrbanRural``; anything else
      (including NaN) becomes 0.
    * ``RevLineCr_Cat``  - 16 -> 1, 12 -> 0.
    * ``LowDoc_Cat``     - 7 -> 1, 4 -> 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``NewExist``, ``UrbanRural``, ``RevLineCr`` and
        ``LowDoc``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows (original index labels kept)
        with the four new columns appended.
    """
    # Filter once and take an explicit copy. Assigning new columns onto a
    # boolean-filtered frame that was not copied is chained assignment, which
    # pandas reports with SettingWithCopyWarning.
    keep_rows = df['RevLineCr'].isin([16, 12]) & df['LowDoc'].isin([7, 4])
    df_processed = df.loc[keep_rows].copy()

    # 1. NewExist: 1 = Existing Business, 2 = New Business, all others -> 0
    df_processed['NewExist_Cat'] = df_processed['NewExist'].map({
        1: 1,  # Existing Business
        2: 2   # New Business
    }).fillna(0).astype(int)

    # 2. UrbanRural: 1 = Urban, 2 = Rural, all others -> 0
    df_processed['UrbanRural_Cat'] = df_processed['UrbanRural'].map({
        1: 1,  # Urban
        2: 2   # Rural
    }).fillna(0).astype(int)

    # 3. RevLineCr: binary flag (0 = No, 1 = Yes)
    df_processed['RevLineCr_Cat'] = df_processed['RevLineCr'].map({
        16: 1,  # Yes = 1
        12: 0   # No = 0
    })

    # 4. LowDoc: binary flag (0 = No, 1 = Yes)
    df_processed['LowDoc_Cat'] = df_processed['LowDoc'].map({
        7: 1,  # Yes = 1
        4: 0   # No = 0
    })

    return df_processed

# Columns that must be present (non-missing) for a row to be modeled
features_to_check = ['DisbursementGross', 'Term', 'NoEmp', 'CreateJob', 'RetainedJob',
                     'NewExist_Cat', 'UrbanRural_Cat', 'RevLineCr_Cat', 'LowDoc_Cat',
                     'MIS_Status']

# Encode the categorical inputs, then discard rows with gaps in any modeling column
df_cleaned = preprocess_categorical_variables(df)
df_cleaned = df_cleaned.dropna(subset=features_to_check)

# Report how many rows survived
print("Data shape before preprocessing:", df.shape)
print("Data shape after preprocessing and NaN removal:", df_cleaned.shape)

# Display labels: three-level columns vs yes/no flags.
# (The count table and the rate table use slightly different wording for level 1.)
three_level_columns = ['NewExist_Cat', 'UrbanRural_Cat']
three_level_count_labels = {0: '0 (Other)', 1: '1 (Existing/Urban/Yes)', 2: '2 (New/Rural)'}
three_level_rate_labels = {0: '0 (Other)', 1: '1 (Existing/Urban)', 2: '2 (New/Rural)'}
yes_no_labels = {0: '0 (No)', 1: '1 (Yes)'}

print("\nDistribution of categorical variables after preprocessing:")
for col in ['NewExist_Cat', 'UrbanRural_Cat', 'RevLineCr_Cat', 'LowDoc_Cat']:
    is_three_level = col in three_level_columns

    # Row counts per level
    print(f"\n{col}:")
    counts = df_cleaned[col].value_counts()
    print(counts.rename(three_level_count_labels if is_three_level else yes_no_labels))

    # Row count and default percentage per level (default = 1 - mean of MIS_Status)
    print(f"Default Rate by {col}:")
    default_rates = df_cleaned.groupby(col)['MIS_Status'].agg(
        ['count', lambda status: (1 - status.mean()) * 100]
    )
    default_rates.columns = ['Count', 'Default Rate %']
    rate_labels = three_level_rate_labels if is_three_level else yes_no_labels
    default_rates.index = default_rates.index.map(rate_labels)
    print(default_rates.round(2))

# Prepare features for modeling
def prepare_features_for_modeling(df):
    """Build the design matrix and default target for logistic regression.

    ``NewExist_Cat`` and ``UrbanRural_Cat`` are expanded into integer dummy
    columns; the numeric columns and the two binary flags pass through as-is.

    Returns
    -------
    (X, y)
        ``X`` holds the numeric columns, then the binary flags, then every
        dummy column. ``y`` is ``1 - MIS_Status`` (1 = default,
        0 = non-default).
    """
    multi_level = ['NewExist_Cat', 'UrbanRural_Cat']
    encoded = pd.get_dummies(df, columns=multi_level, dtype=int)

    # Columns carried over unchanged: numeric inputs followed by binary flags
    passthrough = ['DisbursementGross', 'Term', 'NoEmp', 'CreateJob', 'RetainedJob',
                   'RevLineCr_Cat', 'LowDoc_Cat']

    # Any column whose name contains one of the expanded column names
    dummy_columns = []
    for name in encoded.columns:
        if any(source in name for source in multi_level):
            dummy_columns.append(name)

    X = encoded[passthrough + dummy_columns]
    y = 1 - encoded['MIS_Status']  # flip so that 1 marks a default

    return X, y

# Prepare features
X, y = prepare_features_for_modeling(df_cleaned)

print("\nFeatures used in the model:")
print(X.columns.tolist())

# Only the continuous inputs are standardized; dummy/binary columns stay 0/1.
features_to_scale = ['DisbursementGross', 'Term', 'NoEmp', 'CreateJob', 'RetainedJob']

# Split BEFORE scaling. Fitting the scaler on all rows would let the held-out
# test set influence the means/variances used for training (data leakage) and
# make the test metrics optimistic. The seed and stratification are unchanged,
# so the rows in each partition are the same as before.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=42,
                                                    stratify=y)

# Learn the scaling parameters from the training partition only
scaler = StandardScaler()
scaler.fit(X_train[features_to_scale])

# Apply the train-fitted scaler to both partitions
X_train = X_train.copy()
X_test = X_test.copy()
X_train[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

# Full scaled matrix, kept because the cross-validation step further down
# reads `X_scaled`; it uses the same train-fitted scaler.
X_scaled = X.copy()
X_scaled[features_to_scale] = scaler.transform(X[features_to_scale])

# Function to evaluate model performance
def evaluate_model(y_true, y_pred_proba, threshold):
    """Score thresholded probabilities and save a confusion-matrix heatmap.

    Parameters
    ----------
    y_true : array-like
        True labels, 1 = default and 0 = non-default.
    y_pred_proba : numpy.ndarray
        Predicted probability of default for each row.
    threshold : float
        Rows with probability >= ``threshold`` are predicted as defaults.

    Returns
    -------
    dict
        Keys ``'Threshold'``, ``'Accuracy'``, ``'TPR (Sensitivity)'``,
        ``'FPR'``, ``'Precision'`` and ``'F1 Score'``. A ratio whose
        denominator is zero is reported as 0.

    Side effects
    ------------
    Creates a new matplotlib figure and writes it to
    ``confusion_matrix_threshold_<threshold>.png``. The file name depends only
    on the threshold, so calling this for both the training and the test set
    leaves only the later call's image on disk.
    """
    y_pred = (y_pred_proba >= threshold).astype(int)

    # Pin both labels so the matrix is always 2x2. Without `labels`, data in
    # which only one class appears yields a 1x1 matrix and the four-way
    # unpacking below raises ValueError before the zero-guards can apply.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Calculate metrics
    total = tp + tn + fp + fn
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0  # Sensitivity/Recall
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    accuracy = (tp + tn) / total if total > 0 else 0
    f1 = 2 * (precision * tpr) / (precision + tpr) if (precision + tpr) > 0 else 0

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - Threshold: {threshold}')
    plt.ylabel('True Label (1=Default)')
    plt.xlabel('Predicted Label (1=Default)')
    plt.savefig(f'confusion_matrix_threshold_{threshold}.png')

    return {
        'Threshold': threshold,
        'Accuracy': accuracy,
        'TPR (Sensitivity)': tpr,
        'FPR': fpr,
        'Precision': precision,
        'F1 Score': f1
    }

# Train the logistic regression on the training partition
model = LogisticRegression(solver='lbfgs', random_state=42, max_iter=2000)
model.fit(X_train, y_train)

# Predicted probability of the positive class (default) for both partitions
y_train_pred_proba = model.predict_proba(X_train)[:, 1]
y_test_pred_proba = model.predict_proba(X_test)[:, 1]

# Sweep the decision thresholds
thresholds = [0.1, 0.2, 0.35, 0.5]
train_results = []
test_results = []

print("\nModel Performance:")
print("-" * 50)

for threshold in thresholds:
    # Training set first, then test set (each call also saves a confusion-matrix image)
    train_metrics = evaluate_model(y_train, y_train_pred_proba, threshold)
    test_metrics = evaluate_model(y_test, y_test_pred_proba, threshold)
    train_results.append(train_metrics)
    test_results.append(test_metrics)

    print(f"\nThreshold: {threshold}")
    sections = (
        ("\nTraining Set Metrics:", train_metrics),
        ("\nTest Set Metrics:", test_metrics),
    )
    for heading, metrics_by_name in sections:
        print(heading)
        for metric, value in metrics_by_name.items():
            print(f"{metric}: {value:.3f}")

# 10-fold stratified cross-validation, scored by ROC AUC
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_scaled, y, cv=cv, scoring='roc_auc')
print("\nCross-validation Results:")
print(f"ROC AUC scores: {cv_scores.round(3)}")
print(f"Mean ROC AUC: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# ROC curve on the test partition
fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2,
         label=f'ROC curve (AUC = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.savefig('roc_curve.png')

# Rank features by the absolute size of their fitted coefficient
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': np.abs(model.coef_[0])
}).sort_values('Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

# Short written explanation of ROC / AUC, printed line by line
print("\nROC and AUC Explanation:")
print("-" * 50)
explanation_lines = (
    "The ROC curve plots the True Positive Rate (TPR, or sensitivity) against the False Positive Rate (FPR) across all thresholds.",
    "TPR measures the proportion of actual defaults correctly identified (TP / (TP + FN)).",
    "FPR measures the proportion of non-defaults incorrectly classified as defaults (FP / (FP + TN)).",
    "The AUC quantifies the model’s ability to distinguish defaults from non-defaults:",
    "- AUC = 0.5: No discrimination (random guessing).",
    "- AUC > 0.7: Good discrimination.",
    "- AUC > 0.9: Excellent discrimination.",
)
for line in explanation_lines:
    print(line)
print(f"Our model’s test AUC (~{roc_auc:.3f}) indicates its overall performance.")


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Style
plt.style.use("ggplot")  # Try using a different style like 'ggplot' or 'bmh'
sns.set_context("notebook")

# 1. Performance comparison across thresholds
# NOTE(review): the metric values below are hard-coded, presumably transcribed
# from earlier model runs; they are not recomputed by this cell.
thresholds = [0.1, 0.2, 0.35, 0.5]

# All variables (Test)
all_vars = {
    "Accuracy": [0.557, 0.769, 0.838, 0.842],
    "TPR": [0.908, 0.819, 0.547, 0.244],
    "FPR": [0.520, 0.242, 0.098, 0.027],
    "Precision": [0.278, 0.427, 0.553, 0.669],
    "F1 Score": [0.426, 0.562, 0.550, 0.358]
}

# Selected variables (Test)
sel_vars = {
    "Accuracy": [0.445, 0.703, 0.785, 0.785],
    "TPR": [0.964, 0.878, 0.659, 0.394],
    "FPR": [0.740, 0.360, 0.170, 0.075],
    "Precision": [0.318, 0.466, 0.582, 0.653],
    "F1 Score": [0.478, 0.609, 0.618, 0.492]
}

metrics = ["Accuracy", "TPR", "FPR", "Precision", "F1 Score"]

# Plot: one panel per metric on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for i, metric in enumerate(metrics):
    axes[i].plot(thresholds, all_vars[metric], label='All Variables', marker='o', linewidth=2)
    axes[i].plot(thresholds, sel_vars[metric], label='Selected Variables', marker='s', linewidth=2)
    axes[i].set_title(f'{metric} vs Threshold', fontsize=14)
    axes[i].set_xlabel('Threshold')
    axes[i].set_ylabel(metric)
    axes[i].legend()
    axes[i].grid(True)

# Hide the 6th subplot (five metrics on a six-panel grid)
axes[-1].axis('off')

plt.tight_layout()
plt.suptitle("Model Performance Comparison: All vs Selected Variables", fontsize=16, y=1.02)

# Save the figure before showing it.
# The suptitle sits at y=1.02, i.e. above the top edge of the figure canvas,
# so the default savefig bounding box would cut it off in the PNG;
# bbox_inches='tight' grows the saved area to include it.
plt.savefig('performance_comparison.png', bbox_inches='tight')

# Display the plot
plt.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Group feature importance function
def group_feature_importance_ascending(feature_names, importances):
    """Sum absolute importances per feature group, smallest total first.

    A name containing ``'_Cat_'`` is treated as a dummy column and is grouped
    under the text before the first ``'_Cat_'`` plus ``'_Cat'``; any other
    name forms its own group.

    Returns a DataFrame with columns ``'Feature Group'`` and
    ``'Total Importance'``, sorted ascending by the total.
    """
    # Building the frame first also validates that both inputs have equal length
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': np.abs(importances)
    })

    totals = {}
    for feature, importance in zip(importance_df['Feature'], importance_df['Importance']):
        prefix, marker, _ = feature.partition('_Cat_')
        base = prefix + '_Cat' if marker else feature
        totals[base] = totals.get(base, 0) + importance

    grouped_df = pd.DataFrame(list(totals.items()),
                              columns=['Feature Group', 'Total Importance'])
    return grouped_df.sort_values('Total Importance', ascending=True)

# Hard-coded coefficient magnitudes for the two logistic-regression variants
all_features = ['Term', 'UrbanRural_Cat_0', 'LowDoc_Cat', 'NewExist_Cat_0', 'CreateJob',
                'RevLineCr_Cat', 'RetainedJob', 'NoEmp', 'NewExist_Cat_1',
                'UrbanRural_Cat_2', 'NewExist_Cat_2', 'UrbanRural_Cat_1', 'DisbursementGross']
all_importances = [1.95, 1.20, 1.07, 0.82, 0.74, 0.68, 0.61, 0.41, 0.31, 0.29, 0.25, 0.12, 0.06]

sel_features = ['Term', 'RevLineCr_1', 'NoEmp', 'GrAppv', 'SBA_Appv', 'UrbanRural_1', 'RetainedJob',
                'DisbursementGross', 'CreateJob', 'LowDoc_1', 'NewExist_1.0']
sel_importances = [2.14, 0.39, 0.38, 0.25, 0.20, 0.18, 0.08, 0.06, 0.06, 0.04, 0.01]

# Collapse dummy columns into their feature groups for each model
grouped_all = group_feature_importance_ascending(all_features, all_importances)
grouped_selected = group_feature_importance_ascending(sel_features, sel_importances)

# Same three columns for both models: Feature Group, Importance, Model
importance_rename = {'Total Importance': 'Importance'}
feat_all_grouped = grouped_all.rename(columns=importance_rename).assign(Model="All Variables")
feat_sel_grouped = grouped_selected.rename(columns=importance_rename).assign(Model="Selected Variables")

# Stack the two tables so seaborn can colour the bars by model
feature_importance_comparison = pd.concat([feat_all_grouped, feat_sel_grouped])

# Horizontal grouped bar chart of the totals
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importance_comparison, x='Importance', y='Feature Group',
            hue='Model', palette='viridis')
plt.title('Grouped Feature Importance Comparison (All vs. Selected Variables)', fontsize=16)
plt.xlabel('Total Absolute Coefficient Value', fontsize=12)
plt.ylabel('Feature Group', fontsize=12)
plt.tight_layout()
plt.savefig('grouped_feature_importance_comparison.png')
plt.show()

## Question 3: Alternative Machine Learning Models and Comparative Analysis
Objective: Implement and evaluate three alternative machine learning models, comparing their performance against the logistic regression model.

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocess categorical variables
def preprocess_categorical_variables(df):
    """Filter to definite Yes/No rows and add the ``*_Cat`` encoded columns.

    Rows are kept only when ``RevLineCr`` is 16 or 12 and ``LowDoc`` is 7 or 4
    (the original author's comments label these codes Yes/No respectively).
    Four columns are then added:

    * ``NewExist_Cat``   - 1 = Existing, 2 = New, anything else (or NaN) = 0.
    * ``UrbanRural_Cat`` - 1 = Urban, 2 = Rural, anything else (or NaN) = 0.
    * ``RevLineCr_Cat``  - 16 -> 1, 12 -> 0.
    * ``LowDoc_Cat``     - 7 -> 1, 4 -> 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``NewExist``, ``UrbanRural``, ``RevLineCr`` and
        ``LowDoc``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows (original index labels kept)
        with the four new columns appended.
    """
    # Filter once and copy explicitly: adding columns to a boolean-filtered
    # frame that was not copied is chained assignment, which pandas reports
    # with SettingWithCopyWarning.
    keep_rows = df['RevLineCr'].isin([16, 12]) & df['LowDoc'].isin([7, 4])
    df_processed = df.loc[keep_rows].copy()

    # Map NewExist to single feature: 1 = Existing, 2 = New, 0 = Other
    df_processed['NewExist_Cat'] = df_processed['NewExist'].map({
        1: 1,  # Existing Business
        2: 2   # New Business
    }).fillna(0).astype(int)

    # Map UrbanRural to single feature: 1 = Urban, 2 = Rural, 0 = Other
    df_processed['UrbanRural_Cat'] = df_processed['UrbanRural'].map({
        1: 1,  # Urban
        2: 2   # Rural
    }).fillna(0).astype(int)

    # Map RevLineCr: 1 = Yes, 0 = No
    df_processed['RevLineCr_Cat'] = df_processed['RevLineCr'].map({
        16: 1,  # Yes
        12: 0   # No
    })

    # Map LowDoc: 1 = Yes, 0 = No
    df_processed['LowDoc_Cat'] = df_processed['LowDoc'].map({
        7: 1,  # Yes
        4: 0   # No
    })

    return df_processed

# Columns that must be non-missing for a row to be modeled
features_to_check = ['DisbursementGross', 'Term', 'NoEmp', 'CreateJob', 'RetainedJob',
                     'NewExist_Cat', 'UrbanRural_Cat', 'RevLineCr_Cat', 'LowDoc_Cat',
                     'MIS_Status', 'SBA_Appv', 'GrAppv']

# Encode the categorical inputs, then drop incomplete rows
df_cleaned = preprocess_categorical_variables(df)
df_cleaned = df_cleaned.dropna(subset=features_to_check)

# Report how many rows survived
print("Data shape before preprocessing:", df.shape)
print("Data shape after preprocessing and NaN removal:", df_cleaned.shape)

# Choose the hold-out share from the number of usable rows:
# fewer than 400,000 rows -> 30% test set, otherwise -> 10% test set
dataset_size = len(df_cleaned)
if dataset_size < 400000:
    test_size = 0.3
else:
    test_size = 0.1
print("Using test size: {}% (dataset size: {})".format(test_size*100, dataset_size))

# Prepare features for modeling
def prepare_features_for_modeling(df):
    """Select the model inputs and build the default target.

    Returns ``(X, y)``: ``X`` holds the numeric columns, then the two
    integer-coded category columns, then the two binary flags; ``y`` is
    ``1 - MIS_Status`` (1 = default, 0 = non-default).
    """
    feature_groups = (
        # numeric inputs
        ['DisbursementGross', 'Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'SBA_Appv', 'GrAppv'],
        # integer-coded categories, used as-is (no dummy expansion here)
        ['NewExist_Cat', 'UrbanRural_Cat'],
        # 0/1 flags
        ['RevLineCr_Cat', 'LowDoc_Cat'],
    )
    feature_columns = [name for group in feature_groups for name in group]

    target = 1 - df['MIS_Status']  # flip so that 1 marks a default
    features = df[feature_columns]

    return features, target

# Prepare features
X, y = prepare_features_for_modeling(df_cleaned)

print("\nFeatures used in the model:")
print(X.columns.tolist())

# Continuous inputs that are put on a common scale
features_to_scale = ['DisbursementGross', 'Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'SBA_Appv', 'GrAppv']

# Split BEFORE scaling so the held-out rows never contribute to the scaling
# statistics (fitting the scaler on all rows leaks test-set information).
# The seed and stratification are unchanged, so the partitions contain the
# same rows as before.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=42,
                                                    stratify=y)

# Learn the scaling parameters from the training partition only
scaler = StandardScaler()
scaler.fit(X_train[features_to_scale])

# Apply the train-fitted scaler to both partitions
X_train = X_train.copy()
X_test = X_test.copy()
X_train[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

# Full scaled matrix, kept because the model-evaluation function defined
# below reads the module-level `X_scaled` for its cross-validation.
X_scaled = X.copy()
X_scaled[features_to_scale] = scaler.transform(X[features_to_scale])

# Function to evaluate model performance
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` and score it on the test set at four thresholds.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict_proba``. It is fitted in
        place on the training data.
    X_train, X_test, y_train, y_test
        Train/test partitions; labels are 1 = default, 0 = non-default.
    model_name : str
        Accepted for the caller's convenience; not used inside the function.

    Returns
    -------
    tuple
        ``(test_results, roc_auc, cv_scores, feature_importance, fpr, tpr)``:

        * ``test_results`` - one metrics dict per threshold in
          ``[0.1, 0.2, 0.35, 0.5]``; a ratio with a zero denominator is 0.
        * ``roc_auc`` - area under the test-set ROC curve.
        * ``cv_scores`` - ROC AUC from 10-fold stratified cross-validation.
        * ``feature_importance`` - ``model.feature_importances_`` when the
          fitted model has that attribute, otherwise ``None``.
        * ``fpr``, ``tpr`` - the ROC curve arrays from ``roc_curve``.

    Notes
    -----
    The cross-validation reads the module-level ``X_scaled`` and ``y`` rather
    than the arguments, so those globals must exist when this is called.
    """
    model.fit(X_train, y_train)
    y_test_pred_proba = model.predict_proba(X_test)[:, 1]

    thresholds = [0.1, 0.2, 0.35, 0.5]
    test_results = []

    for threshold in thresholds:
        y_pred = (y_test_pred_proba >= threshold).astype(int)
        # Pin both labels so the matrix is always 2x2; without `labels`, a
        # test set in which only one class appears gives a 1x1 matrix and the
        # four-way unpacking below raises ValueError.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        total = tp + tn + fp + fn
        # Scalar rates get their own names so they are not confused with the
        # ROC arrays (`fpr`, `tpr`) returned at the end.
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        accuracy = (tp + tn) / total if total > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        test_results.append({
            'Threshold': threshold,
            'Accuracy': accuracy,
            'TPR (Sensitivity)': recall,
            'FPR': false_positive_rate,
            'Precision': precision,
            'F1 Score': f1
        })

    # Cross-validation (on the module-level X_scaled / y, see Notes)
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_scaled, y, cv=cv, scoring='roc_auc')

    # ROC AUC
    fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba)
    roc_auc = auc(fpr, tpr)

    # Feature importance
    feature_importance = model.feature_importances_ if hasattr(model, 'feature_importances_') else None

    return test_results, roc_auc, cv_scores, feature_importance, fpr, tpr

# Define models and hyperparameters
# Each entry holds one estimator instance, the name of the single
# hyperparameter that is varied, and the values tried for it.
models = {
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=42),
        'param_name': 'max_depth',
        'param_values': [3, 4, 5, 6, 7, 8, 10, 12, 15, 20] # tree depths to try
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42, n_estimators=50),# fewer estimators to reduce running time
        'param_name': 'max_depth',
        'param_values': [3, 5, 10, 15, 20]
    },
    'XGBoost': {
        'model': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        'param_name': 'max_depth',
        'param_values': [3, 5, 7, 10]
    }
}

# Evaluate all models
# all_results maps model name -> per-parameter results plus the "best" run.
all_results = {}

for model_name, model_info in models.items():
    print("\nEvaluating {}...".format(model_name))
    param_results = []

    for param_value in model_info['param_values']:
        # The same estimator instance stored in `models` is reused for every
        # value: the hyperparameter is overwritten in place and the estimator
        # is refitted inside evaluate_model.
        model = model_info['model']
        setattr(model, model_info['param_name'], param_value)

        results, roc_auc, cv_scores, feature_importance, fpr, tpr = evaluate_model(
            model, X_train, X_test, y_train, y_test, model_name
        )

        param_results.append({
            'param_value': param_value,
            'test_results': results,
            'roc_auc': roc_auc,
            'cv_scores': cv_scores,
            'feature_importance': feature_importance,
            'fpr': fpr,
            'tpr': tpr
        })

    # Find best parameter based on F1 score at threshold 0.2
    # NOTE(review): the F1 used here comes from the test set, so the "best"
    # parameter is tuned on the same data its metrics are later reported on.
    best_f1 = -1
    best_param = None
    best_results = None
    best_auc = None
    best_cv_scores = None
    best_feature_importance = None
    best_fpr = None
    best_tpr = None

    for result in param_results:
        thresh_02 = next(r for r in result['test_results'] if r['Threshold'] == 0.2)
        f1_score = thresh_02['F1 Score']
        # Strict '>' keeps the first parameter value on ties.
        if f1_score > best_f1:
            best_f1 = f1_score
            best_param = result['param_value']
            best_results = result['test_results']
            best_auc = result['roc_auc']
            best_cv_scores = result['cv_scores']
            best_feature_importance = result['feature_importance']
            best_fpr = result['fpr']
            best_tpr = result['tpr']

    all_results[model_name] = {
        'param_results': param_results,
        'best_param': best_param,
        'best_f1': best_f1,
        'best_results': best_results,
        'best_auc': best_auc,
        'best_cv_scores': best_cv_scores,
        'best_feature_importance': best_feature_importance,
        'best_fpr': best_fpr,
        'best_tpr': best_tpr
    }

    # Print detailed results for each parameter
    print("\nDetailed {} Results:".format(model_name))
    print("-" * 50)
    for result in param_results:
        param = result['param_value']
        print("\n{}: {}".format(model_info['param_name'], param))
        print("Test AUC: {:.3f}".format(result['roc_auc']))
        print("Mean CV AUC: {:.3f} (+/- {:.3f})".format(result['cv_scores'].mean(), result['cv_scores'].std() * 2))
        print("Test Set Metrics:")
        for thresh_result in result['test_results']:
            print("  Threshold: {}".format(thresh_result['Threshold']))
            # The dict also contains the 'Threshold' key, so the threshold is
            # printed a second time here, formatted to three decimals.
            for metric, value in thresh_result.items():
                print("    {}: {:.3f}".format(metric, value))

    # Print best parameter
    print("\nBest {}: {} (F1 Score at Threshold 0.2: {:.3f})".format(model_info['param_name'], best_param, best_f1))

    # Print and plot feature importance for best parameter
    if best_feature_importance is not None:
        print("\nFeature Importance for Best {} ({}={}):".format(model_name, model_info['param_name'], best_param))
        print("-" * 50)
        feature_importance_df = pd.DataFrame({
            'Feature': X.columns,
            'Importance': best_feature_importance
        }).sort_values(by='Importance', ascending=False)
        print(feature_importance_df)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
        plt.title('Feature Importance for {} ({}={})'.format(model_name, model_info['param_name'], best_param))
        plt.show()

    # Plot metrics vs parameter values (threshold = 0.2)
    metrics = ['F1 Score', 'Accuracy', 'TPR (Sensitivity)', 'FPR', 'Precision']
    metric_values = {metric: [] for metric in metrics}

    for result in param_results:
        thresh_02 = next(r for r in result['test_results'] if r['Threshold'] == 0.2)
        for metric in metrics:
            metric_values[metric].append(thresh_02[metric])

    # One figure per metric, x axis = hyperparameter value
    for metric in metrics:
        plt.figure(figsize=(8, 6))
        plt.plot(model_info['param_values'], metric_values[metric], marker='o')
        plt.xlabel(model_info['param_name'])
        plt.ylabel(metric)
        plt.title('{}: {} vs {} (Threshold = 0.2)'.format(model_name, metric, model_info['param_name']))
        plt.grid(True)
        plt.show()

    # Plot metrics vs thresholds for best parameter
    # These must match the thresholds hard-coded inside evaluate_model,
    # because best_results holds one entry per threshold in that order.
    thresholds = [0.1, 0.2, 0.35, 0.5]
    for metric in metrics:
        plt.figure(figsize=(8, 6))
        metric_vals = [r[metric] for r in best_results]
        plt.plot(thresholds, metric_vals, marker='o')
        plt.xlabel('Threshold')
        plt.ylabel(metric)
        plt.title('{} ({}={}): {} vs Threshold'.format(model_name, model_info['param_name'], best_param, metric))
        plt.grid(True)
        plt.show()

# Overlay the test-set ROC curve of each model's best configuration
plt.figure(figsize=(10, 6))
for name, best in all_results.items():
    curve_label = '{} (AUC = {:.3f})'.format(name, best['best_auc'])
    plt.plot(best['best_fpr'], best['best_tpr'], lw=2, label=curve_label)

# Chance diagonal for reference
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for All Models')
plt.legend(loc="lower right")
plt.show()

# Text recap: one section per model, reporting its best configuration
print("\nFinal Summary for All Models:")
print("=" * 50)
for name, best in all_results.items():
    tuned_param = models[name]['param_name']
    fold_scores = best['best_cv_scores']

    print("\n{} (Best {}={}):".format(name, tuned_param, best['best_param']))
    print("-" * 50)
    print("Test AUC: {:.3f}".format(best['best_auc']))
    print("Mean CV AUC: {:.3f} (+/- {:.3f})".format(fold_scores.mean(), fold_scores.std() * 2))

    # One block of test metrics per decision threshold
    for per_threshold in best['best_results']:
        print("\nThreshold: {}".format(per_threshold['Threshold']))
        print("Test Set Metrics:")
        for metric_name, metric_value in per_threshold.items():
            print("  {}: {:.3f}".format(metric_name, metric_value))

## Question 4: Evaluation of Unsupervised Learning Techniques
Objective: Assess the theoretical and empirical potential of unsupervised learning methods to enhance predictive modeling for loan default prediction.

## PCA Part

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
# NOTE(review): the path below is a blank placeholder; point it at the loan
# CSV before running this cell.
df = pd.read_csv(" ")

# Readable target labels derived from MIS_Status (0 -> 'Defaulted', 1 -> 'Not Defaulted')
status_labels = {0: 'Defaulted', 1: 'Not Defaulted'}
df['Default'] = df['MIS_Status'].map(status_labels)

# Remove date columns and identifiers
columns_to_drop = ['ApprovalDate', 'ChgOffDate', 'DisbursementDate', 'LoanNr_ChkDgt', 'Name']
df = df.drop(columns=columns_to_drop)

# Feature set treated as available at application time
application_time_columns = [
    'Term', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob', 'FranchiseCode',
    'UrbanRural', 'RevLineCr', 'LowDoc', 'GrAppv', 'SBA_Appv',
    'City', 'State', 'Zip', 'Bank', 'BankState', 'NAICS', 'Industry', 'Minority'
]
X = df[application_time_columns].copy()
y = df['Default']

# Split the features by dtype: int64/float64 are numerical, object is categorical
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Numerical branch: median imputation followed by standardization
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: most-frequent imputation followed by dense one-hot encoding
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group through its own branch
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ])

# Fit the preprocessing on the full feature table and transform it
X_processed = preprocessor.fit_transform(X)

# Keep a fixed 10 principal components; the count is hard-coded, not derived
# from a variance target
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_processed)

# Scree plot: cumulative explained variance against the number of components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_numbers = range(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_numbers, cumulative_variance, 'o-')
# Horizontal reference line at 95% cumulative variance
plt.axhline(y=0.95, color='r', linestyle='-')
plt.title('Scree Plot - Cumulative Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.tight_layout()
plt.show()

# PC1 vs PC2 scatter, colored by default status
pca_df = pd.DataFrame({'PC1': X_pca[:, 0], 'PC2': X_pca[:, 1]})
# Drop y's original index so the labels line up with pca_df's 0..n-1 index
pca_df['Default'] = y.reset_index(drop=True)

color_mapping = {'Defaulted': 'red', 'Not Defaulted': 'blue'}

pc1_share = pca.explained_variance_ratio_[0]
pc2_share = pca.explained_variance_ratio_[1]

plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=pca_df,
    x='PC1', y='PC2',
    hue='Default',
    palette=color_mapping,
    alpha=0.6
)
plt.title("PCA of SBA Loan Data (PC1 vs PC2) by Default Status")
plt.xlabel(f"PC1 ({pc1_share:.2%} variance)")
plt.ylabel(f"PC2 ({pc2_share:.2%} variance)")
plt.grid(True)
plt.legend(title='Default Status')
plt.tight_layout()
plt.show()

def biplot(projected_data, loadings, feature_names, y_labels, scale=5,
           explained_variance=None):
    """
    Draw a PCA biplot: projected observations plus one loading arrow per feature.

    Args:
        projected_data (array): PCA-transformed data; columns 0 and 1 are used
            as the x and y coordinates of the points.
        loadings (array): Loadings matrix indexed as [feature, component];
            columns 0 and 1 give each arrow's direction.
        feature_names (list): One label per row of `loadings`.
        y_labels (iterable): 'Defaulted' / 'Not Defaulted' label for each
            observation, used to color the points. Any other value raises
            KeyError.
        scale (float): Multiplier applied to the loadings when drawing arrows.
        explained_variance (sequence, optional): Explained-variance ratios whose
            first two entries label the axes. When omitted, the ratios of the
            notebook-level `pca` object are used (the original behavior). Pass
            it explicitly whenever `pca` may have been rebound to another model,
            otherwise the axis labels describe the wrong decomposition.
    """
    if explained_variance is None:
        # Backward-compatible fallback to the notebook-level PCA object
        explained_variance = pca.explained_variance_ratio_

    plt.figure(figsize=(12, 8))
    # Use lighter colors for points
    color_dict = {'Defaulted': '#FF9999', 'Not Defaulted': '#87CEFA'}  # light red and light blue
    colors = [color_dict[val] for val in y_labels]
    plt.scatter(projected_data[:, 0], projected_data[:, 1],
                c=colors, alpha=0.5, edgecolor='k', s=40, label=None)
    # Plot feature loadings as arrows from the origin
    for i, feature in enumerate(feature_names):
        plt.arrow(0, 0, loadings[i, 0] * scale, loadings[i, 1] * scale,
                  color='black', alpha=0.8, head_width=0.2, length_includes_head=True)
        # Place the label slightly beyond the arrow tip
        plt.text(loadings[i, 0] * scale * 1.18, loadings[i, 1] * scale * 1.18,
                 feature, color='black', ha='center', va='center', fontsize=10, weight='bold')
    plt.xlabel(f"PC1 ({explained_variance[0]:.2%} variance)")
    plt.ylabel(f"PC2 ({explained_variance[1]:.2%} variance)")
    plt.title("PCA Biplot of SBA Loan Data")
    plt.grid(True, linestyle='--', alpha=0.6)
    # Custom legend for Defaulted/Not Defaulted (the scatter itself has no label)
    from matplotlib.lines import Line2D
    legend_elements = [
        Line2D([0], [0], marker='o', color='w', label='Defaulted', markerfacecolor='#FF9999', markeredgecolor='k', markersize=10),
        Line2D([0], [0], marker='o', color='w', label='Not Defaulted', markerfacecolor='#87CEFA', markeredgecolor='k', markersize=10)
    ]
    plt.legend(handles=legend_elements, title='Default Status')
    plt.tight_layout()
    plt.show()


# Column names produced by the preprocessor; keep only the text after the last
# '__' separator as the display name
feature_names = preprocessor.get_feature_names_out()
feature_names_clean = [full_name.split('__')[-1] for full_name in feature_names]

# Loadings of the first two components, arranged as (feature, component)
loadings = pca.components_[:2].T

biplot(
    projected_data=X_pca[:, :2],
    loadings=loadings,
    feature_names=feature_names_clean,
    y_labels=y.reset_index(drop=True),
    scale=15,
)


# Optional: Model evaluation with cross-validation using PCA features
y_binary = y.map({'Defaulted': 0, 'Not Defaulted': 1})
model = LogisticRegression(max_iter=1000)

# Chain preprocessing, PCA and the classifier so that each CV fold fits the
# imputers, scaler, encoder and PCA on its training split only. Scoring the
# pre-computed X_pca instead would let validation rows influence those fitted
# transforms (data leakage) and bias the reported AUC.
# cross_val_score clones the estimator, so the already-fitted `preprocessor`
# and `pca` objects are left untouched. The pipeline is refitted once per fold,
# so this is slower than scoring a pre-transformed matrix.
cv_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('pca', PCA(n_components=pca.n_components)),
    ('clf', model),
])
scores = cross_val_score(cv_pipeline, X, y_binary, cv=5, scoring='roc_auc')
print(f"Average ROC-AUC with PCA: {np.mean(scores):.4f}")


In [None]:
import numpy as np

# How many of the strongest features to keep
top_n = 14

# Strength of a feature = sum of its squared loadings on PC1 and PC2
loading_strength = (loadings ** 2).sum(axis=1)

# argsort is ascending, so the last top_n positions hold the strongest features
top_features_idx = np.argsort(loading_strength)[-top_n:]

# Loadings and names of the selected features, in ascending order of strength
selected_loadings = loadings[top_features_idx]
selected_features = [feature_names_clean[idx] for idx in top_features_idx]


In [None]:
import matplotlib.pyplot as plt
from matplotlib.patheffects import withStroke

def biplot(projected_data, loadings, feature_names, y_labels, scale=15, top_n=7,
           explained_variance=None):
    """
    Clean, uncluttered PCA biplot: shows only top_n features by loading strength.

    Args:
        projected_data (array): PCA-transformed data; columns 0 and 1 are used
            as the x and y coordinates of the points.
        loadings (array): Loadings matrix indexed as [feature, component];
            columns 0 and 1 give each arrow's direction.
        feature_names (list): One label per row of `loadings`.
        y_labels (iterable): 'Defaulted' / 'Not Defaulted' label for each
            observation, used to color the points. Any other value raises
            KeyError.
        scale (float): Multiplier applied to the loadings when drawing arrows.
        top_n (int): Number of features to draw, ranked by the sum of squared
            loadings. Values of 0 or less draw no arrows; values above the
            number of features draw all of them.
        explained_variance (sequence, optional): Explained-variance ratios whose
            first two entries label the axes. When omitted, the ratios of the
            notebook-level `pca` object are used (the original behavior). Pass
            it explicitly whenever `pca` may have been rebound to another model,
            otherwise the axis labels describe the wrong decomposition.
    """
    if explained_variance is None:
        # Backward-compatible fallback to the notebook-level PCA object
        explained_variance = pca.explained_variance_ratio_

    plt.figure(figsize=(12, 8))
    # Lighter, more transparent points
    color_dict = {'Defaulted': '#FF9999', 'Not Defaulted': '#87CEFA'}
    colors = [color_dict[val] for val in y_labels]
    plt.scatter(projected_data[:, 0], projected_data[:, 1],
                c=colors, alpha=0.3, edgecolor='none', s=30, label=None)

    # Select top_n features by loading strength (sum of squared loadings).
    # The count is clamped to [0, n_features]: a plain `[-top_n:]` slice would
    # return every feature for top_n == 0 because `-0` is `0`.
    loading_strength = np.sum(loadings**2, axis=1)
    ranked_idx = np.argsort(loading_strength)  # ascending strength
    n_show = min(max(int(top_n), 0), len(ranked_idx))
    top_features_idx = ranked_idx[len(ranked_idx) - n_show:]
    selected_features = [feature_names[i] for i in top_features_idx]
    selected_loadings = loadings[top_features_idx]

    # Draw arrows and labels
    for i, feature in enumerate(selected_features):
        tip_x, tip_y = selected_loadings[i, 0]*scale, selected_loadings[i, 1]*scale
        plt.arrow(0, 0, tip_x, tip_y, color='black', alpha=0.9, head_width=0.5, head_length=0.7, linewidth=2, length_includes_head=True)
        # Label sits just beyond the arrow tip, with a white outline for legibility
        plt.text(tip_x*1.10, tip_y*1.10, feature, color='black', fontsize=13, weight='bold',
                 ha='center', va='center',
                 path_effects=[withStroke(linewidth=3, foreground="white")])

    plt.xlabel(f"PC1 ({explained_variance[0]:.2%} variance)")
    plt.ylabel(f"PC2 ({explained_variance[1]:.2%} variance)")
    plt.title("PCA Biplot of SBA Loan Data (Top Features Only)")
    plt.grid(True, linestyle='--', alpha=0.6)
    # Custom legend (the scatter itself has no label)
    from matplotlib.lines import Line2D
    legend_elements = [
        Line2D([0], [0], marker='o', color='w', label='Defaulted', markerfacecolor='#FF9999', markeredgecolor='k', markersize=10),
        Line2D([0], [0], marker='o', color='w', label='Not Defaulted', markerfacecolor='#87CEFA', markeredgecolor='k', markersize=10)
    ]
    plt.legend(handles=legend_elements, title='Default Status')
    plt.tight_layout()
    plt.show()

# Usage: draw the biplot for the first two components, limited to the 7 strongest features
biplot(
    projected_data=X_pca[:, :2],
    loadings=loadings,
    feature_names=feature_names_clean,
    y_labels=y.reset_index(drop=True),
    scale=15,
    top_n=7,
)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Expects X_processed, the preprocessed feature matrix, to exist already

# Fit PCA without limiting the number of components
pca_full = PCA()
X_pca_full = pca_full.fit_transform(X_processed)

# Per-component variance share and its running total
explained_var = pca_full.explained_variance_ratio_
cumulative_var = explained_var.cumsum()
num_components = np.arange(1, explained_var.size + 1)

# Both curves on one figure
plt.figure(figsize=(8,5))
plt.plot(num_components, cumulative_var,
         label='Cumulative Variance Explained', color='blue', marker='o')
plt.plot(num_components, explained_var,
         label='Individual Explained Variance', color='orange', marker='x')
plt.title('Variance Explained by PCA')
plt.xlabel('Number of Principal Components')
plt.ylabel('Variance Explained')
plt.legend()
plt.tight_layout()
plt.show()



## Clustering

K-Means


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Load your data
# NOTE(review): the path below is an empty placeholder; set it to the loan CSV
# before running this cell.
df = pd.read_csv("")

# Columns used for clustering (identifiers and dates are left out)
clustering_columns = [
    'Term', 'NoEmp', 'NewExist', 'CreateJob', 'RetainedJob', 'FranchiseCode',
    'UrbanRural', 'RevLineCr', 'LowDoc', 'GrAppv', 'SBA_Appv',
    'DisbursementGross', 'BalanceGross', 'DaysToDisbursement', 'Minority'
]
# Keep only rows that have a value in every clustering column
X = df.loc[:, clustering_columns].dropna(axis=0, how='any')

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


In [None]:
# Elbow method: record the K-means inertia for k = 1..10.
# The models are fitted on X_scaled, the standardized matrix: K-means relies on
# Euclidean distances, so fitting the raw X would let the columns with the
# largest numeric ranges dominate the clustering.
inertia = []
k_range = range(1, 11)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Hand-picked number of clusters, marked on the plot below
optimal_k = 3

plt.figure(figsize=(8, 5))
plt.plot(k_range, inertia, 'o-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.grid(True)
# Highlight the optimal k
plt.axvline(x=optimal_k, color='red', linestyle='--', label=f'Optimal k = {optimal_k}')
plt.legend()
plt.show()


In [None]:
# Final K-means model with the chosen number of clusters
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
# Cluster on the standardized features so no column dominates the Euclidean
# distances; fitting on X_scaled also keeps the 'Cluster' column added below
# out of the input if this cell is run again.
cluster_labels = kmeans.fit_predict(X_scaled)
# Store the assignment next to the original (unscaled) feature values
X['Cluster'] = cluster_labels


In [None]:
from sklearn.decomposition import PCA
import seaborn as sns

# Project the clustered observations onto two principal components for plotting.
# PCA is fitted on X_scaled rather than X: by this point X also holds the
# 'Cluster' label column and its features are unscaled, so a projection of X
# would be driven by the largest-magnitude columns and by the labels themselves.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
pca_df['Cluster'] = cluster_labels

plt.figure(figsize=(10, 7))
sns.scatterplot(x='PC1', y='PC2', hue='Cluster', data=pca_df, palette='viridis', alpha=0.7)
plt.title('K-means Clusters Visualized with PCA')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid(True)
plt.show()


Hierarchical Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
import seaborn as sns
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

import numpy as np

# Sample the data: draw a reproducible random subset of rows.
# Taking the first rows instead would only reflect the file's row order and
# need not be representative of the whole data set.
sample_size = 500  # Adjust based on available memory/performance
n_rows = len(X_scaled)
rng = np.random.default_rng(42)
# Cap the draw at the number of available rows; sample without replacement
sample_idx = rng.choice(n_rows, size=min(sample_size, n_rows), replace=False)
X_sample = np.asarray(X_scaled)[sample_idx]

# Create dendrogram to help decide number of clusters
plt.figure(figsize=(12, 6))
dendrogram = sch.dendrogram(sch.linkage(X_sample, method='ward'))
plt.title("Dendrogram (Sampled Data)")
plt.xlabel("Samples")
plt.ylabel("Euclidean Distance")
plt.show()


# Apply Agglomerative Clustering (Ward linkage, 4 clusters) to the same sample
hc = AgglomerativeClustering(n_clusters=4, metric='euclidean', linkage='ward')
y_hc = hc.fit_predict(X_sample)


