In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

def scale_score(raw_score, min_score=300, max_score=900):
    """Scales the raw score to fit within the specified range."""
    scaled = np.clip((raw_score - 0.5) * (max_score - min_score) + min_score, min_score, max_score)
    return int(scaled)

def preprocess_data(df):
    """Preprocesses the dataset by dropping irrelevant columns and handling missing values."""
    columns_to_drop = ['id', 'member_id', 'url', 'desc', 'title', 'zip_code', 
                       'issue_d', 'last_pymnt_d', 'next_pymnt_d', 
                       'last_credit_pull_d', 'pymnt_plan', 'emp_title']
    
    # Drop irrelevant columns and handle missing values in a single step
    df = df.drop(columns=columns_to_drop)
    
    imputer_num = SimpleImputer(strategy='mean')
    imputer_cat = SimpleImputer(strategy='most_frequent')
    
    # Impute missing values for numerical and categorical columns
    df[df.select_dtypes(include=['float64', 'int64']).columns] = imputer_num.fit_transform(df.select_dtypes(include=['float64', 'int64']))
    df[df.select_dtypes(include=['object']).columns] = imputer_cat.fit_transform(df.select_dtypes(include=['object']))
    
    return df

def generate_scorecard(data_path, target_column='loan_status', test_size=0.2, random_state=42):
    """Generates a scorecard using logistic regression on the provided dataset."""
    # Load and preprocess data
    df = pd.read_csv(data_path, low_memory=False)
    df = preprocess_data(df)

    # Split data into features and target variable
    X = pd.get_dummies(df.drop(columns=[target_column]))
    y = (df[target_column] == 'Charged Off').astype(int)

    # Scale features
    X_scaled = StandardScaler().fit_transform(X)

    # Split dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_size, random_state=random_state)

    # Define and fit logistic regression model
    model = LogisticRegression(max_iter=100, solver='liblinear')  # 'liblinear' is suitable for small datasets

    # Cross-validation for reliable AUC score
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=random_state)
    scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv)

    # Train model on full training set
    model.fit(X_train, y_train)

    # Predictions and evaluation
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_pred_proba)
    y_pred = model.predict(X_test)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Scale AUC score
    scaled_auc = scale_score(auc_score)

    # Create scorecard DataFrame
    scorecard = pd.DataFrame({
        'Metric': ['Mean AUC', 'Scaled Score', 'Confusion Matrix'],
        'Score': [scores.mean(), scaled_auc, conf_matrix.tolist()]
    })

    return scorecard

scorecard = generate_scorecard('loan_data_2015.csv')
print(scorecard)


             Metric                    Score
0          Mean AUC                 0.998708
1      Scaled Score                      596
2  Confusion Matrix  [[83660, 4], [33, 522]]


In [24]:
scorecard

Unnamed: 0,Metric,Score
0,Mean AUC,0.998708
1,Scaled Score,596
2,Confusion Matrix,"[[83660, 4], [33, 522]]"
