In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, top_k_accuracy_score
import dill

In [2]:
# Columns holding dates; they are removed again when the feature matrix X is built.
date_cols = [
    "EOMDate",
    "EOMOpenDt_YrMo",
    "EOMCloseDt_YrMo",
    "EOMMaturityDt_YrMo",
    "EOMDefaultDt_YrMO",
]

# Read the cleaned dataset. low_memory=False makes pandas parse the file in one
# pass rather than in chunks, so each column gets a single inferred dtype.
pdf = pd.read_csv("../Data/cleaned_data.csv", low_memory=False)

In [3]:
# Preview the first five rows to sanity-check the load.
pdf.head(n=5)

Unnamed: 0,AcctID,SECSegmentDe_YrMo,SICIndustryMapDe,CAConcentrationTyDe,OpenAmt_YrMo,Unpaid_Balance_Amt,Off_Balance_Amt,EOMOpenDt_YrMo,Term_YrMo,MOB,...,IR_5Yr_Diff_LQ3,IR_5Yr_Diff_LQ4,BBB_Rate_Diff_LQ1,BBB_Rate_Diff_LQ2,BBB_Rate_Diff_LQ3,BBB_Rate_Diff_LQ4,Mort_Tr10Yr_Spread_Diff_LQ1,Mort_Tr10Yr_Spread_Diff_LQ2,Mort_Tr10Yr_Spread_Diff_LQ3,Mort_Tr10Yr_Spread_Diff_LQ4
0,100000220-000,Commercial and Industrial,SERVICE-MEDICAL,,10000.0,0.0,10000.0,2011-10-31,12.0,1.0,...,0.27,0.17,0.29,0.25,0.09,0.16,0.43,0.08,0.01,0.15
1,100000220-000,Commercial and Industrial,SERVICE-MEDICAL,,10000.0,0.0,10000.0,2011-10-31,12.0,2.0,...,0.15,0.58,0.15,0.03,0.14,0.47,0.16,0.04,0.06,0.12
2,100000220-000,Commercial and Industrial,SERVICE-MEDICAL,,10000.0,8000.0,2000.0,2011-10-31,12.0,3.0,...,0.06,0.06,0.3,0.01,0.0,0.03,0.21,0.03,0.04,0.06
3,100000220-000,Commercial and Industrial,SERVICE-MEDICAL,,10000.0,8000.0,2000.0,2011-10-31,12.0,4.0,...,0.33,0.27,0.19,0.29,0.25,0.09,0.06,0.43,0.08,0.01
4,100000220-000,Commercial and Industrial,SERVICE-MEDICAL,,10000.0,0.0,10000.0,2011-10-31,12.0,5.0,...,0.26,0.15,0.09,0.15,0.03,0.14,0.0,0.16,0.04,0.06


In [4]:
def build_p_model(P_class, X, y):
    """Fit, report on and persist the model for one starting risk group.

    Rows of ``X`` whose ``From_Risk_Group`` equals ``P_class`` are selected and
    the ``From_Risk_Group`` column is dropped. A pipeline is then fitted on
    that subset:

    * ``PmtModeTyDe`` and ``SICIndustryMapDe`` are one-hot encoded; categories
      seen in fewer than 2% of rows, or beyond 8 categories per feature, are
      grouped into one "infrequent" category.
    * Every numeric column is passed through ``PowerTransformer``.
    * Columns in neither group are not forwarded to the model
      (``ColumnTransformer``'s default ``remainder``).
    * A class-balanced, L2-regularised, one-vs-rest ``LogisticRegression``
      is fitted on the transformed features.

    Parameters
    ----------
    P_class : str
        Value matched against ``X.From_Risk_Group``; also used in the output
        file name.
    X : pandas.DataFrame
        Feature frame. Must contain ``From_Risk_Group``, ``PmtModeTyDe`` and
        ``SICIndustryMapDe``.
    y : pandas.Series
        Target labels, indexed like ``X``.

    Returns
    -------
    None
        The classification report and top-1 / top-2 accuracy are printed, and
        the fitted pipeline is written with ``dill`` to
        ``../Models/P{P_class}_model.pkl``.

    Raises
    ------
    ValueError
        If no row of ``X`` has ``From_Risk_Group == P_class``.

    Notes
    -----
    The printed metrics are computed on the same rows the pipeline was fitted
    on, so they are in-sample figures, not an out-of-sample estimate.
    """
    # Build the row mask once and reuse it for both X and y.
    in_group = X.From_Risk_Group == P_class
    if not in_group.any():
        raise ValueError(f"No rows with From_Risk_Group == {P_class!r}; nothing to fit.")
    X, y = X[in_group].drop(columns = ['From_Risk_Group']), y[in_group]

    def _combine_name(feature, val):
        # Output column name for one encoded category: the bare category value,
        # or "<feature>_infrequent" for the encoder's grouped infrequent bucket.
        # str() keeps this safe for non-string categories (e.g. NaN), which have
        # no .startswith and would otherwise be returned as a non-string name.
        label = str(val)
        return feature + "_infrequent" if label.startswith('infrequent') else label

    categorical_cols = ['PmtModeTyDe', 'SICIndustryMapDe']
    numeric_cols = X.select_dtypes(np.number).columns.tolist()

    col_transformer = ColumnTransformer(
        [
            ('Categorical_Encoder',
             OneHotEncoder(drop = 'if_binary', sparse_output = False,
                           handle_unknown = 'infrequent_if_exist',
                           min_frequency = .02, max_categories = 8,
                           feature_name_combiner = _combine_name),
             categorical_cols),
            ('Normalizer', PowerTransformer(), numeric_cols),
        ],
        verbose_feature_names_out = False, verbose = True)

    pipeline = Pipeline([
        ('col_transformer', col_transformer),
        ('LogisticRegression',
         LogisticRegression(random_state = 1234, class_weight = "balanced",
                            n_jobs = 6, multi_class = 'ovr', max_iter = 1000,
                            C = 1, penalty = 'l2')),
    ])
    pipeline.fit(X, y)

    # In-sample predictions: scored on the rows used for fitting.
    pred_prob = pipeline.predict_proba(X)
    pred = pipeline.predict(X)

    print(classification_report(y, pred))
    print(top_k_accuracy_score(y, pred_prob, k = 1))
    print(top_k_accuracy_score(y, pred_prob, k = 2))

    # dill is used for persistence because the pipeline holds a locally defined function.
    with open(f"../Models/P{P_class}_model.pkl", 'wb') as f:
        dill.dump(pipeline, f)

In [5]:
# Target is the To_Risk_Group column.
y = pdf['To_Risk_Group']

# Features are everything except identifiers, outcome/flag columns and dates.
# From_Risk_Group stays in X because build_p_model filters rows on it.
X = pdf.drop(
    columns = [
        'AcctID',
        'From_RiskCd',
        'To_RiskCd',
        'To_Risk_Group',
        'RevolveIn',
        'SECSegmentDe_YrMo',
        'CAConcentrationTyDe',
        'Default_Flag',
        'Closed_Flag',
        'Prepay_Flag',
    ] + date_cols
)

In [6]:
# Collapse payment mode to two explicit labels: 'Int Only' vs. everything else.
# An explicit bool -> label map is used because replace() keyed on the integers
# 0/1 is not guaranteed to match the True/False values of a boolean Series
# (behaviour depends on the pandas version), which can leave the column as
# booleans instead of the intended strings.
X['PmtModeTyDe'] = (X['PmtModeTyDe'] == 'Int Only').map({False: 'Int and Principal', True: 'Int Only'})

# Keep only the part of the industry label before the first hyphen.
# Raw string: "\-" in a normal literal is an invalid escape sequence and
# triggers a warning on recent Python versions; the regex itself is unchanged.
X['SICIndustryMapDe'] = X.SICIndustryMapDe.str.replace(r"\-.*", "", regex = True)

In [7]:
# Stratified split on the target with a fixed seed (default split proportions).
# The hold-out part is kept as X_test / y_test instead of being unpacked into
# `_`: that discarded the only out-of-sample data and also shadowed IPython's
# `_` (last cell output) variable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 1234)

In [8]:
# Fit, report and save one model per starting risk group, using the training split.
for p_class in ('A', 'B', 'C'):
    print(f"Class {p_class} model")
    build_p_model(P_class = p_class, X = X_train, y = y_train)

Class A model
[ColumnTransformer]  (1 of 2) Processing Categorical_Encoder, total=   0.4s
[ColumnTransformer] .... (2 of 2) Processing Normalizer, total=  33.3s
              precision    recall  f1-score   support

           A       1.00      0.82      0.90    272024
           B       0.01      0.16      0.02       716
           C       0.01      0.37      0.01       329
           D       0.01      1.00      0.03        12
           E       0.31      0.97      0.46      5276

    accuracy                           0.82    278357
   macro avg       0.27      0.67      0.28    278357
weighted avg       0.98      0.82      0.89    278357

0.8236976257108677
0.8813969111608474
Class B model
[ColumnTransformer]  (1 of 2) Processing Categorical_Encoder, total=   0.0s
[ColumnTransformer] .... (2 of 2) Processing Normalizer, total=   0.8s
              precision    recall  f1-score   support

           A       0.15      0.57      0.24       347
           B       0.95      0.56      0.7