In [22]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split

df = pd.read_csv('/Users/chrisjackson/XXXX/1_Financial Data.csv')
df2 = pd.read_csv('/Users/chrisjackson/XXXX/2_Default Data.csv')
df['default'] = np.where(df['LOAN_ID'].isin(df2['LOAN_ID']), 1, 0)


In [23]:
num_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
cat_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


In [24]:
X = df.drop(['default', 'LOAN_ID'], axis=1)
y = df['default']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y)


In [25]:
num_features = df.select_dtypes(include=['int64', 'float64']).drop([
    'PD_RISK_RATING', 'default'], axis=1)
cat_features = df.select_dtypes(include=['object']).drop(['LOAN_ID'], axis=1)

numeric_cols = df.dtypes.apply(lambda x: x.kind in 'bifc').reset_index(
    drop=True).loc[lambda x: x == True].index
cat_cols = (df.dtypes == 'object').reset_index(
    drop=True).loc[lambda x: x == True].index

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transform,  selector(dtype_exclude="object")),
        ('cat', cat_transform, selector(dtype_include="object"))])


In [29]:
X_train_pipe = preprocessor.fit_transform(X_train)
X_test_pipe = preprocessor.fit_transform(X_test)


In [35]:
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)

# columns = X_train_pipe.columns
os_data_X, os_data_y = os.fit_resample(X_train_pipe, y_train)


In [43]:
type(os_data_X)


numpy.ndarray

In [37]:
os_data_y.value_counts()


0    6910
1    6910
Name: default, dtype: int64

In [39]:
print("length of oversampled data is ", len(os_data_X))
print("Number of no default in oversampled data ",
      len(os_data_y[os_data_y == 0]))
print("Number of default", len(os_data_y[os_data_y == 1]))
print("Proportion of no default data in oversampled data is ",
      len(os_data_y[os_data_y == 0])/len(os_data_X))
print("Proportion of default data in oversampled data is ",
      len(os_data_y[os_data_y == 1])/len(os_data_X))


length of oversampled data is  13820
Number of no default in oversampled data  6910
Number of default 6910
Proportion of no default data in oversampled data is  0.5
Proportion of default data in oversampled data is  0.5


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report


In [41]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                    ('classifier', LogisticRegression(solver='lbfgs', max_iter=1000))])


In [45]:
pipeline.fit(pd.DataFrame(os_data_X), os_data_y)
y_pred = pipeline.predict(X_test_pipe)


In [46]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.96      0.98      2961
           1       0.22      0.92      0.36        39

    accuracy                           0.96      3000
   macro avg       0.61      0.94      0.67      3000
weighted avg       0.99      0.96      0.97      3000

