# Loading packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import ConfusionMatrixDisplay

# Loading input files into DataFrames

In [None]:
# Read the two Kaggle CSV exports: applicant attributes first, monthly credit statuses second.
application_details, credit_record = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/credit-card-approval-prediction/application_record.csv',
        '/kaggle/input/credit-card-approval-prediction/credit_record.csv',
    )
)

# Exploring application details dataset

In [None]:
application_details

In [None]:
application_details.info()

In [None]:
application_details.isnull().sum()

In [None]:
application_details.nunique()

## Data cleanup

In [None]:
# Removing duplicates based on ID, then the constant FLAG_MOBIL variable.
# The result is reassigned rather than mutated with inplace=True.
application_details = (
    application_details
    .drop_duplicates(subset=['ID'])
    .drop(columns=['FLAG_MOBIL'])
)

# Filling missing occupation types with a new label.
# The filled column is assigned back explicitly: fillna(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which leaves the parent
# frame untouched once Copy-on-Write is active.
application_details['OCCUPATION_TYPE'] = application_details['OCCUPATION_TYPE'].fillna('Not identified')

## Checking feature histograms

In [None]:
application_details['CODE_GENDER'].hist()

In [None]:
application_details['FLAG_OWN_CAR'].hist()

In [None]:
application_details['FLAG_OWN_REALTY'].hist()

In [None]:
application_details['CNT_CHILDREN'].hist()

In [None]:
application_details['AMT_INCOME_TOTAL'].hist()

In [None]:
application_details['NAME_INCOME_TYPE'].hist(xrot=90)

In [None]:
application_details['NAME_EDUCATION_TYPE'].hist(xrot=90)

In [None]:
application_details['NAME_FAMILY_STATUS'].hist(xrot=90)

In [None]:
application_details['NAME_HOUSING_TYPE'].hist(xrot=90)

In [None]:
application_details['DAYS_BIRTH'].hist()

In [None]:
application_details['DAYS_EMPLOYED'].hist()

The `DAYS_EMPLOYED` values seem to be off; they do not match the dataset specification.

In [None]:
application_details['FLAG_WORK_PHONE'].hist()

In [None]:
application_details['FLAG_PHONE'].hist()

In [None]:
application_details['FLAG_EMAIL'].hist()

In [None]:
application_details['OCCUPATION_TYPE'].hist(xrot=90)

The new 'Not identified' label is the most frequent value. Something must be done about it.

In [None]:
application_details['CNT_FAM_MEMBERS'].hist()

In [None]:
application_details[['DAYS_EMPLOYED','OCCUPATION_TYPE']].hist(by='OCCUPATION_TYPE', figsize=(20, 20))

It seems that the two issues, the 'Not identified' occupation type and the wrong number of days employed, are connected. I could simply remove the data, but let me give it one more shot.

In [None]:
application_details.loc[application_details['OCCUPATION_TYPE'] == 'Not identified'][['DAYS_EMPLOYED','NAME_INCOME_TYPE']].hist(by='NAME_INCOME_TYPE', figsize=(10, 10))

It looks like most of the applicants with a 'Not identified' occupation type have 'Pensioner' as their income type. They are the ones with the wrong days employed values.

In [None]:
application_details.loc[application_details['NAME_INCOME_TYPE'] == 'Pensioner'][['DAYS_EMPLOYED','OCCUPATION_TYPE']].hist(by='OCCUPATION_TYPE', figsize=(20, 20))

Most pensioners have their occupation type not identified.

# Application details feature engineering

In [None]:
# Positive DAYS_EMPLOYED values are the wrong ones (pensioners, who did not work at all),
# so the column is capped at zero.
application_details['DAYS_EMPLOYED'] = application_details['DAYS_EMPLOYED'].clip(upper=0)

# Pensioners without an identified occupation get their own 'Pensioner' occupation type,
# which substantially shrinks the 'Not identified' group.
is_pensioner = application_details['NAME_INCOME_TYPE'] == 'Pensioner'
is_unlabelled = application_details['OCCUPATION_TYPE'] == 'Not identified'
application_details.loc[is_pensioner & is_unlabelled, 'OCCUPATION_TYPE'] = 'Pensioner'

# Cleaned application details dataset

In [None]:
application_details

# Exploring credit record dataset

In [None]:
credit_record

In [None]:
credit_record.info()

In [None]:
credit_record.isnull().sum()

## Checking feature histograms

In [None]:
credit_record['MONTHS_BALANCE'].hist()

In [None]:
credit_record['STATUS'].hist()

In [None]:
credit_record[['ID', 'MONTHS_BALANCE']].drop_duplicates().shape[0]

# Credit record feature engineering

In [None]:
# Mapping of the statuses. 60 or more days overdue is considered as a bad debt.
status_dict = {
    **dict.fromkeys(['C', 'X', '0', '1'], 'Good_Debt'),
    **dict.fromkeys(['2', '3', '4', '5'], 'Bad_Debt'),
}

credit_record = credit_record.replace({'STATUS': status_dict})

# Mapped statuses are counted per (ID, STATUS) pair, then pivoted so that every ID is one
# row with one count column per mapped status (0 where a status never occurs).
status_counts = credit_record.value_counts(subset=['ID', 'STATUS'])
credit_record = status_counts.unstack(fill_value=0)

In [None]:
credit_record

## Defining dependent variable to be predicted

In [None]:
# The customer gets the credit approval (1.0) only when the 'Good_Debt' months in the books
# outnumber the 'Bad_Debt' months; a tie or more bad months means rejection (0.0).
more_good_than_bad = credit_record['Good_Debt'] > credit_record['Bad_Debt']
credit_record['CREDIT_APPROVAL_STATUS'] = more_good_than_bad.astype(float)

## Defining extra feature

In [None]:
# Number of months in books (good plus bad months) can be a factor helping to determine
# the credit decision
credit_record['MONTHS_IN_BOOKS'] = credit_record[['Good_Debt', 'Bad_Debt']].sum(axis=1)

## Data cleanup

In [None]:
# The raw month counts are no longer needed once the target and MONTHS_IN_BOOKS exist.
credit_record = credit_record.drop(columns=['Bad_Debt', 'Good_Debt'])

# Cleaned credit record dataset

In [None]:
credit_record

In [None]:
credit_record['CREDIT_APPROVAL_STATUS'].value_counts()

The dataset is highly imbalanced. This needs to be taken into account during the model training.

# Joining the datasets

In [None]:
# Expose the applicants' row labels as an 'index' column so they survive the merge on ID
# (inner join by default), then restore them as the index of the merged frame.
applications_with_labels = application_details.reset_index()
merged_dataset = applications_with_labels.merge(credit_record, on='ID')
merged_dataset = merged_dataset.set_index('index')

In [None]:
merged_dataset

In [None]:
# Dropping the obsolete ID column (it was only needed as the merge key)
merged_dataset = merged_dataset.drop(columns=['ID'])

# One-hot encoding of the nominal data

In [None]:
# Nominal (categorical) columns to be one-hot encoded; the same names serve as dummy prefixes.
nominal_columns = [
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
]

# drop_first=True removes one dummy per feature to avoid a redundant column.
# NOTE(review): the dummies are built from application_details (all applicants), not from
# merged_dataset; they are matched to the merged rows later through the shared index.
one_hot = pd.get_dummies(
    application_details[nominal_columns],
    prefix=nominal_columns,
    drop_first=True,
)

In [None]:
one_hot

In [None]:
# Removing original nominal columns; their one-hot encoded counterparts replace them
encoded_source_columns = [
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
]
merged_dataset = merged_dataset.drop(columns=encoded_source_columns)

In [None]:
# Joining the DataFrames on their index: every merged row is kept and receives the dummy
# columns of the row carrying the same index label.
merged_dataset = merged_dataset.join(one_hot, how='left')

# Normalization of numerical data

In [None]:
# Numerical columns to standardize (zero mean, unit variance).
numeric_columns = [
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'CNT_FAM_MEMBERS',
    'MONTHS_IN_BOOKS',
]

# NOTE(review): the scaler is fitted on every row before the train/test split performed
# later in the notebook, so test-set statistics influence the scaling. Consider fitting it
# on the training rows only.
scaled_values = StandardScaler().fit_transform(merged_dataset[numeric_columns])
merged_dataset[numeric_columns] = pd.DataFrame(
    scaled_values,
    index=merged_dataset.index,
    columns=numeric_columns,
)

# Final dataset

In [None]:
merged_dataset

# Dividing the dataset into independent and dependent variables

In [None]:
# y is the approval flag to predict; X is every remaining column.
target_column = 'CREDIT_APPROVAL_STATUS'
y = merged_dataset[target_column]
X = merged_dataset.drop(columns=[target_column])

In [None]:
X

In [None]:
y

# Splitting train and test samples

In [None]:
# Hold out 30% of the rows for testing. The target is highly imbalanced, so the split is
# stratified on y to keep the same approved/rejected ratio in both samples; an unstratified
# split can leave the test sample with too few rejected customers to evaluate on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [None]:
X_train

In [None]:
y_train

In [None]:
X_test

In [None]:
y_test

# Base Linear SVC model training

In [None]:
# Baseline linear SVM without any class re-weighting.
# random_state is pinned (as for the split and the grid search) so that re-running the
# notebook reproduces the same fitted model and score.
estimator = LinearSVC(random_state=1)
estimator.fit(X_train, y_train.values.ravel())

# Mean accuracy on the held-out test sample
score = estimator.score(X_test, y_test)
print(score)

In [None]:
ConfusionMatrixDisplay.from_estimator(estimator, X_test, y_test)

Although the accuracy is high, the model performs poorly.

# Balanced SVC model training

In [None]:
# SVC with class weights inversely proportional to class frequencies, to counter the
# imbalance of the target; fit() returns the fitted estimator itself.
estimator = SVC(class_weight='balanced').fit(X_train, y_train.values.ravel())

# Mean accuracy on the held-out test sample
score = estimator.score(X_test, y_test)
print(score)

In [None]:
ConfusionMatrixDisplay.from_estimator(estimator, X_test, y_test)

The model managed to identify 5 out of 19 customers that should not be approved, at the cost of 123 customers that got rejected but should have been approved.

# Hyperparameter tuning

In [None]:
def _svc_grid_search(X, Y, c_values, gamma_values, num_features):
    """Run one cross-validated grid search over C and gamma and print its best result.

    Returns the fitted GridSearchCV object.
    """
    param_grid = {
        # Repeated candidates are dropped (order preserved) so that no parameter
        # combination is cross-validated more than once.
        'C': list(dict.fromkeys(float(value) for value in c_values)),
        'gamma': list(dict.fromkeys(float(value) for value in gamma_values)),
        'kernel': ['rbf'],
        'class_weight': ['balanced'],
        'max_iter': [4000],
        'random_state': [1],
    }

    grid = GridSearchCV(SVC(), param_grid, scoring='f1_weighted', verbose=1, n_jobs=-1)
    grid.fit(X, Y)

    print('Wynik', grid.best_estimator_, num_features, 'features:')
    print(grid.best_score_)

    return grid


def svc_hyperparameter_tuning(X, Y):
    """Tune a class-balanced RBF SVC with three successive grid searches.

    Stage 1 searches a coarse grid: C in powers of four and gamma as multiples of
    1 / n_features. Stage 2 searches a 4 x 4 grid spanning half to double the stage-1
    optimum. Stage 3 searches up to 3 x 3 points between the stage-2 and stage-1 optima,
    which therefore includes both of them.

    Every candidate is scored with weighted F1 under GridSearchCV's default
    cross-validation.

    Args:
        X: feature matrix; ``X.shape[1]`` is taken as the number of features.
        Y: target labels aligned with the rows of ``X``.

    Returns:
        Tuple ``(best_estimator, best_params, best_score)`` taken from the final stage.
        ``best_estimator`` is refitted by GridSearchCV on all of ``X`` and ``Y``, so pass
        only training data if the estimator is to be evaluated on a held-out set.
    """
    num_features = X.shape[1]

    # Stage 1: coarse search.
    coarse_params = _svc_grid_search(
        X, Y,
        c_values=[1.0, 4.0, 16.0, 64.0, 256.0],
        gamma_values=[1 / (num_features / divisor) for divisor in (1, 2, 4, 8, 16)],
        num_features=num_features,
    ).best_params_

    # Stage 2: zoom in around the coarse optimum (from half to double its value).
    refined_params = _svc_grid_search(
        X, Y,
        c_values=np.linspace(coarse_params['C'] / 2, coarse_params['C'] * 2, num=4),
        gamma_values=np.linspace(coarse_params['gamma'] / 2, coarse_params['gamma'] * 2, num=4),
        num_features=num_features,
    ).best_params_

    # Stage 3: interpolate between the stage-2 and stage-1 optima. When both stages agree
    # on a value the interval collapses to a single point, which the helper de-duplicates.
    final = _svc_grid_search(
        X, Y,
        c_values=np.linspace(refined_params['C'], coarse_params['C'], num=3),
        gamma_values=np.linspace(refined_params['gamma'], coarse_params['gamma'], num=3),
        num_features=num_features,
    )

    return final.best_estimator_, final.best_params_, final.best_score_

In [None]:
# Tune on the training sample only. The grid search refits the winning model on all the
# data it receives, and that model is evaluated on X_test afterwards; passing the full
# X, y would let the model see the test rows and make that evaluation optimistic.
best_estimator, best_params, best_score = svc_hyperparameter_tuning(X_train, y_train)

In [None]:
print(best_score)

In [None]:
ConfusionMatrixDisplay.from_estimator(best_estimator, X_test, y_test)