# Applying Data Analytic Skills
## Credit Card Predictive Analysis | Exploratory Data Analysis (EDA) | Vintage Analysis
## Credit Risk Classification Using RandomForestClassifier

# Author
# Chisom Micheal Eriobu | chisomeriobu428@gmail.com

# Problem Statement
## Credit score cards are a common risk control method in the financial industry. It uses personal information and data submitted by credit card applicants to predict the probability of future defaults and credit card borrowings. The bank is able to decide whether to issue a credit card to the applicant. Credit scores can objectively quantify the magnitude of risk.
### Application of Random Forest supervised classification machine learning model to help banks determine who should get a credit card or not

### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  
import seaborn as sns

In [None]:
# Load the applicant records from application_record.csv into a DataFrame
application_record=pd.read_csv("application_record.csv")

## Explanation of data set application_record



## | Feature Name          | Explanation                          | Remarks                                                                                         |
# |-----------------------|--------------------------------------|-------------------------------------------------------------------------------------------------|
# | ID                    | Client number                        |                                                                                                 |
# | CODE_GENDER           | Gender                               |                                                                                                 |
# | FLAG_OWN_CAR          | Is there a car                       |                                                                                                 |
# | FLAG_OWN_REALTY       | Is there a property                  |                                                                                                 |
# | CNT_CHILDREN          | Number of children                   |                                                                                                 |
# | AMT_INCOME_TOTAL      | Annual income                        |                                                                                                 |
# | NAME_INCOME_TYPE      | Income category                      |                                                                                                 |
# | NAME_EDUCATION_TYPE   | Education level                      |                                                                                                 |
# | NAME_FAMILY_STATUS    | Marital status                       |                                                                                                 |
# | NAME_HOUSING_TYPE     | Way of living                        |                                                                                                 |
# | DAYS_BIRTH            | Birthday                             | Count backwards from current day (0), -1 means yesterday                                         |
# | DAYS_EMPLOYED         | Start date of employment             | Count backwards from current day (0). If positive, it means the person is currently unemployed. |
# | FLAG_MOBIL            | Is there a mobile phone              |                                                                                                 |
# | FLAG_WORK_PHONE       | Is there a work phone                |                                                                                                 |
# | FLAG_PHONE            | Is there a phone                     |                                                                                                 |
# | FLAG_EMAIL            | Is there an email                    |                                                                                                 |
# | OCCUPATION_TYPE       | Occupation                           |                                                                                                 |
# | CNT_FAM_MEMBERS       | Family size                          |                                                                                                 |



In [None]:
# Preview application_record
application_record

In [None]:
# Descriptive statistics of application_record, kept as a DataFrame for plotting
print("                                              Descriptive Satistic of application record")
describe_application = pd.DataFrame(application_record.describe())


In [None]:
# Readable names for the columns that appear in the describe() summary
new_column_names = {
    "ID": "ClientID",
    "CNT_CHILDREN": "Number Of Children",
    "AMT_INCOME_TOTAL": "Annual Income",
    "DAYS_BIRTH": "Days Since Birth",
    "DAYS_EMPLOYED": "Days Since Employment",
    "FLAG_MOBIL": "Has Mobile Phone",
    "FLAG_WORK_PHONE": "Has Work Phone",
    "FLAG_PHONE": "Has Phone",
    "FLAG_EMAIL": "Has Email",
    "CNT_FAM_MEMBERS": "Family Size",
}

# Rename the summary columns, then drop the client identifier column
describe_application = (
    describe_application
    .rename(columns=new_column_names)
    .drop(columns="ClientID")
)

# Visualise the descriptive statistics as an annotated heat map
plt.figure(figsize=(12, 16))
sns.heatmap(describe_application, cmap="tab20b", annot=True)
plt.title("Heat Map of The Descriptive Analysis of Application Record", fontweight="bold", fontsize=18)
plt.xlabel("Data Set Attributes")
plt.ylabel("Satistic Description")
# No plt.legend() here: a heat map has no labelled series to list, and passing
# the DataFrame to legend() only hands it the column names as labels. The
# colour bar drawn by sns.heatmap is the key for the cell colours.
plt.tight_layout()

In [None]:
# presenting data types of dataset
# Show the data type of every column in application_record
print(application_record.dtypes)

In [None]:
#preseting the amount of rows and columns of dataset
# Show the (rows, columns) shape of application_record
print(application_record.shape)

# Observation
## The application record dataset contains 438557 rows and 18 columns
## Eight of the columns have the object data type
## The remaining 10 columns have numeric data types

In [None]:
# check for null 
# Count the missing values in each column
application_record.isna().sum()

In [None]:
# Percentage of rows whose OCCUPATION_TYPE value is missing
percent = (
    application_record["OCCUPATION_TYPE"].isnull().sum()
    / len(application_record)
    * 100
)
print(f"{percent}%")

### Null values make up approximately 30% of the OCCUPATION_TYPE column, so the column is dropped to keep the data set clean

In [None]:
# Remove the OCCUPATION_TYPE column from the data set
application_record = application_record.drop(columns="OCCUPATION_TYPE")

In [None]:
application_record.columns  # confirm that the OCCUPATION_TYPE column was dropped

In [None]:
# Count the rows that exactly duplicate an earlier row
application_record.duplicated().sum()

### No duplicate data

## Pie chart of total income by gender

In [None]:
# Total income per gender code, one row per gender
gender_income_ = application_record.groupby(
    "CODE_GENDER", as_index=False
)["AMT_INCOME_TOTAL"].sum()
gender_income_

In [None]:
# Look each gender's total up by its code rather than taking min()/max(),
# which would silently swap the two labels if the assumed ordering of the
# totals did not hold.
# NOTE(review): assumes CODE_GENDER holds the codes "M" and "F" - confirm against the data.
income_by_gender = gender_income_.set_index("CODE_GENDER")["AMT_INCOME_TOTAL"]
male_amount = income_by_gender["M"]
female_amount = income_by_gender["F"]
print(f"The total amount of income for male is ${male_amount:.1f}")
print(f"The total amount of income for female is ${female_amount:.1f}")


In [None]:
# Pie chart of the total income per gender.
# Wedge labels and legend amounts are taken from the rows of gender_income_
# so each label always matches the wedge it is attached to.
# NOTE(review): assumes CODE_GENDER holds the codes "F" and "M"; any other
# code is shown unchanged.
gender_labels = (
    gender_income_["CODE_GENDER"]
    .map({"F": "Female", "M": "Male"})
    .fillna(gender_income_["CODE_GENDER"])
    .tolist()
)
plt.figure(figsize=(12, 15))
plt.pie(gender_income_["AMT_INCOME_TOTAL"], labels=gender_labels, colors=["blue", "red"], shadow=True)
# The wedges show income totals, so the title names income rather than head count
plt.title("Income Distribution by Gender", fontsize=17, fontweight="bold")
plt.legend([f"${amount:,.1f}" for amount in gender_income_["AMT_INCOME_TOTAL"]])
plt.axis("equal")
plt.tight_layout()


In [None]:
# Number of applicants for each gender code
print(application_record['CODE_GENDER'].value_counts())

# Bar chart of total income by income type

In [None]:
# Total income per income category, sorted from lowest to highest
income_amount = (
    application_record.groupby("NAME_INCOME_TYPE")["AMT_INCOME_TOTAL"]
    .sum()
    .reset_index()
    .sort_values(by="AMT_INCOME_TOTAL", ascending=True)
)
print("Income distribution among diffrent income class from least to highest")
# rename() returns a new DataFrame, so the renamed copy is the expression that
# is displayed. income_amount itself keeps the AMT_INCOME_TOTAL column name
# because the bar-chart cell indexes it by that name.
income_amount.rename(columns={"AMT_INCOME_TOTAL": "TOTAL INCOME($)"})

In [None]:
from matplotlib.ticker import StrMethodFormatter

# Bar chart of the total income per income category
plt.figure(figsize=(12, 8))
colors = ["blue", "red", "green", "purple", "yellow"]
plt.bar(
    income_amount["NAME_INCOME_TYPE"],
    income_amount["AMT_INCOME_TOTAL"],
    label=income_amount["AMT_INCOME_TOTAL"],
    color=colors,
)
# Logarithmic y-axis with tick labels formatted as dollar amounts
plt.yscale("log")
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('${x:,.0f}'))
plt.grid()
# The x-axis carries the income categories and the y-axis the summed income
plt.xlabel("Income Type", fontsize=16)
plt.ylabel("Total Income", fontsize=16)
plt.title("Income Distribution Of Income Type", fontsize=23, fontweight="bold")
# Legend entries are the bar totals with a dollar sign in front
income_totals = income_amount["AMT_INCOME_TOTAL"].tolist()
plt.legend(["$" + str(total) for total in income_totals], loc='upper left')
plt.tight_layout()

In [None]:
# List the current column names of application_record
application_record.columns


## Renaming the columns to understandable names

In [None]:
# Lookup table: raw dataset column name -> descriptive column name
rename_dict = dict(
    ID="Client Number",
    CODE_GENDER="Gender",
    FLAG_OWN_CAR="Car Ownership",
    FLAG_OWN_REALTY="Property Ownership",
    CNT_CHILDREN="Number of Children",
    AMT_INCOME_TOTAL="Annual Income",
    NAME_INCOME_TYPE="Income Category",
    NAME_EDUCATION_TYPE="Education Level",
    NAME_FAMILY_STATUS="Marital Status",
    NAME_HOUSING_TYPE="Housing Type",
    DAYS_BIRTH="Birthday",
    DAYS_EMPLOYED="Employment Start Date",
    FLAG_MOBIL="Mobile Phone",
    FLAG_WORK_PHONE="Work Phone",
    FLAG_PHONE="Phone",
    FLAG_EMAIL="Email",
    OCCUPATION_TYPE="Occupation",
    CNT_FAM_MEMBERS="Family Size",
)
# Replace each column name found in the table; names not in the table stay as they are
application_record.columns = [
    rename_dict.get(name, name) for name in application_record.columns
]
application_record

In [None]:
# Load the credit history records from credit_record.csv into a DataFrame
credit_record=pd.read_csv("credit_record.csv")

In [None]:
# Preview credit_record
credit_record

## Observations
# The data set Credit Records has 1048575 rows and three columns


In [None]:
# Show the data type of every credit_record column
print("Data Types of Credit Record Columns")
print(credit_record.dtypes)

In [None]:
# Count the missing values in each credit_record column
credit_record.isnull().sum()

In [None]:
# Count the credit_record rows that exactly duplicate an earlier row
credit_record.duplicated().sum()

### No duplicates and no null values

In [None]:
# Prepare data for the machine learning algorithm: inspect the credit_record column names
credit_record.columns


In [None]:
# Descriptive names for the three credit_record columns
rename_dict = dict(
    ID="Client Number",
    MONTHS_BALANCE="Record Month",
    STATUS="Status",
)
credit_record.rename(columns=rename_dict, inplace=True)

In [None]:
# Preview credit_record with its renamed columns
credit_record

In [None]:
# Summary statistics of the numeric credit_record columns
print("Descriptive Statistics of credit_record.csv")
credit_record.describe()

In [None]:
# Inner-join the applicant records with the credit records on the shared
# "Client Number" column
merged_data = application_record.merge(
    credit_record,
    how="inner",
    on="Client Number",
)

In [None]:
# Count the merged rows that exactly duplicate an earlier row
merged_data.duplicated().sum()

In [None]:
# Count the rows whose Client Number already appeared in an earlier row
merged_data["Client Number"].duplicated().sum()

In [None]:
# The client number is unnecessary for the machine learning model
merged_data = merged_data.drop(columns=["Client Number"])


In [None]:
merged_data  # preview the merged data set

In [None]:
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() adds a stray "None" line to the output.
merged_data.info()

In [None]:
# Summary statistics of the numeric columns of the merged data set
print("                                                      Descriptive Statistic of The Data Set")
describe_merged=merged_data.describe()
describe_merged

In [None]:
# Heat map of the descriptive statistics of the merged data set
plt.figure(figsize=(12, 16))
sns.heatmap(describe_merged, cmap="tab20b", annot=True, robust=True)
# The plotted table is describe_merged, so the title names the merged data set
plt.title("Heat Map of The Descriptive Analysis of The Merged Data Set", fontweight="bold", fontsize=18)
plt.xlabel("Data Set Attributes")
plt.ylabel("Satistic Description")
# No plt.legend() here: a heat map has no labelled series to list, and passing
# the DataFrame to legend() only hands it the column names as labels. The
# colour bar drawn by sns.heatmap is the key for the cell colours.
plt.tight_layout()

In [None]:
# Description attached to each Status code
status_mapping = {
    "0": "1-29 days past due",
    "1": "30-59 days past due",
    "2": "60-89 days overdue",
    "3": "90-119 days overdue",
    "4": "120-149 days overdue",
    "5": "Overdue or bad debts, write-offs for more than 150 days",
    "C": "Paid off that month",
    "X": "No loan for the month",
}

# Add a column holding the description of every row's Status code
merged_data["Status Description"] = merged_data["Status"].map(status_mapping)

# Show the data with the new column
merged_data


# Apply Vintage Analysis 


In [None]:


# Step 1: Prepare Data
# Label every row "Month <n>", where n is the absolute value of its Record
# Month. The labels are built with vectorised string operations and stored as
# an ordered categorical so the months sort numerically, not alphabetically.
# NOTE(review): this buckets rows by their record month, not by the month an
# account was opened - confirm this is the intended cohort definition.
months_back = merged_data['Record Month'].abs()
merged_data['Issuance Month'] = pd.Categorical(
    'Month ' + months_back.astype(str),
    categories=[f'Month {i}' for i in range(months_back.max() + 1)],
    ordered=True,
)

# Step 2: Create Cohorts
# Count the rows per (issuance month, status) pair. observed=False is spelled
# out so every month category gets a row regardless of the pandas default.
cohort_data = (
    merged_data.groupby(['Issuance Month', 'Status'], observed=False)
    .size()
    .unstack(fill_value=0)
    .sort_index()
)

# Step 3: Aggregate Data
# Convert the counts to the percentage of each status within its month
cohort_data_percentage = cohort_data.div(cohort_data.sum(axis=1), axis=0) * 100

# Step 4: Analyze and Visualize
# Draw one line per status across the issuance months
plt.figure(figsize=(12, 8))

for status in cohort_data_percentage.columns:
    plt.plot(
        cohort_data_percentage.index,
        cohort_data_percentage[status],
        marker='s',
        linestyle='-',
        label=f'Status {status}',
    )

plt.title('Vintage Analysis - Status Distribution Over Time')
plt.xlabel('Issuance Month')
plt.ylabel('Percentage')
# Anchor the legend outside the axes, to the right of the plot
plt.legend(title='Status', loc="upper left", bbox_to_anchor=(1, 1))
plt.grid(True)

# Rotate x-ticks for better readability
plt.xticks(rotation=90, ha='right')

# Adjust spacing
plt.tight_layout()




In [None]:
# Show the data type of every column in the merged data set
merged_data.dtypes

In [None]:
 #process data by encoding 
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Columns whose values are replaced by integer codes
category_columns = [
    "Housing Type", "Marital Status", "Education Level", "Income Category",
    "Property Ownership", "Car Ownership", "Gender", "Status",
]
# Fit one LabelEncoder per column and keep it in label_encoder, keyed by column name
label_encoder = {
    column: LabelEncoder().fit(merged_data[column])
    for column in category_columns
}
for column, le in label_encoder.items():
    merged_data[column] = le.transform(merged_data[column])

# Standardise the numeric columns listed in _column
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split performed in a later cell, so test rows influence the
# scaling statistics - confirm this is acceptable.
_column = ["Annual Income", "Employment Start Date", "Record Month", "Birthday"]
SS = StandardScaler()
merged_data[_column] = SS.fit(merged_data[_column]).transform(merged_data[_column])

In [None]:
# Drop the two helper columns (status description and issuance month)
merged_data = merged_data.drop(columns=["Status Description", "Issuance Month"])
data = merged_data
data  # preview data
                      

In [None]:
from sklearn.model_selection import train_test_split

# Target is the Status column; every remaining column is a feature
y = data["Status"]
X = data.drop(columns="Status")

# Hold out 20% of the rows as the test set, with a fixed seed for the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=34, test_size=0.2
)
data

In [None]:
# Train the model
# RandomForestClassifier accepts class_weight and n_jobs.
# GradientBoostingClassifier accepts neither, so constructing it with these
# arguments raises TypeError.
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=32, class_weight="balanced", n_jobs=4)

model.fit(X_train, y_train)



In [None]:
# Predict the status class of every row in the test set
y_pred=model.predict(X_test)

In [None]:
# Use metrics to evaluate the model: fraction of test rows predicted correctly
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
accuracy_score(y_test, y_pred)

In [None]:
# Print the per-class classification report for the test set
print(classification_report(y_test, y_pred))