# In [None]:
# Import all the libraries that will be needed throughout the code
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score,roc_auc_score, confusion_matrix,RocCurveDisplay, PrecisionRecallDisplay
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw dataset. `df` is kept untouched; all cleaning happens on the copy `dff`.
df = pd.read_csv("DPD.csv")
df.describe()  # summary statistics (count, mean, min/max, ...); only rendered when run interactively
dff = df.copy()
# Missing values per column (the original author noted 39 for Income and 29 for
# Loan_Balance on their copy of the data).
print(df.isnull().sum())

# Clean the dataset: harmonise category labels, impute missing values, drop what
# cannot be imputed.
dff['Employment_Status'].unique()  # inspection only: reveals the overlapping spellings merged below
# Merge the label variants BEFORE imputing: Employment_Status is a grouping key for
# the Income medians, so leaving "employed"/"EMP"/"Employed" as separate groups would
# fragment the groups and distort (or lose) their medians.
dff['Employment_Status'] = dff['Employment_Status'].replace({"employed": "Employed", "EMP": "Employed"})

# Loan_Balance: fill gaps with the overall median.
dff['Loan_Balance'] = dff['Loan_Balance'].fillna(dff['Loan_Balance'].median())

# Income: fill gaps with the median of the row's (employment, location, card type) group.
group_median_income = dff.groupby(['Employment_Status', 'Location', 'Credit_Card_Type'])['Income'].transform('median')
dff['Income'] = dff['Income'].fillna(group_median_income)

# Rows still lacking Income (their group had no usable median) or Credit_Score are
# dropped. The explicit copy keeps later column assignments from targeting a
# possible view of the pre-drop frame.
dff = dff.dropna(subset=['Credit_Score', 'Income']).copy()

# Outlier check: draw one box plot per numeric feature.
# NOTE(review): the plots read the raw frame `df`, not the cleaned copy `dff` — confirm this is intended.
z = ['Age', 'Income', 'Credit_Score', 'Debt_to_Income_Ratio', 'Credit_Utilization']  # numerical columns
plt.figure(figsize=(15, 10))
for slot, feature in enumerate(z, start=1):
    # 3x3 grid of panels; only the first len(z) slots are filled.
    panel = plt.subplot(3, 3, slot)
    sns.boxplot(data=df, y=feature, ax=panel)
    panel.set_title(feature)
plt.tight_layout()
plt.show()

# Encode the payment-history columns as ordinal codes: On-time=0, Late=1, Missed=2.
status_codes = {'On-time': 0, 'Late': 1, 'Missed': 2}
payment_cols = ['Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6']
for col in payment_cols:
    dff[col] = dff[col].map(status_codes)  # any value outside the mapping becomes NaN

# Only Month_1..Month_5 feed the delinquency features; Month_6 is encoded but left out.
payment_cols = ['Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5']
# Recent_Delinquency: number of those five months in which the payment was Late or
# Missed. Vectorised column-wise test instead of a Python-level lambda per row.
dff['Recent_Delinquency'] = dff[payment_cols].isin([1, 2]).sum(axis=1)
# Severe_Delinquency: 1 when there are two or more late/missed months, otherwise 0.
dff['Severe_Delinquency'] = (dff['Recent_Delinquency'] >= 2).astype(int)
# Inspection only: shows the encoded months next to the derived features when run interactively.
dff[['Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Recent_Delinquency', 'Severe_Delinquency']]

# Correlation between the numeric features and the Severe_Delinquency flag,
# rendered as an annotated heatmap centred on zero.
heatmap_columns = z + ['Severe_Delinquency']
corr_matrix = dff[heatmap_columns].corr()
plt.figure(figsize=(10, 8))
heat_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
heat_ax.set_title('Correlation Matrix with Delinquency')
plt.show()

# Pie chart of how often each Employment_Status value occurs in the raw frame `df`.
employment_counts = df['Employment_Status'].value_counts()
employ = plt.pie(employment_counts, labels=employment_counts.index, autopct='%1.1f')
# Delinquency rate (mean of the 0/1 Severe_Delinquency flag) by employment status
# and by age group, drawn side by side.
fig = plt.figure(figsize=(8, 4))

# Employment Status Risk
plt.subplot(1, 2, 1)
employment_risk = dff.groupby('Employment_Status')['Severe_Delinquency'].mean().sort_values()
employment_risk.plot(kind='bar')
plt.title('Delinquency Rate by Employment Status')

# Age Group Analysis
# Bin ages from the cleaned frame itself rather than relying on index alignment
# with the raw `df`. include_lowest=True closes the first bin on the left so that
# age 18 falls into it; with the default right-closed bins, (18, 25] excludes 18
# and such rows would get a missing Age_Group. Ages above 75 still fall outside
# every bin.
dff['Age_Group'] = pd.cut(dff['Age'], bins=[18, 25, 35, 50, 65, 75], include_lowest=True)
plt.subplot(1, 2, 2)
# observed=False is stated explicitly so every age bin is kept on the axis, even an empty one.
dff.groupby('Age_Group', observed=False)['Severe_Delinquency'].mean().plot(kind='bar')
plt.title('Delinquency Rate by Age Group')
plt.tight_layout()
plt.show()
# Mean credit utilisation by age group and by employment status.
# Each summary gets its own axes: plotting both without an explicit target put the
# age-group bars and the employment-status line on the same axes, mixing two
# unrelated sets of x categories.
util_fig, (age_util_ax, employment_util_ax) = plt.subplots(1, 2, figsize=(10, 4))

# observed=False is stated explicitly so every age bin is kept on the axis, even an empty one.
dff.groupby('Age_Group', observed=False)['Credit_Utilization'].mean().plot(kind='bar', ax=age_util_ax)
age_util_ax.set_title('Mean Credit Utilization by Age Group')

dff.groupby('Employment_Status')['Credit_Utilization'].mean().plot(kind='line', ax=employment_util_ax)
employment_util_ax.set_title('Mean Credit Utilization by Employment Status')

plt.tight_layout()
plt.show()

# One-hot encode the multi-category features so the upcoming model does not read
# an ordering into their category values.
categorical_features = ['Employment_Status', 'Credit_Card_Type', 'Location']
dff = pd.get_dummies(dff, columns=categorical_features)
print("\nColumns:", list(dff.columns))


