In [1]:
#loading data
import pandas as pd

df= pd.read_csv("../Data/Raw/Loan_default.csv")
df.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,SWWFW99YN1,43,112742,107552,300,0,1,15.21,48,0.89,PhD,Unemployed,Divorced,No,No,Home,Yes,0
1,DSL4O0KAWD,64,73743,140354,300,0,2,4.12,12,0.24,PhD,Self-employed,Single,Yes,No,Education,Yes,0
2,S5IDRA0LCA,22,46351,151652,300,0,4,10.04,36,0.51,PhD,Part-time,Single,No,Yes,Other,Yes,0
3,4X63ZOH9Y4,56,42359,163692,300,0,3,4.98,60,0.71,PhD,Part-time,Divorced,No,Yes,Education,No,0
4,7QM3SGA1AU,23,52705,218580,300,0,3,22.31,60,0.54,High School,Unemployed,Divorced,No,Yes,Business,Yes,0


In [2]:
df.shape

(255347, 18)

In [3]:
df.isnull()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255342,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
255343,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
255344,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
255345,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [4]:
#Flagging variable
df['hard_invalid_flag']=0
df['soft_anomaly_flag']=0
df['validation_reason']=""

In [5]:
#Hard Flagging
#Rule 1:Age < 21 with PhD
condition= (df['Age']<21)&(df['Education']=="PhD")
df.loc[condition,"hard_invalid_flag"]=1
df.loc[condition,"validation_reason"]+= "Age < 21 with PhD;"

In [6]:
#Hard Flagging
#Rule 2: Unemployed but MonthsEmployed > 0
condition= (df['EmploymentType']=="Unemployed")&(df['MonthsEmployed']>0)
df.loc[condition,"hard_invalid_flag"]=1
df.loc[condition,"validation_reason"]+= "Unemployed with employment history;"

In [7]:
#Hard Flagging
#Rule 3:MonthsEmployed exceeds working-age limit
condition= df['MonthsEmployed']> (df['Age']-14)*12
df.loc[condition,"hard_invalid_flag"]=1
df.loc[condition,"validation_reason"]+= "Months employed exceeds age limit;"

In [8]:
#Hard Flagging
#Rule 4: Age < 21 and married/divorced
condition= (df['Age']<21)&(df['MaritalStatus'].isin(["Married", "Divorced"]))
df.loc[condition,"hard_invalid_flag"]=1
df.loc[condition,"validation_reason"]+= "Unrealistic Marital Status for age;"

In [9]:
#Hard Flagging
#Rule 5: Home loan with very short tenure
condition= (df['LoanPurpose']=="Home")&(df['LoanTerm']<120)
df.loc[condition,"hard_invalid_flag"]=1
df.loc[condition,"validation_reason"]+= "Home loan short tenure;"

In [10]:
#Hard Flagging
#Rule 6: Mortgage ownership at very young age
condition= (df['Age']<21)&(df['HasMortgage']=="Yes")
df.loc[condition,"hard_invalid_flag"]=1
df.loc[condition,"validation_reason"]+= "Mortgage at unrealistic age;"

In [11]:
#Soft Anomalies
#Rule 7: High DTI
condition= df['DTIRatio'] >0.6
df.loc[condition,"soft_anomaly_flag"]=1
df.loc[condition,"validation_reason"]+= "High DTI;"

In [12]:
#Soft Anomalies
#Rule 8: Loan amount exceeds income
condition= df['LoanAmount']> df['Income']
df.loc[condition,"soft_anomaly_flag"]=1
df.loc[condition,"validation_reason"]+= "Loan Amount Exceeds Income;"

In [13]:
#Soft Anomalies
#Rule 9: High credit score but defaulted
condition= (df['CreditScore']>700)&( df['Default']==1)
df.loc[condition,"soft_anomaly_flag"]=1
df.loc[condition,"validation_reason"]+= "High credit score yet loan defaulted;"

In [14]:
#Updated Datasets
#Flagged dataset with hard invalid flag
flagged_dataset = df.copy()
flagged_dataset.to_csv("../Data/Processed/loan_flagged_dataset.csv", index=False)
df.head

<bound method NDFrame.head of             LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0       SWWFW99YN1   43  112742      107552          300               0   
1       DSL4O0KAWD   64   73743      140354          300               0   
2       S5IDRA0LCA   22   46351      151652          300               0   
3       4X63ZOH9Y4   56   42359      163692          300               0   
4       7QM3SGA1AU   23   52705      218580          300               0   
...            ...  ...     ...         ...          ...             ...   
255342  0EYCL022UL   56   49546       93890          848             119   
255343  4K5YHA2OYS   29   99252      107216          848             119   
255344  2Z578Y0CMK   32   51735      151492          848             119   
255345  IK0XWQGYVK   31  104136      181314          848             119   
255346  GK8JXLUE8V   38  112736       73075          849             119   

        NumCreditLines  InterestRate  LoanTerm  DTIRatio 

In [15]:
#Clean dataset after removing hard invalid flag
clean_dataset = df[df["hard_invalid_flag"] == 0].copy()
clean_dataset.to_csv("../Data/Processed/loan_clean_dataset.csv", index=False)
clean_dataset.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,...,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default,hard_invalid_flag,soft_anomaly_flag,validation_reason
1,DSL4O0KAWD,64,73743,140354,300,0,2,4.12,12,0.24,...,Self-employed,Single,Yes,No,Education,Yes,0,0,1,Loan Amount Exceeds Income;
2,S5IDRA0LCA,22,46351,151652,300,0,4,10.04,36,0.51,...,Part-time,Single,No,Yes,Other,Yes,0,0,1,Loan Amount Exceeds Income;
3,4X63ZOH9Y4,56,42359,163692,300,0,3,4.98,60,0.71,...,Part-time,Divorced,No,Yes,Education,No,0,0,1,High DTI;Loan Amount Exceeds Income;
4,7QM3SGA1AU,23,52705,218580,300,0,3,22.31,60,0.54,...,Unemployed,Divorced,No,Yes,Business,Yes,0,0,1,Loan Amount Exceeds Income;
5,CDH9OTQ8H6,58,131487,15557,301,0,4,21.35,36,0.38,...,Part-time,Divorced,No,No,Other,Yes,0,0,0,
