In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
#read data
df = pd.read_csv("./credit_train.csv")
df.head()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6.0,1.0,228190.0,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328.0,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,,9.0,0.0,256329.0,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220.0,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,,15.0,0.0,253460.0,427174.0,0.0,0.0


In [3]:
#lets check unique values in each of the columns
for col in df.columns:
    print(col+" === ", df[col].unique())

Loan ID ===  ['14dd8831-6af5-400b-83ec-68e61888a048'
 '4771cc26-131a-45db-b5aa-537ea4ba5342'
 '4eed4e6a-aa2f-4c91-8651-ce984ee8fb26' ...
 '81ab928b-d1a5-4523-9a3c-271ebb01b4fb'
 'c63916c6-6d46-47a9-949a-51d09af4414f' nan]
Customer ID ===  ['981165ec-3274-42f5-a3b4-d104041a9ca9'
 '2de017a3-2e01-49cb-a581-08169e83be29'
 '5efb2b2b-bf11-4dfd-a572-3761a2694725' ...
 '3e45ffda-99fd-4cfc-b8b8-446f4a505f36'
 '1b3014be-5c07-4d41-abe7-44573c375886' nan]
Loan Status ===  ['Fully Paid' 'Charged Off' nan]
Current Loan Amount ===  [  445412.   262328. 99999999. ...   100254.   274076.       nan]
Term ===  ['Short Term' 'Long Term' nan]
Credit Score ===  [ 709.   nan  741.  721. 7290.  730.  678.  739.  728.  740.  743.  727.
  723.  747.  687.  750.  714.  724.  704.  688.  749.  746.  737.  729.
  733.  725.  745.  720.  718.  682. 7120.  680.  710.  598.  719. 6610.
  652.  736. 7380.  644.  672. 7370.  699.  751.  694.  675.  657.  748.
  666.  734.  742.  705.  731. 6240.  712.  685.  717.  722.

In [4]:
#dropping some column which cannot be used for model
df.drop(columns=['Loan ID', 'Customer ID'], axis=1, inplace=True)

In [5]:
# Lets make it a classification problem
df['Loan Status'] = df['Loan Status'].apply(lambda x: 1 if x == "Fully Paid" else 0)

In [6]:
#To get information
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Loan Status,100514.0,0.769654,0.4210564,0.0,1.0,1.0,1.0,1.0
Current Loan Amount,100000.0,11760450.0,31783940.0,10802.0,179652.0,312246.0,524942.0,100000000.0
Credit Score,80846.0,1076.456,1475.404,585.0,705.0,724.0,741.0,7510.0
Annual Income,80846.0,1378277.0,1081360.0,76627.0,848844.0,1174162.0,1650663.0,165557400.0
Monthly Debt,100000.0,18472.41,12174.99,0.0,10214.1625,16220.3,24012.06,435843.3
Years of Credit History,100000.0,18.19914,7.015324,3.6,13.5,16.9,21.7,70.5
Months since last delinquent,46859.0,34.90132,21.99783,0.0,16.0,32.0,51.0,176.0
Number of Open Accounts,100000.0,11.12853,5.00987,0.0,8.0,10.0,14.0,76.0
Number of Credit Problems,100000.0,0.16831,0.482705,0.0,0.0,0.0,0.0,15.0
Current Credit Balance,100000.0,294637.4,376170.9,0.0,112670.0,209817.0,367958.8,32878970.0


In [7]:
#little bit changes required specific to any columns
df['Years in current job'] =df['Years in current job'].str.replace(r'[^\d.]+', '')
df['Years in current job']=pd.to_numeric(df['Years in current job'])
df['Years in current job']

0          8.0
1         10.0
2          8.0
3          3.0
4          5.0
          ... 
100509     NaN
100510     NaN
100511     NaN
100512     NaN
100513     NaN
Name: Years in current job, Length: 100514, dtype: float64

In [8]:
#find out numerical and categorical columns
numerical_columns = df._get_numeric_data()
categorical_columns = list(set(df.columns)-set(numerical_columns.columns))

In [9]:
#treating missing value
# from sklearn.impute import SimpleImputer
df[numerical_columns.columns.to_list()] = SimpleImputer(strategy='median').fit_transform(df[numerical_columns.columns.to_list()])
df[numerical_columns.columns.to_list()]

Unnamed: 0,Loan Status,Current Loan Amount,Credit Score,Annual Income,Years in current job,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,1.0,445412.0,709.0,1167493.0,8.0,5214.74,17.2,32.0,6.0,1.0,228190.0,416746.0,1.0,0.0
1,1.0,262328.0,724.0,1174162.0,10.0,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,1.0,99999999.0,741.0,2231892.0,8.0,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,1.0,347666.0,721.0,806949.0,3.0,8741.90,12.0,32.0,9.0,0.0,256329.0,386958.0,0.0,0.0
4,1.0,176220.0,724.0,1174162.0,5.0,20639.70,6.1,32.0,15.0,0.0,253460.0,427174.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100509,0.0,312246.0,724.0,1174162.0,6.0,16220.30,16.9,32.0,10.0,0.0,209817.0,467874.0,0.0,0.0
100510,0.0,312246.0,724.0,1174162.0,6.0,16220.30,16.9,32.0,10.0,0.0,209817.0,467874.0,0.0,0.0
100511,0.0,312246.0,724.0,1174162.0,6.0,16220.30,16.9,32.0,10.0,0.0,209817.0,467874.0,0.0,0.0
100512,0.0,312246.0,724.0,1174162.0,6.0,16220.30,16.9,32.0,10.0,0.0,209817.0,467874.0,0.0,0.0


In [10]:
#missing values in categorical columns. Lets add another value as "unknown"
df[categorical_columns].fillna("unknown")

Unnamed: 0,Purpose,Term,Home Ownership
0,Home Improvements,Short Term,Home Mortgage
1,Debt Consolidation,Short Term,Home Mortgage
2,Debt Consolidation,Short Term,Own Home
3,Debt Consolidation,Long Term,Own Home
4,Debt Consolidation,Short Term,Rent
...,...,...,...
100509,unknown,unknown,unknown
100510,unknown,unknown,unknown
100511,unknown,unknown,unknown
100512,unknown,unknown,unknown


In [11]:
print("Numerical columns in the dataframe :",numerical_columns.columns)

Numerical columns in the dataframe : Index(['Loan Status', 'Current Loan Amount', 'Credit Score', 'Annual Income',
       'Years in current job', 'Monthly Debt', 'Years of Credit History',
       'Months since last delinquent', 'Number of Open Accounts',
       'Number of Credit Problems', 'Current Credit Balance',
       'Maximum Open Credit', 'Bankruptcies', 'Tax Liens'],
      dtype='object')


In [12]:
print("Categorical columns in the dataframe :",categorical_columns)

Categorical columns in the dataframe : ['Purpose', 'Term', 'Home Ownership']


#### Feature Selection

I have both categorical and numerical columns.
For categorical, I need to convert it to numerical first using Label Encoder, after that we will do one hot encoding.

In [14]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict


le_dict  = defaultdict(LabelEncoder)
df[categorical_columns] = df[categorical_columns].apply(lambda x: le_dict[x.name].fit_transform(x.astype(str)))

### chi-square test

In [15]:
y = df['Loan Status']
X = df.loc[:, df.columns!='Loan Status']

In [16]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

def sel_feat_chi2(df, target, k):
    chi2_features = SelectKBest(chi2, k=k)
    best_features = chi2_features.fit(df, target)
    chi_sq = pd.DataFrame(best_features.scores_, columns=["chi_square"], index=df.columns)
    chi_sq = chi_sq.reset_index()
    chi_sq.sort_values('chi_square', ascending=False)
    return chi_sq
    
chi_sq = sel_feat_chi2(X, y, 5)
chi_sq

Unnamed: 0,index,chi_square
0,Current Loan Amount,332756800000.0
1,Term,189.5114
2,Credit Score,27748500.0
3,Annual Income,201067800.0
4,Years in current job,7.263417
5,Home Ownership,250.4534
6,Purpose,125.6894
7,Monthly Debt,30014.5
8,Years of Credit History,172.2778
9,Months since last delinquent,81.67487


Here I can see the feature importance myself and using my analytics skill, I can choose among them.

#### Using RandomForestClassifier

In [26]:
from sklearn.ensemble import RandomForestClassifier
def sel_feat_RF(df, target):
    clf = RandomForestClassifier()
    clf.fit(df, target)
    RF_feat = pd.DataFrame(clf.feature_importances_, columns = ["RF"], index = df.columns)
    RF_feat.reset_index(inplace=True)
    RF_feat.sort_values(['RF'], ascending = False)
    return RF_feat
RF_feat =sel_feat_RF(X, y)
RF_feat

Unnamed: 0,index,RF
0,Current Loan Amount,0.124941
1,Term,0.017517
2,Credit Score,0.211613
3,Annual Income,0.077671
4,Years in current job,0.038588
5,Home Ownership,0.020648
6,Purpose,0.021966
7,Monthly Debt,0.093223
8,Years of Credit History,0.085047
9,Months since last delinquent,0.051321


In [27]:
# Combine the dataframe from all and first select what method to be selected for this job
from functools import reduce
dfs = [chi_sq, RF_feat]
final_results = reduce(lambda left, right: pd.merge(left, right, on='index'), dfs)
final_results

Unnamed: 0,index,chi_square,RF
0,Current Loan Amount,332756800000.0,0.124941
1,Term,189.5114,0.017517
2,Credit Score,27748500.0,0.211613
3,Annual Income,201067800.0,0.077671
4,Years in current job,7.263417,0.038588
5,Home Ownership,250.4534,0.020648
6,Purpose,125.6894,0.021966
7,Monthly Debt,30014.5,0.093223
8,Years of Credit History,172.2778,0.085047
9,Months since last delinquent,81.67487,0.051321


In [28]:
#provide a method name and threshold to select features
select_using_method = 'RF'
threshold = 0.001
selected_features = final_results[final_results[select_using_method]>=threshold]['index']
selected_features

0              Current Loan Amount
1                             Term
2                     Credit Score
3                    Annual Income
4             Years in current job
5                   Home Ownership
6                          Purpose
7                     Monthly Debt
8          Years of Credit History
9     Months since last delinquent
10         Number of Open Accounts
11       Number of Credit Problems
12          Current Credit Balance
13             Maximum Open Credit
14                    Bankruptcies
15                       Tax Liens
Name: index, dtype: object

# Hope it helps

## Thanks