### Objective: choose the best algorithm out of Dtree, Logistic Regression & SVM with right hyperparameters for classifying credit risk customers as loan accepted and rejected

In [1]:
#for loading & visualization
import pandas as pd
import plotly.express as pe

#for preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

#for dimensionality reduction
from sklearn.decomposition import PCA

#for model algorithms
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#for hyperopt functions
from hyperopt import hp,tpe,fmin,Trials,STATUS_OK,space_eval

#for stopping hyperopt early
from hyperopt.early_stop import no_progress_loss

#for metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score


In [2]:
path = "/home/harshit/Desktop/TataSteelML2023/dataset/Balanced_credit_Risk.csv"
df = pd.read_csv(path)
df

Unnamed: 0.1,Unnamed: 0,index,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,0,0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,1,2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,2,3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,3,4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,4,5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12401,12401,26464,30,102540,MORTGAGE,6.0,HOMEIMPROVEMENT,A,1500,7.90,0,0.01,N,5
12402,12402,12567,24,60000,RENT,0.0,PERSONAL,B,12000,12.21,0,0.20,N,2
12403,12403,6443,22,40000,RENT,0.0,EDUCATION,C,6000,12.87,0,0.15,Y,3
12404,12404,8967,22,50000,RENT,2.0,PERSONAL,C,8000,13.16,0,0.16,Y,2


## step 2: Data exploration & preprocessing

In [3]:
print(df.shape, df.columns, df.index, sep="\n")

(12406, 14)
Index(['Unnamed: 0', 'index', 'person_age', 'person_income',
       'person_home_ownership', 'person_emp_length', 'loan_intent',
       'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_status',
       'loan_percent_income', 'cb_person_default_on_file',
       'cb_person_cred_hist_length'],
      dtype='object')
RangeIndex(start=0, stop=12406, step=1)


Conclusion: 200 rows and 6 columns with row numbers set as index

In [4]:
display(df.info())

display(df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12406 entries, 0 to 12405
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  12406 non-null  int64  
 1   index                       12406 non-null  int64  
 2   person_age                  12406 non-null  int64  
 3   person_income               12406 non-null  int64  
 4   person_home_ownership       12406 non-null  object 
 5   person_emp_length           12406 non-null  float64
 6   loan_intent                 12406 non-null  object 
 7   loan_grade                  12406 non-null  object 
 8   loan_amnt                   12406 non-null  int64  
 9   loan_int_rate               12406 non-null  float64
 10  loan_status                 12406 non-null  int64  
 11  loan_percent_income         12406 non-null  float64
 12  cb_person_default_on_file   12406 non-null  object 
 13  cb_person_cred_hist_length  124

None

Unnamed: 0                    0
index                         0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

Conclusion: No missing data in the dataset

In [5]:
display(df.nunique())

Unnamed: 0                    12406
index                         12406
person_age                       50
person_income                  2184
person_home_ownership             4
person_emp_length                33
loan_intent                       6
loan_grade                        7
loan_amnt                       576
loan_int_rate                   340
loan_status                       2
loan_percent_income              77
cb_person_default_on_file         2
cb_person_cred_hist_length       29
dtype: int64

Age & Na_to_K are real-value columns

Drug is the target (categorical)

BP, Cholesterol & Sex are categorical feature columns

### Visualizing relation between feature & target

#  step 2b) Preprocess the data

In [6]:
categorical_features=["person_home_ownership","loan_intent","loan_grade","cb_person_default_on_file"]

real_value_features=["person_age","person_income","person_emp_length","loan_amnt","loan_int_rate","loan_percent_income","cb_person_cred_hist_length"]

### Scaling of features

In [7]:
sc = StandardScaler()

for col in real_value_features:
    df[[col]] = sc.fit_transform(df[[col]])

display(df[real_value_features].describe())

fig = pe.box(y=real_value_features, data_frame=df)
display(   fig    )

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
count,12406.0,12406.0,12406.0,12406.0,12406.0,12406.0,12406.0
mean,2.164962e-16,7.331087e-17,9.163859e-17,0.0,3.665544e-17,-7.331087e-17,5.154671e-17
std,1.00004,1.00004,1.00004,1.00004,1.00004,1.00004,1.00004
min,-1.215713,-1.266214,-1.076688,-1.402743,-1.877024,-1.6197,-0.9176119
25%,-0.735477,-0.5789326,-0.6039438,-0.77517,-0.8389782,-0.7999015,-0.6725704
50%,-0.2552408,-0.2130281,-0.1311994,-0.207366,0.01328117,-0.2260426,-0.4275289
75%,0.3850741,0.282754,0.5779174,0.569628,0.7770707,0.6757357,0.5526371
max,18.63405,29.21928,27.9971,3.707491,3.372186,5.184627,5.94355


### categorical columns encoding

In [8]:
le = LabelEncoder()

for col in categorical_features:
    df[col] = le.fit_transform(df[col])

df[categorical_features]

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
0,3,4,3,1
1,0,3,2,0
2,3,3,2,0
3,3,3,2,1
4,2,5,0,0
...,...,...,...,...
12401,0,2,0,0
12402,3,4,1,0
12403,3,1,2,1
12404,3,4,2,1


In [9]:
model = PCA(n_components=5)

ans = model.fit_transform(df[real_value_features])

result = pd.DataFrame(ans, columns=['PCA1', "PCA2", "PCA3", "PCA4", "PCA5"])

print(model.explained_variance_ratio_)

[0.28498837 0.22897119 0.18305285 0.1435785  0.11808111]


In [10]:
df.loan_status.value_counts(normalize=True)

1    0.5
0    0.5
Name: loan_status, dtype: float64

### creation of search space

## objective: to create 2 sets of parameters for 2 different algorithms.
            Hyperopt can only select one set at a time.
            Depending on whether we have set of SVM or set of logit model, operations also differ

steps: 
    a) Create separate dictionaries with parameters and their available options
    b) add a model_type key inside the dictionary to label the dictionary
    c) put all the dictionaries into a list/tuple to specify that this is a collection of parameter sets!
    d) since we need to choice one entry from list/tuple created in step c, use hp.choice on this list

In [11]:
#keys of this dictionary are names of parameters
space =hp.choice( 'algorithm',
    [
        {
        'model_type' : "SVM",
        'C' : hp.uniform(  'alpha parameter_SVM', 0, 10  ), #choose some number between 0 to 3
        'kernel' : hp.choice('penalty type name_SVM', ['rbf', 'poly', "sigmoid"]), #adding options to choose from for kernel parameter
        'degree' : hp.choice('degree', [1,2,3,4]),
        "gamma" : hp.choice("gamma value", ["scale", "auto"]),
        },

        {
        'model_type' : "logit",
        'C' : hp.uniform(  'alpha parameter_logit', 0, 10  ), #choose some number between 0 to 3
        'penalty' : hp.choice('penalty type name_logit', ['l1', 'l2']), #adding 2 options to choose from for penalty parameter
        'solver' : hp.choice('solver', ['liblinear']) #choose one of the  solver algorithms
        },

        {
            'model_type' : "dtree",
            'criterion' : hp.choice('criteria_tree', ['gini', 'entropy', 'log_loss']),
            'splitter' : hp.choice('splitter_tree', ['best', 'random']),
            'max_depth' : hp.choice('depth_tree', [x for x in range(1,12,1)])
        },

        {
            'model_type' : "random_forest",
            'criterion' : hp.choice('criteria_forest', ['gini', 'entropy', 'log_loss']),
            'max_depth' : hp.choice('depth_forest', [x for x in range(1,12,1)]),
            'n_estimators': hp.choice('estimator_count_forest', [x for x in range(5,125,20)]), 
            'max_features' : hp.choice('feature_count_forest', ['sqrt', 'log2'])
        }
       
    ]
)


### steps

a) Accept the entire search space as a parameter. One algorithm will be passed to the model at a time from this space

b) find out which algorithm is being picked in the current iteration by reading the model_type entry of the parameter set.

c) Since model_type is not a parameter for any ML algorithm class in sklearn, delete it before passing the parameter dictionary to your model

d) use a if condition check to run code according to selected algorithm

In [12]:
scores=[]
def objective(space):
    
    algo = space['model_type'] #step b
    del space['model_type']

    if  algo == "logit":
        features = real_value_features + categorical_features
        target = 'loan_status'

        X_train, X_test, y_train, y_test = train_test_split(  
        df[features],df[target],
        test_size=0.4,
        random_state=10,
        stratify = df[target]
        )

        model = LogisticRegression(**space)

        model.fit(X_train, y_train)

        pred = model.predict(X_test)

        accuracy = f1_score(y_test,pred, average='weighted')
        scores.append(accuracy)
        return {'loss': -accuracy,'status':STATUS_OK, "algo": algo}

    if algo == "SVM":
        features = real_value_features
        target = 'loan_status'

        X_train, X_test, y_train, y_test = train_test_split(  
        df[features],df[target],
        test_size=0.4,
        random_state=10,
        stratify = df[target]
        )
        model = SVC(**space)

        model.fit(X_train, y_train)

        pred = model.predict(X_test)

        accuracy = f1_score(y_test,pred, average='weighted')


        scores.append(accuracy)
        return {'loss': -accuracy,'status':STATUS_OK, "algo": algo}

    if  algo == "dtree":
        features = real_value_features + categorical_features
        target = 'loan_status'

        X_train, X_test, y_train, y_test = train_test_split(  
        df[features],df[target],
        test_size=0.4,
        random_state=10,
        stratify = df[target]
        )

        model = DecisionTreeClassifier(**space)

        model.fit(X_train, y_train)

        pred = model.predict(X_test)

        accuracy = f1_score(y_test,pred, average='weighted')
        scores.append(accuracy)
        return {'loss': -accuracy,'status':STATUS_OK, "algo": algo}

    if  algo == "random_forest":
        features = real_value_features + categorical_features
        target = 'loan_status'

        X_train, X_test, y_train, y_test = train_test_split(  
        df[features],df[target],
        test_size=0.4,
        random_state=10,
        stratify = df[target]
        )

        model = RandomForestClassifier(**space)

        model.fit(X_train, y_train)

        pred = model.predict(X_test)

        accuracy = f1_score(y_test,pred, average='weighted')
        scores.append(accuracy)
        return {'loss': -accuracy,'status':STATUS_OK, "algo": algo}
    

In [13]:
trials = Trials() #create a database (in-memory)

      #fmin is the function to be used for minimum optimization
best = fmin(fn=objective, #function
            space=space, #search space parameters
            algo=tpe.suggest,
            early_stop_fn=no_progress_loss(  iteration_stop_count=100 ), 
            max_evals=1000,
            trials=trials)


print (space_eval(space, best))

 11%|█         | 112/1000 [01:26<11:24,  1.30trial/s, best loss: -0.8484897490087435]
{'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'model_type': 'random_forest', 'n_estimators': 25}
