### note: To run the code, a library called hyperopt must be installed

###  pip install hyperopt

### Use cases for HyperOpt

### 1) Hyper parameters optimization!
### 2) Creating a pipeline of execution of ML models

In [73]:
import pandas as pd
import plotly.express as pe
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,f1_score,roc_auc_score,accuracy_score
from hyperopt import hp,tpe,fmin,Trials,STATUS_OK,space_eval
import numpy as np


## Step 1 : Gather the data

In [74]:
path = "/home/harshit/Desktop/TataSteelML2023/dataset/Loan_Status_Classification.csv"
df = pd.read_csv(path)
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,1,0,0,6608,0,137,180,1,1,1
1,0,1,2,0,0,4226,1040,110,360,1,1,1
2,1,1,0,1,0,3167,2283,154,360,1,2,1
3,0,0,0,1,1,6950,0,175,180,1,2,1
4,0,1,0,1,0,3993,3274,207,360,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
293,1,0,0,1,0,3846,0,111,360,1,2,1
294,0,0,0,1,0,2435,0,75,360,1,1,0
295,0,0,2,1,0,4923,0,166,360,0,2,1
296,0,1,3,0,0,2071,754,94,480,1,2,1


## step 2 : Exploratory Data Analysis (EDA)

In [75]:
df.shape

(298, 12)

In [76]:
df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [77]:
df.nunique()

Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      257
CoapplicantIncome    150
LoanAmount           145
Loan_Amount_Term       9
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [78]:
df[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount' ]].describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
count,298.0,298.0,298.0
mean,5351.265101,1673.026846,143.560403
std,6306.080712,2892.404818,80.395182
min,150.0,0.0,9.0
25%,2883.75,0.0,99.25
50%,3854.0,1106.0,125.5
75%,5721.5,2281.0,171.5
max,81000.0,33837.0,600.0


In [79]:
sc = StandardScaler()
#for every column that is a feature.
    ## Take the column
    ## Apply Z score adjustment for the values in the column
    ## Overwrite the same column in df
for col in ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount' ]:
    df[[col]]   =  sc.fit_transform(df[[col]])

df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,1,0,0,0.199625,-0.579394,-0.081739,180,1,1,1
1,0,1,2,0,0,-0.178741,-0.219226,-0.418145,360,1,1,1
2,1,1,0,1,0,-0.346957,0.211243,0.130072,360,1,2,1
3,0,0,0,1,1,0.253949,-0.579394,0.391721,180,1,2,1
4,0,1,0,1,0,-0.215752,0.554440,0.790424,360,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
293,1,0,0,1,0,-0.239102,-0.579394,-0.405686,360,1,2,1
294,0,0,0,1,0,-0.463231,-0.579394,-0.854227,360,1,1,0
295,0,0,2,1,0,-0.068027,-0.579394,0.279586,360,0,2,1
296,0,1,3,0,0,-0.521050,-0.318272,-0.617497,480,1,2,1


In [80]:
pd.crosstab(index=df['Gender'], columns=df['Loan_Status'] )

Loan_Status,0,1
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
0,116,129
1,32,21


# Encoding must be done if not already. Also choose right type of encoding!!!

# Step 3 : Extracting features & columns

In [81]:

#list comprehension! : all columns apart from Loan_Status
features = df [ [col for col in df.columns if col != "Loan_Status"]].copy()

#target is Loan_Status
label = df['Loan_Status'].copy()

#### step 4 : Separate dataset into training and testing test

In [82]:
X_train, X_test, y_train, y_test = train_test_split(  
    features,label,
    test_size=0.2,
    random_state=10,
    stratify = label
)

#### HYPEROPT!!

Age     Income   Loan_Amount     Status(target column)

0.1   0.6  0.3 (weights)


L1 regularization : It helps the model to elimiate non-important features completely from the process of generating result.
    Advantages:
        1) Reduce features to be used for final answers.
        2) Identify how important a feature is to the end result

L2 regularization: Solves the problem of overfitting (model should not show great performance in training and significantly become inaccurate during testing)

C (learning rate???)



Age: less important
Income & Loan_Amount is more important.

If model is able to do this, accuracy of the model

### HYPEROPT: 

    #1) Create a search space: A set of values to be TRIED for various parameters
    #2) Create an objective function that will work on minimization principle to find the best model
    #3) Apply the objective function on the search space

Note:

    a)Keys need to match with parameters to be adjusted
    b) if there are options to choose from (a set of values), use hp.choice function
        but
        if there is a range of values in mind for a certain parameter choose hp.uniform

In [83]:
#keys of this dictionary are names of parameters
space ={

    'C' : hp.uniform(  'alpha parameter', 0, 10  ), #choose some number between 0 to 3
    'penalty' : hp.choice('penalty type name', ['l1', 'l2']), #adding 2 options to choose from for penalty parameter
    'solver' : hp.choice('solver', ['liblinear']) #choose one of the  solver algorithms

}

In [85]:
scores=[]
def objective(space):
    model = LogisticRegression(**space)

    model.fit(X_train, y_train)

    pred = model.predict(X_test)

    accuracy = accuracy_score(y_test,pred)


    scores.append(accuracy)
    return {'loss': -accuracy,'status':STATUS_OK}

In [86]:
trials = Trials() #create a database (in-memory)

      #fmin is the function to be used for minimum optimization
best = fmin(fn=objective, #function
            space=space, #search space parameters
            algo=tpe.suggest,
            max_evals=200,
            trials=trials)


print (space_eval(space, best))

100%|██████████| 200/200 [00:02<00:00, 80.24trial/s, best loss: -0.6833333333333333]
{'C': 0.17497093961929155, 'penalty': 'l1', 'solver': 'liblinear'}
