In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as tts

In [2]:
dataset = pd.read_csv("loan_approval_dataset.csv")

In [3]:
dataset.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
dataset.shape

(4269, 13)

In [5]:
dataset.nunique()

loan_id                      4269
 no_of_dependents               6
 education                      2
 self_employed                  2
 income_annum                  98
 loan_amount                  378
 loan_term                     10
 cibil_score                  601
 residential_assets_value     278
 commercial_assets_value      188
 luxury_assets_value          379
 bank_asset_value             146
 loan_status                    2
dtype: int64

In [6]:
dataset.duplicated().sum()


0

In [7]:
dataset = dataset.drop(columns=['loan_id'])

In [8]:
dataset.columns

Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [9]:
dataset.columns = [c.strip() for c in dataset.columns]
dataset.columns

Index(['no_of_dependents', 'education', 'self_employed', 'income_annum',
       'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
       'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
       'loan_status'],
      dtype='object')

In [10]:
x = dataset.drop(columns=['loan_status'])
y = dataset['loan_status']

In [11]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   no_of_dependents          4269 non-null   int64 
 1   education                 4269 non-null   object
 2   self_employed             4269 non-null   object
 3   income_annum              4269 non-null   int64 
 4   loan_amount               4269 non-null   int64 
 5   loan_term                 4269 non-null   int64 
 6   cibil_score               4269 non-null   int64 
 7   residential_assets_value  4269 non-null   int64 
 8   commercial_assets_value   4269 non-null   int64 
 9   luxury_assets_value       4269 non-null   int64 
 10  bank_asset_value          4269 non-null   int64 
dtypes: int64(9), object(2)
memory usage: 367.0+ KB


In [12]:
# Apply the domain Knowledge
## create one assets value for combining all assets
x['total_assets_value'] = x['residential_assets_value'] + x['commercial_assets_value'] + x['luxury_assets_value'] + x['bank_asset_value']

x = x.drop(columns=['bank_asset_value','luxury_assets_value','commercial_assets_value','residential_assets_value'])

In [13]:
y.unique()

array([' Approved', ' Rejected'], dtype=object)

In [14]:
x['education'] = [c.strip() for c in x['education']]
x['self_employed'] = [c.strip() for c in x['self_employed']]
y = y.str.strip()



In [15]:
x['education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [16]:
# Convert the above column in numerical value
def transform_to_binary(df, column_to_transform):
    for column_name, positive_value in column_to_transform.items():
        df[column_name] = df[column_name].apply(lambda x: 1 if x is positive_value else 0)
        
    return df


colum_to_transform = {
    'education': ['Graduate'],
  'self_employed': ['Yes']
}


transformed_x = transform_to_binary(x,colum_to_transform)

In [17]:
x.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,0,0,9600000,29900000,12,778,50700000
1,0,0,0,4100000,12200000,8,417,17000000
2,3,0,0,9100000,29700000,20,506,57700000
3,3,0,0,8200000,30700000,8,467,52700000
4,5,0,0,9800000,24200000,20,382,55000000


In [18]:
# Log transformation
log_cols = ['income_annum', "loan_amount", "total_assets_value"]
x[log_cols]  = np.log(x[log_cols])

In [19]:
y = y.map({'Approved': 1, "Rejected": 0})

In [20]:
y.head()

0    1
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [22]:
#Split the daatset
x_test, x_train, y_test, y_train = tts(x, y, test_size=0.3, random_state=10)

In [23]:
# Training Model 
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()
log.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
y_pred_test = log.predict(x_test)

In [25]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred_test)
print(acc)

0.9086345381526104


# Serialization and Deserialization

In [26]:
import joblib
joblib.dump(log, "my_trained_model.pkl")

['my_trained_model.pkl']

In [27]:
final_model = joblib.load("my_trained_model.pkl")

In [28]:
final_model.intercept_, final_model.coef_

(array([-12.70675546]),
 array([[-0.01851206,  0.        ,  0.        , -2.32082842,  1.73692308,
         -0.15659762,  0.02294062,  0.55594001]]))

In [29]:
log.intercept_, log.coef_

(array([-12.70675546]),
 array([[-0.01851206,  0.        ,  0.        , -2.32082842,  1.73692308,
         -0.15659762,  0.02294062,  0.55594001]]))