In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df=pd.read_csv(r'E:\Hero\Data\data 2.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,DateOfBirth
0,HF001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,04-Jan-1985
1,HF001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,25-Oct-1989
2,HF001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,20-Aug-1986
3,HF001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,18-Aug-1996
4,HF001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,01-Jun-1984


In [3]:
df['DateOfBirth']=pd.to_datetime(df['DateOfBirth'],format='%d-%b-%Y')

In [4]:
df['Property_Area'].value_counts()

Property_Area
Semiurban    233
Urban        202
Rural        179
Name: count, dtype: int64

In [5]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,DateOfBirth
0,HF001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,1985-01-04
1,HF001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,1989-10-25
2,HF001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,1986-08-20
3,HF001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,1996-08-18
4,HF001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,1984-06-01


In [6]:
df['DayOfBirth']=df['DateOfBirth'].dt.day
df['MonthOfBirth']=df['DateOfBirth'].dt.month
df['YearOfBirth']=df['DateOfBirth'].dt.year

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Loan_ID            614 non-null    object        
 1   Gender             601 non-null    object        
 2   Married            611 non-null    object        
 3   Dependents         599 non-null    object        
 4   Education          614 non-null    object        
 5   Self_Employed      582 non-null    object        
 6   ApplicantIncome    614 non-null    int64         
 7   CoapplicantIncome  614 non-null    float64       
 8   LoanAmount         592 non-null    float64       
 9   Loan_Amount_Term   600 non-null    float64       
 10  Credit_History     564 non-null    float64       
 11  Property_Area      614 non-null    object        
 12  Loan_Status        614 non-null    object        
 13  DateOfBirth        614 non-null    datetime64[ns]
 14  DayOfBirth

In [8]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
DateOfBirth           0
DayOfBirth            0
MonthOfBirth          0
YearOfBirth           0
dtype: int64

* `Gender` , `Married`,  `Dependents` , `Self_Employed`,  `LoanAmount`,  `Loan_Amount_Term`, `Credit_History` have some missing values.
* 

In [9]:
X=df.drop(['Loan_Status','DateOfBirth','Loan_ID'],axis=1)
y=df['Loan_Status']

In [10]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

## **Segregating Numerical and Categorical Columns:-**

In [11]:
categorical_column=X.select_dtypes(include='object').columns
numerical_column=X.select_dtypes(exclude='object').columns


In [12]:
X.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'DayOfBirth',
       'MonthOfBirth', 'YearOfBirth'],
      dtype='object')

In [13]:
categorical_column

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')

In [14]:
numerical_column

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'DayOfBirth', 'MonthOfBirth',
       'YearOfBirth'],
      dtype='object')

In [15]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder()),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_column),
('cat_pipeline',cat_pipeline,categorical_column)
])


In [16]:
preprocessor

In [17]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [18]:
X_train.head()

Unnamed: 0,num_pipeline__ApplicantIncome,num_pipeline__CoapplicantIncome,num_pipeline__LoanAmount,num_pipeline__Loan_Amount_Term,num_pipeline__Credit_History,num_pipeline__DayOfBirth,num_pipeline__MonthOfBirth,num_pipeline__YearOfBirth,cat_pipeline__Gender,cat_pipeline__Married,cat_pipeline__Dependents,cat_pipeline__Education,cat_pipeline__Self_Employed,cat_pipeline__Property_Area
0,-0.649376,10.229235,-0.624387,0.290885,0.430213,-0.685008,-0.503164,-0.636964,0.493435,-1.406829,-0.715002,-0.536925,-0.399324,1.2635
1,-0.680639,0.718413,0.655044,0.290885,-2.324432,0.93474,1.583855,0.305954,0.493435,0.710819,-0.715002,1.862458,-0.399324,-0.008898
2,-0.497338,-0.040624,-0.35928,-2.498754,-2.324432,1.397525,-1.099456,-0.071213,0.493435,0.710819,0.290689,1.862458,-0.399324,-1.281296
3,-0.27533,-0.517007,-0.382333,0.290885,0.430213,0.471954,-1.099456,0.494538,-2.026609,-1.406829,-0.715002,-0.536925,-0.399324,-0.008898
4,0.553525,-0.517007,0.09025,-0.638994,-2.324432,-0.222223,0.987564,1.437456,0.493435,0.710819,0.290689,-0.536925,-0.399324,-0.008898


In [19]:
X_test.head()

Unnamed: 0,num_pipeline__ApplicantIncome,num_pipeline__CoapplicantIncome,num_pipeline__LoanAmount,num_pipeline__Loan_Amount_Term,num_pipeline__Credit_History,num_pipeline__DayOfBirth,num_pipeline__MonthOfBirth,num_pipeline__YearOfBirth,cat_pipeline__Gender,cat_pipeline__Married,cat_pipeline__Dependents,cat_pipeline__Education,cat_pipeline__Self_Employed,cat_pipeline__Property_Area
0,-0.273097,-0.517007,-0.785757,0.290885,0.430213,-0.800704,-1.099456,-1.768467,0.493435,-1.406829,-0.715002,-0.536925,-0.399324,-0.008898
1,0.838804,0.065133,0.436042,-2.498754,0.430213,0.356258,-1.695747,-0.448381,0.493435,0.710819,0.290689,-0.536925,2.504234,1.2635
2,-0.40057,-0.517007,-0.80881,0.290885,0.430213,-0.685008,0.689418,0.871705,0.493435,0.710819,2.302071,1.862458,-0.399324,-0.008898
3,-0.299336,0.41194,0.078723,0.290885,0.430213,-0.685008,0.987564,-0.448381,0.493435,0.710819,0.290689,-0.536925,-0.399324,-0.008898
4,0.092202,-0.517007,-0.001961,0.290885,0.430213,1.628917,0.987564,-0.825548,0.493435,0.710819,-0.715002,-0.536925,-0.399324,1.2635


In [20]:
y_train.value_counts()

Loan_Status
Y    288
N    141
Name: count, dtype: int64

* Here our Output value is inbalenced form


## **Perform SMOTE Technique:-**

In [21]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()

In [22]:
X_train.shape

(429, 14)

In [23]:
X_train,y_train=oversample.fit_resample(X_train,y_train)

In [24]:
X_train.shape

(576, 14)

In [25]:
y_train.value_counts()

Loan_Status
N    288
Y    288
Name: count, dtype: int64

## **Define models:-**

In [26]:
models={
    'LogisticRegression':LogisticRegression(),
    'SVC':SVC(),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
    'RandomForestClassifier':RandomForestClassifier(),
    'GradientBoostingClassifier':GradientBoostingClassifier(),
    'AdaBoostClassifier':AdaBoostClassifier(),
    'KNeighborsClassifier':KNeighborsClassifier(algorithm='auto',p=2)
}

## **check parameters:-**

In [27]:
params={
    "LogisticRegression": {
        'penalty':['l1', 'l2', 'elasticnet', None],
        'solver':['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
    },
    
    
    "SVC":{
        'kernel':['linear', 'poly', 'rbf', 'sigmoid'],      
    },
    
    
    'DecisionTreeClassifier':{
        'criterion':['gini', 'entropy', 'log_loss'],
        'max_depth':[5,10,15,20,25,30,35,40],
        'splitter':['best','random']
        },
    
    
    
    'RandomForestClassifier':{
        'n_estimators': [10,20,30,40,50,60,70,80],
        'criterion':['gini', 'entropy', 'log_loss'],
        'max_depth':[5,10,15,20,25]
        
    },
    
    'GradientBoostingClassifier':{
        'loss':['log_loss','exponential'],
        'n_estimators':[50,100,150],
        'criterion':['friedman_mse','squared_error'],
        'learning_rate':[.1,.01,.05,.001]
    },
    
    
    
    'AdaBoostClassifier':{
        'n_estimators':[50,100,150,200],
        'algorithm':['SAMME', 'SAMME.R'],
        'learning_rate':[.1,.01,.05,.001]  
    },
    
    
    
    
    'KNeighborsClassifier':{
        'n_neighbors':[3,4,5,6,7,8]
        
        
    }
}

In [28]:
from sklearn.metrics import accuracy_score

### **Check all Models with given 75% accuracy**

In [29]:
from sklearn.model_selection import GridSearchCV
def get_best_model(X_train,y_train,X_test,y_test,model_dict,params_,base_accuracy=0.75):
    models=model_dict.keys()
    for model in models:
        print(f'Intial base accuracy is {base_accuracy}\n')  
        print(20*'*','Estimator:',model,20*'*')
        gs=GridSearchCV(estimator=model_dict[model],param_grid=params_[model],cv=3)
        gs.fit(X_train,y_train)
        print(f'Best parameters are : {gs.best_params_} With accuracy score: {gs.best_score_}')
        
        classifier=model_dict[model].set_params(**gs.best_params_)
        classifier.fit(X_test,y_test)
        
        y_train_pred=classifier.predict(X_train)
        y_test_pred=classifier.predict(X_test)
        
        train_model_score = accuracy_score(y_train, y_train_pred)
        test_model_score = accuracy_score(y_test, y_test_pred)
        
        if test_model_score >= base_accuracy:
            print('Acceptabel model found!')
            base_accuracy = test_model_score
            best_model = {model:classifier}
        else:
            print('Model rejected\n',30*'__')
        
    print(f"Best model is:{list(best_model.keys())} with accuracy score: {(test_model_score,2)}")        
    return best_model
        

In [30]:
import warnings
warnings.filterwarnings("ignore")

In [31]:
get_best_model(X_train,y_train,X_test,y_test,model_dict=models,params_=params)

Intial base accuracy is 0.75

******************** Estimator: LogisticRegression ********************
Best parameters are : {'penalty': 'l1', 'solver': 'saga'} With accuracy score: 0.7204861111111112
Acceptabel model found!
Intial base accuracy is 0.827027027027027

******************** Estimator: SVC ********************
Best parameters are : {'kernel': 'rbf'} With accuracy score: 0.75
Acceptabel model found!
Intial base accuracy is 0.8432432432432433

******************** Estimator: DecisionTreeClassifier ********************
Best parameters are : {'criterion': 'entropy', 'max_depth': 15, 'splitter': 'random'} With accuracy score: 0.796875
Acceptabel model found!
Intial base accuracy is 1.0

******************** Estimator: RandomForestClassifier ********************
Best parameters are : {'criterion': 'gini', 'max_depth': 15, 'n_estimators': 70} With accuracy score: 0.859375
Acceptabel model found!
Intial base accuracy is 1.0

******************** Estimator: GradientBoostingClassifie

{'GradientBoostingClassifier': GradientBoostingClassifier(loss='exponential', n_estimators=150)}

In [32]:
model=RandomForestClassifier(criterion='gini',max_depth=20,n_estimators=80)

In [33]:
model.fit(X_train,y_train)

In [34]:
y_pred=model.predict(X_test)

In [35]:
y_pred

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'N', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

In [36]:
print(accuracy_score(y_test,y_pred))

0.772972972972973


## **Check on New Data:-**

In [37]:
df1=pd.read_csv(r'E:\Hero\Data\data 1.csv')
df1.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,DateOfBirth
0,HF001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,15-Jun-1992
1,HF001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,28-Sep-1993
2,HF001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,25-Nov-1991
3,HF001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban,03-Jun-1984
4,HF001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,22-Mar-1982


In [38]:
df1['DateOfBirth']=pd.to_datetime(df1['DateOfBirth'],format='%d-%b-%Y')
df1['DayOfBirth']=df1['DateOfBirth'].dt.day
df1['MonthOfBirth']=df1['DateOfBirth'].dt.month
df1['YearOfBirth']=df1['DateOfBirth'].dt.year

In [39]:
x=df1.drop(['Loan_ID','DateOfBirth'],axis=1)

In [40]:
x.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,DayOfBirth,MonthOfBirth,YearOfBirth
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,15,6,1992
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,28,9,1993
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,25,11,1991
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban,3,6,1984
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,22,3,1982


In [41]:
x_check=pd.DataFrame(preprocessor.transform(x),columns=preprocessor.get_feature_names_out())

In [42]:
x_check

Unnamed: 0,num_pipeline__ApplicantIncome,num_pipeline__CoapplicantIncome,num_pipeline__LoanAmount,num_pipeline__Loan_Amount_Term,num_pipeline__Credit_History,num_pipeline__DayOfBirth,num_pipeline__MonthOfBirth,num_pipeline__YearOfBirth,cat_pipeline__Gender,cat_pipeline__Married,cat_pipeline__Dependents,cat_pipeline__Education,cat_pipeline__Self_Employed,cat_pipeline__Property_Area
0,0.073407,-0.517007,-0.393859,0.290885,0.430213,-0.106527,-0.205019,0.117370,0.493435,0.710819,-0.715002,-0.536925,-0.399324,1.263500
1,-0.418621,-0.040624,-0.209437,0.290885,0.430213,1.397525,0.689418,0.305954,0.493435,0.710819,0.290689,-0.536925,-0.399324,1.263500
2,-0.060580,0.054653,0.735728,0.290885,0.430213,1.050436,1.285710,-0.071213,0.493435,0.710819,1.296380,-0.536925,-0.399324,1.263500
3,-0.555585,0.291574,-0.509123,0.290885,0.430213,-1.494882,-0.205019,-1.391299,0.493435,0.710819,1.296380,-0.536925,-0.399324,1.263500
4,-0.381403,-0.517007,-0.762704,0.290885,0.430213,0.703347,-1.099456,-1.768467,0.493435,-1.406829,-0.715002,1.862458,-0.399324,1.263500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,-0.244997,0.047348,-0.359280,0.290885,0.430213,1.050436,0.391273,1.248872,0.493435,0.710819,2.302071,1.862458,2.504234,1.263500
363,-0.217269,-0.291836,-0.336227,0.290885,0.430213,0.934740,-0.205019,1.437456,0.493435,0.710819,-0.715002,-0.536925,-0.399324,1.263500
364,-0.386241,0.115947,-0.209437,0.290885,0.430213,-0.106527,-0.503164,0.305954,0.493435,-1.406829,-0.715002,-0.536925,-0.399324,-0.008898
365,-0.060580,0.242983,0.159408,0.290885,0.430213,1.628917,-1.099456,0.871705,0.493435,0.710819,-0.715002,-0.536925,-0.399324,-1.281296


In [43]:
model.predict(x_check)

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

### **Save the estimator:-**

In [44]:
import pickle
file=open('model.pkl','wb')
pickle.dump(model,file)
file=open('preprocessor.pkl','wb')
pickle.dump(preprocessor,file)