In [5]:
#importing the necessary library
import pandas as pd

In [6]:
# loading the train and test dataset
path = 'https://raw.githubusercontent.com/subashgandyer/datasets/main/loan_train.csv'

In [7]:
# Now reading the data
data = pd.read_csv(path)

In [8]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Checking the infomation of the dataset given


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


# The summary of the data

In [10]:
data.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# from the summary, there are some features with missing values, hence need to be replaced or removed, but they are being replaced

In [11]:
#Taking care of missing values
# since there are nan values we have to handle each accordingly
data['Gender'].mode() #since the mode is male, we fill the nan values with male
data['Gender'].fillna('male',inplace=True)

# FOR Married
data['Married'].mode()
data['Married'].fillna('Yes',inplace=True)

# FOR Self_Employed
data['Self_Employed'].mode()
data['Self_Employed'].fillna('Yes',inplace=True)

data['Dependents'] = data['Dependents'].replace(['3+'],3)
data['Dependents'].fillna(0,inplace=True)
data['Dependents'].fillna(0,inplace=True)

# noow filling all missing numerical values 
data.fillna(data.mean(),inplace=True)

# The missing values are taken care of, now more categorical data

In [12]:
data.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# change categorical data to numerical

In [13]:

# gender, married, education,self_employ - label encoder is used because of the nature of the features, it either 0 or 1
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()

In [14]:
#gender
data.Gender = le.fit_transform(data.Gender)

#marriage
data.Married = le.fit_transform(data.Married)

#employment
data.Self_Employed = le.fit_transform(data.Self_Employed)

#education
data.Education = le.fit_transform(data.Education)
                                  
#Loan_Status
data.Loan_Status = le.fit_transform(data.Loan_Status)

In [15]:
#oneHotEncoder is used here for the data that is not categorical
ohe = OneHotEncoder()
area = ohe.fit_transform(data[['Property_Area']]).toarray()
a_frame = pd.DataFrame(area)



data_new = pd.concat([data,a_frame],axis=1)

In [16]:
data_new

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,0,1,2
0,LP001002,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,Urban,1,0.0,0.0,1.0
1,LP001003,1,1,1,0,0,4583,1508.0,128.000000,360.0,1.0,Rural,0,1.0,0.0,0.0
2,LP001005,1,1,0,0,1,3000,0.0,66.000000,360.0,1.0,Urban,1,0.0,0.0,1.0
3,LP001006,1,1,0,1,0,2583,2358.0,120.000000,360.0,1.0,Urban,1,0.0,0.0,1.0
4,LP001008,1,0,0,0,0,6000,0.0,141.000000,360.0,1.0,Urban,1,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0,0,0,0,0,2900,0.0,71.000000,360.0,1.0,Rural,1,1.0,0.0,0.0
610,LP002979,1,1,3,0,0,4106,0.0,40.000000,180.0,1.0,Rural,1,1.0,0.0,0.0
611,LP002983,1,1,1,0,0,8072,240.0,253.000000,360.0,1.0,Urban,1,0.0,0.0,1.0
612,LP002984,1,1,2,0,0,7583,0.0,187.000000,360.0,1.0,Urban,1,0.0,0.0,1.0


# splitting data to x as independent and y as the target

In [17]:
X = data_new.loc[:,['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History',0,1,2]]
y = data['Loan_Status']

In [18]:
 X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,0,1,2
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,0.0,0.0,1.0
1,1,1,1,0,0,4583,1508.0,128.000000,360.0,1.0,1.0,0.0,0.0
2,1,1,0,0,1,3000,0.0,66.000000,360.0,1.0,0.0,0.0,1.0
3,1,1,0,1,0,2583,2358.0,120.000000,360.0,1.0,0.0,0.0,1.0
4,1,0,0,0,0,6000,0.0,141.000000,360.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,2900,0.0,71.000000,360.0,1.0,1.0,0.0,0.0
610,1,1,3,0,0,4106,0.0,40.000000,180.0,1.0,1.0,0.0,0.0
611,1,1,1,0,0,8072,240.0,253.000000,360.0,1.0,0.0,0.0,1.0
612,1,1,2,0,0,7583,0.0,187.000000,360.0,1.0,0.0,0.0,1.0


# Test and train

In [19]:
from sklearn.model_selection import train_test_split
train_X,test_X,train_y,test_y = train_test_split(X, y,train_size=0.5,test_size=0.5,random_state=123)

In [20]:
#Preprocessing the data with the minMaxScalar
from sklearn.preprocessing import scale,MinMaxScaler
min_max = MinMaxScaler()


In [21]:
x_train_minMax = min_max.fit_transform(train_X)
x_test_minMax = min_max.fit_transform(test_X)

# applying the first Algorithm, "Decision Tree"

In [22]:
#importing the decision tree classification model
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='gini',max_depth=None)

In [23]:
tree.fit(x_train_minMax,train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [24]:
prediction = tree.predict(x_test_minMax)

In [25]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_y,prediction))

[[ 40  63]
 [ 54 150]]


In [26]:
param_dist = {'criterion': ['gini','entropy'], 'max_depth': [1,2,3,4,5,6,7,None]}

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
grid = GridSearchCV(tree, param_grid=param_dist, cv=10, n_jobs=-1) 

In [29]:
grid.fit(x_train_minMax,train_y)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, None]},
          

# Parameter with best  results output

In [30]:
grid.best_params_

{'criterion': 'gini', 'max_depth': 1}

In [31]:
#the best parameter pair is gini and depth:1

In [32]:
grid.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [33]:
grid_predictions = grid.predict(x_test_minMax)

# The accuracy of the decision tree algo

In [34]:
print(confusion_matrix(test_y,grid_predictions),'\n with accuracy ',grid.best_score_)

[[ 43  60]
 [  4 200]] 
 with accuracy  0.8273615635179153


In [35]:
#The accuracy is 0.8274 to 4 decimal places

# Using KNN  ALGORITHM

In [36]:
# importing the algorithm from sklearn of scikitlearn
from sklearn.neighbors import KNeighborsClassifier

Knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [37]:
Knn.fit(train_X,train_y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [38]:
K_prediction = Knn.predict(test_X)

In [39]:
print(confusion_matrix(test_y,K_prediction))

[[ 12  91]
 [ 29 175]]


In [40]:
param_distK = {'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']} 

In [41]:
K_grid = GridSearchCV(KNeighborsClassifier(), param_distK, verbose = 1, cv=3, n_jobs = -1)

In [42]:
K_grid.fit(train_X,train_y)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:    0.2s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'metric': ['minkowski', 'euclidean', 'manhattan'],
                         'n_neighbors': [5, 7, 9, 11, 13, 15],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [43]:
K_grid.best_params_

{'metric': 'minkowski', 'n_neighbors': 11, 'weights': 'uniform'}

In [44]:
#from the above the best parameter is n_neighbors:11

In [45]:
K_grid.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')

In [46]:
grid_predictionsK = K_grid.predict(test_X)

In [47]:
print(confusion_matrix(test_y,grid_predictionsK),'\n with accuracy ',K_grid.best_score_)

[[ 10  93]
 [ 18 186]] 
 with accuracy  0.742671009771987


In [48]:
#The accuracy is 0.7426 to 4 decimal places

# Logistic regression algorithm

In [49]:
import numpy as np
from sklearn.linear_model import LogisticRegression

logReg = LogisticRegression()

In [50]:
logReg.fit(train_X,train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [51]:
Reg_prediction = logReg.predict(test_X)

In [52]:
print(confusion_matrix(test_y,Reg_prediction))

[[ 44  59]
 [  4 200]]


In [53]:
param_dist_reg = {'random_state':[42],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000]
    }

In [54]:
reg_grid = GridSearchCV(logReg, param_grid = param_dist_reg, verbose = 1, cv=3, n_jobs = -1)

In [55]:
reg_grid.fit(train_X,train_y)

Fitting 3 folds for each of 400 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 608 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 1193 out of 1200 | elapsed:   16.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   16.0s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([1.00000000e...
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                         'max_iter': [100, 1000, 2500, 5000],
                

In [56]:
#best parameter

In [57]:
reg_grid.best_params_ 

{'C': 0.615848211066026,
 'max_iter': 100,
 'random_state': 42,
 'solver': 'newton-cg'}

In [58]:
reg_grid.best_estimator_

LogisticRegression(C=0.615848211066026, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
grid_predictionsReg = reg_grid.predict(test_X)

In [60]:
print(confusion_matrix(test_y,grid_predictionsReg),'\n with accuracy ',reg_grid.best_score_)

[[ 44  59]
 [  4 200]] 
 with accuracy  0.8241042345276873


# The SVM algorithm

In [61]:
from sklearn.svm import SVC
svm_model = SVC(C= 1, gamma= 0.01, kernel= 'rbf', random_state=42)

In [62]:
svm_model.fit(train_X,train_y)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [63]:
prediction_svm = svm_model.predict(test_X)

In [64]:
print(confusion_matrix(test_y,prediction_svm))

[[  0 103]
 [  0 204]]


In [65]:
param_grid_svm = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 

In [66]:
grid_svm = GridSearchCV(SVC(),param_grid_svm,refit=True,verbose=3)

In [67]:
grid_svm.fit(train_X,train_y)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.709, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.709, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.713, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.709, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.709, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.713, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] ....... C=0

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.713, total=   0.0s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.709, total=   0.0s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.709, total=   0.0s
[CV] C=0.1, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=0.1, gamma=0.0001, kernel=rbf, score=0.713, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=0.709, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=0.709, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=0.713, total=   0.0s
[CV] 

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.8s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

# Parameter with best results

In [68]:
grid_svm.best_params_  

{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}

In [69]:
#Parameter with highest result
grid_svm.best_estimator_ 

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [70]:
grid_prediction_svm = grid.predict(test_X)

In [71]:
print(confusion_matrix(test_y,grid_prediction_svm),'\n with accuracy ',grid_svm.best_score_)

[[ 43  60]
 [  4 200]] 
 with accuracy  0.7166123778501629


# Random forest algo

In [72]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state =0, n_estimators=100)

In [73]:
rf_model.fit(train_X,train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [74]:
prediction_rf = rf_model.predict(test_X)

In [75]:
print(confusion_matrix(test_y,prediction_rf))

[[ 43  60]
 [ 11 193]]


In [76]:
param_grid_rf = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],

 'n_estimators': [200, 400, 600]}

In [77]:
grid_rf = GridSearchCV(estimator = rf_model, param_grid = param_grid_rf,  cv = 3, verbose=2)

In [78]:
grid_rf.fit(train_X,train_y)

Fitting 3 folds for each of 66 candidates, totalling 198 fits
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=200, total=   0.2s
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] ... bootstrap=True, max_depth=10, n_estimators=200, total=   0.2s
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=200, total=   0.2s
[CV] bootstrap=True, max_depth=10, n_estimators=400 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=400, total=   0.3s
[CV] bootstrap=True, max_depth=10, n_estimators=400 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=400, total=   0.4s
[CV] bootstrap=True, max_depth=10, n_estimators=400 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=400, total=   0.4s
[CV] bootstrap=True, max_depth=10, n_estimators=600 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=600, total=   0.6s
[CV] bootstrap=True, max_depth=10, n_estimators=600 ..................
[CV] ... bootstrap=True, max_depth=10, n_estimators=600, total=   0.5s
[CV] bootstrap=True, max_depth=10, n_estimators=600 ..................
[CV] .

[Parallel(n_jobs=1)]: Done 198 out of 198 | elapsed:  1.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=0,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'boot

In [79]:
# Parameter setting that gave the best results
grid_rf.best_params_  

{'bootstrap': True, 'max_depth': 10, 'n_estimators': 600}

In [80]:
 # estimator which gave highest score
grid_rf.best_estimator_ 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=600,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [81]:
grid_prediction_rf = grid_rf.predict(test_X)

In [82]:
print(confusion_matrix(test_y,grid_prediction_rf),'\n with accuracy ',grid_rf.best_score_)

[[ 43  60]
 [  6 198]] 
 with accuracy  0.8208469055374593


# Linear reg algo using sgd

# Using SGD model

In [89]:
from sklearn.linear_model import SGDClassifier

SGD = SGDClassifier(random_state=0, class_weight='balanced')

In [90]:
SGD.fit(train_X,train_y)

SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=0, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [91]:
prediction_SDG = SGD.predict(test_X)

In [92]:
print(confusion_matrix(test_y,prediction_SDG))

[[100   3]
 [201   3]]


In [93]:
param_grid_SDG = {'alpha': [10 ** x for x in range(-6, 1)],
    'l1_ratio': [0, 0.05, 0.1, 0.2, 0.5, 0.8, 0.9, 0.95, 1],}

In [94]:
grid_SDG = GridSearchCV(estimator=SGD, param_grid=param_grid_SDG,
                                    n_jobs=-1, scoring='roc_auc')

In [95]:
grid_SDG.fit(train_X,train_y)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight='balanced',
                                     early_stopping=False, epsilon=0.1,
                                     eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5, random_state=0,
                                     shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'alpha': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                         'l1_ratio': [0, 0.05, 0.1, 0.2, 0.5, 0.8, 0.9, 0.95,

In [97]:
# best results results
grid_SDG.best_params_  

{'alpha': 1e-06, 'l1_ratio': 0}

In [98]:
grid_SDG.best_estimator_  

SGDClassifier(alpha=1e-06, average=False, class_weight='balanced',
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0, learning_rate='optimal', loss='hinge', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=0, shuffle=True, tol=0.001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [99]:
grid_prediction_SDG = grid_SDG.predict(test_X)

In [100]:
print(confusion_matrix(test_y,grid_prediction_SDG),'\n with accuracy ',grid_SDG.best_score_)

[[ 91  12]
 [171  33]] 
 with accuracy  0.5304114008395234


# End of Exam