#### Build model for prediction

In [337]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator,TransformerMixin

from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier


from sklearn import set_config
set_config(display='diagram')


import pickle

In [303]:
# Read the cleaned loan data and drop the 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv -- confirm).
df = (
    pd.read_csv("/Users/hople/Desktop/Bootcamp Lectures/week_7/mini-project-4/data/cleaned_data.csv")
    .drop(columns='Unnamed: 0')
)
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,Male,No,0,Graduate,No,146,360.0,1,Urban,1,5849
1,Male,Yes,1,Graduate,No,128,360.0,1,Rural,0,6091
2,Male,Yes,0,Graduate,Yes,66,360.0,1,Urban,1,3000
3,Male,Yes,0,Not Graduate,No,120,360.0,1,Urban,1,4941
4,Male,No,0,Graduate,No,141,360.0,1,Urban,1,6000


In [330]:
# Separate the target ('Loan_Status') from the predictors,
# then hold out 20% of the rows for testing (fixed seed for repeatability).
y = df['Loan_Status'].values
X = df.drop(columns='Loan_Status')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=48,
)

In [331]:
#Log function
def log_transform(x):
    """Return ``log(1 + x)`` element-wise.

    ``np.log1p`` is used instead of ``np.log(x + 1)`` so that inputs very
    close to zero are not rounded away when 1 is added to them.

    Parameters
    ----------
    x : scalar or array-like
        Values to transform (NumPy array, pandas Series/DataFrame, ...).
        Must be greater than -1 for a finite result.

    Returns
    -------
    The transformed values, in whatever container NumPy returns for ``x``
    (a DataFrame stays a DataFrame).
    """
    return np.log1p(x)

# Gender, Married, Self_Employed, Education (and Property_Area) are label-encoded by CustomLabelEncode below, so they need no per-column transform function.
    
def Loan_term(X):
    """Divide the input by 12 and cast the result to ``int``.

    Applied to ``Loan_Amount_Term`` in the preprocessing pipeline
    (presumably months -> whole years; confirm the unit against the data
    dictionary). The cast drops any fractional part.
    """
    return (X / 12).astype(int)

def Dependents_transform(X):
    """Bucket the dependents labels into three integer codes.

    Mapping: ``'0'`` -> 0, ``'3+'`` -> 2, every other value -> 1.

    Returns a NumPy array with the same shape as ``X``.
    """
    is_none = X == '0'
    is_three_plus = X == '3+'
    return np.select([is_none, is_three_plus], [0, 2], default=1)

class CustomLabelEncode(BaseEstimator, TransformerMixin):
    """Label-encode categorical columns using a mapping learned in ``fit``.

    During ``fit`` the distinct values of each column are sorted and numbered
    ``0..n-1``; ``transform`` then applies that fixed numbering.  Learning the
    numbering inside ``transform`` (e.g. via ``LabelEncoder.fit_transform``)
    would re-derive it from whatever rows are being transformed, so a
    single-row prediction frame would encode every category as 0 and
    train/test encodings could disagree.

    Parameters
    ----------
    columns : list of str, optional
        Columns to encode.  ``None`` (the default) encodes every column of
        the frame passed to ``fit``, which is the case when a
        ``ColumnTransformer`` hands this transformer its column subset.

    Attributes
    ----------
    categories_ : dict
        Column name -> sorted list of the values seen during ``fit``.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn the sorted category list of each column to encode."""
        frame = pd.DataFrame(X)
        cols = list(frame.columns) if self.columns is None else list(self.columns)
        # Sorted order matches the numbering LabelEncoder would produce.
        self.categories_ = {
            col: sorted(frame[col].dropna().unique()) for col in cols
        }
        return self

    def transform(self, X ,y=None):
        """Return a copy of ``X`` with the fitted columns replaced by codes.

        Values not seen during ``fit`` (and missing values) are encoded as
        ``-1`` instead of raising, so prediction never fails on a new label.
        The input frame is left untouched.
        """
        encoded = pd.DataFrame(X).copy()
        for col, cats in self.categories_.items():
            codes = pd.Categorical(encoded[col], categories=cats).codes
            encoded[col] = codes.astype('int64')
        return encoded

In [332]:
# Column groups -- one list per preprocessing branch
label_cols = [
    'Gender',
    'Married',
    'Self_Employed',
    'Education',
    'Property_Area',
]
log_scale_cols = [
    'LoanAmount',
    'Total_Income',
]
depep_cols = ['Dependents']
loan_cols = ['Loan_Amount_Term']

In [333]:
# Wrap the plain column functions so they can be used as pipeline steps
log_transformer = FunctionTransformer(func=log_transform)
loan_transformer = FunctionTransformer(func=Loan_term)
dependents_transformer = FunctionTransformer(func=Dependents_transform)

# Scaler applied after the log transform in the numeric branch
sc = StandardScaler()

# NOTE(review): `selection` is not referenced by any later visible cell
# (the feature union builds its own SelectKBest) -- confirm before removing.
selection = SelectKBest(k=4)


In [339]:
# Preprocessing pipeline

# Numeric branch: log transform first, then standardise
num_pipe = Pipeline(steps=[
    ('log_feats', log_transformer),
    ('scaler', sc),
])

# Route each column group to its transformer; columns that belong to no
# group are forwarded unchanged (remainder='passthrough').
cols_transform = ColumnTransformer(
    transformers=[
        ('num_transform', num_pipe, log_scale_cols),
        ('dependents_transforme', dependents_transformer, depep_cols),
        ('loan_transform', loan_transformer, loan_cols),
        ('label_encoder', CustomLabelEncode(), label_cols),
    ],
    remainder='passthrough',
)

# Feature stage: PCA components side by side with the SelectKBest columns
feature_union = FeatureUnion(transformer_list=[
    ('pca', PCA()),
    ('select_best', SelectKBest()),
])

In [345]:
# Full model: preprocessing -> feature extraction/selection -> classifier
main_pipeline = Pipeline(steps=[
    ('preprocessing', cols_transform),
    ('features', feature_union),
    ('classifier', RidgeClassifier()),
])

# Hyperparameter grid, searched with 5-fold cross-validation on the train set
param_grid = {
    'classifier__alpha': [0.001, 0.003, 0.01, 0.1],
    'features__pca__n_components': [3, 5, 7],
    'features__select_best__k': [1, 3, 4, 6],
}

grid = GridSearchCV(estimator=main_pipeline, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Keep the winning parameters/estimator and score once on the hold-out set
best_hyperparams = grid.best_params_
best_model = grid.best_estimator_
best_acc = grid.score(X_test, y_test)
print(f'Best test set accuracy:\n\t {best_acc}\nAchieved with hyperparameters:\n\t {best_hyperparams}')

Best test set accuracy:
	 0.7642276422764228
Achieved with hyperparameters:
	 {'classifier__alpha': 0.001, 'features__pca__n_components': 3, 'features__select_best__k': 1}


In [349]:
# The search was already fitted in the cell that created `grid`; running the
# whole cross-validated search a second time only repeats the same work.
# Display the fitted search object instead.
grid

In [350]:
# Show the pipeline definition; it renders as a diagram because
# set_config(display='diagram') is called in the imports cell.
main_pipeline

In [348]:
# Score the refitted best estimator from the grid search.  `main_pipeline` is
# only the template handed to GridSearchCV, which fits clones of it, so on a
# fresh kernel `main_pipeline.score(...)` fails because it was never fitted.
best_model.score(X_test, y_test)

0.7642276422764228

In [351]:
# saving the model
# Persist the fitted best estimator from the grid search (the `main_pipeline`
# template is never fitted by GridSearchCV), and use a context manager so the
# file handle is flushed and closed.  `pickle` is imported in the imports cell.
with open('/Users/hople/Desktop/Bootcamp Lectures/week_7/mini-project-4/Model_Pickle/credit_classifier.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [352]:
# Reload the pickled classifier from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
with open('/Users/hople/Desktop/Bootcamp Lectures/week_7/mini-project-4/Model_Pickle/credit_classifier.pkl', 'rb') as model_file:
    credit_predict = pickle.load(model_file)

In [388]:
# Predict on one hold-out row after a dict round-trip (DataFrame -> dict ->
# DataFrame).  The row is sliced here directly because `X_test_1` is only
# defined in a later cell, so referencing it would raise NameError on
# Restart & Run All.
credit_predict.predict(pd.DataFrame(X_test.iloc[6:7, :].to_dict()))

array([0])

In [382]:
# One-row frame: the row at position 6 of the hold-out set
X_test_1 = pd.DataFrame(data=X_test.iloc[6:7])

In [399]:
# Predict with the reloaded model on the single-row frame
credit_predict.predict(X_test_1)

array([0])

In [395]:

# Hand-written applicant record in the same column layout as the training
# features (a list holding one dict; the name notwithstanding, this is a
# plain Python object, not a JSON string).
json_data = [
    dict(
        Gender="Male",
        Married="Yes",
        Dependents="3+",
        Education="Not Graduate",
        Self_Employed="No",
        LoanAmount=70,
        Loan_Amount_Term=180.0,
        Credit_History=0,
        Property_Area="Urban",
        Total_Income=4611,
    ),
]

In [396]:
# Turn the list of record dicts into a one-row feature frame
X_test_2 = pd.DataFrame(data=json_data)