In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from copy import deepcopy
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd
import pickle
import os

# Get the current working directory
cwd = os.getcwd()

# Define the directory path relative to the current working directory
dirname = os.path.join(cwd, 'Project 1')


In [6]:
def project_1_scoring(data):
    # replace Na/Null values
    values_to_fill = {}
    for col in data.drop(columns=['MIS_Status']).columns:
        if data[col].dtype == 'object':
            values_to_fill[col] = "Missing"
        else:
            values_to_fill[col] = 0

    data.fillna(value=values_to_fill,inplace=True)
    
    #Converting the strings styled as '$XXXX.XX' to float values.
    data['DisbursementGross'] = data['DisbursementGross'].replace({'\$': '', ',': ''}, regex=True).astype(float)
    data['BalanceGross'] = data['BalanceGross'].replace({'\$': '', ',': ''}, regex=True).astype(float)
    data['GrAppv'] = data['GrAppv'].replace({'\$': '', ',': ''}, regex=True).astype(float)
    data['SBA_Appv'] = data['SBA_Appv'].replace({'\$': '', ',': ''}, regex=True).astype(float)
    
    data['MIS_Status'] = data['MIS_Status'].replace({'CHGOFF': 1, 'P I F': 0}).astype(float)
    data.dropna(inplace=True)
    
    #Adding Engineered Features
    
    #1. Creating a feature that is indicating whether the borrower is located in an urban or rural area
    data['Is_urban'] = (data['UrbanRural'] == 1).astype(int)
    
    #2. Creating a feature that is indicating the ratio of retained jobs to created jobs
    data['Retained_Created_Job_Ratio'] = data['RetainedJob'] /(data['CreateJob'] + 1)
    
    #3. Creating a feature that is indicating whether the borrower has a low documentation loan
    data['Is_low_doc'] = (data['LowDoc'] == 'Y').astype(int)
    
    #4. Creating a feature that is indicating whether the borrower has a revolving line of credit
    data['Is_rev_line'] = (data['RevLineCr'] == 'Y').astype(int)
    
    #5. Creating a feature that is indicating the ratio of the loan amount to the gross disbursement
    data['Loan_Gross_Ratio'] = data['GrAppv'] / data['DisbursementGross']
    
    #6. Creating a feature that is indicating the ratio of the SBA loan amount to the gross disbursement
    data['SBA_Loan_Gross_Ratio'] = data['SBA_Appv'] / data['DisbursementGross']
    
    #7. Creating a binary variable feature that is indicating whether the loan was for a franchise
    data['Is_franchise'] = (data['FranchiseCode'] != 0).astype(int)    
    
    #8. Creating a feature that is log transformation of Disbursement Gross
    data['LogDisbursementGross'] = np.log(data['DisbursementGross'] + 1)
    
    #9. Creating a feature that is Log Transformation of SBA Approval Amount
    data['LogSBAApprovalAmount'] = np.log(data['SBA_Appv'] + 1)

    #10. Creating a feature that is indicating the bank originated Loan
    data['BankOriginatedLoan'] = np.where(data['Bank'] == data['BankState'], 1, 0)

    #11. Creating a feature that is indicating loan amount to income ratio or disbursement amount per employees
    data['LoanToIncomeRatio'] = data['DisbursementGross'] / (data['NoEmp'] + 1)

    #12. Creating a feature that is indicating loan amount to number of business owners Ratio
    data['LoanToOwnerRatio'] = data['DisbursementGross'] / (data['NewExist'] + 1)
    
    
    #Categorical encoders dictionary
    cat_encoders = {}
    #New categorical (encoded) columns
    cat_enc_columns = []

    for col in data.columns:
        if data[col].dtype == 'object':
            if data[col].nunique() < 10:
                print("One-hot encoding of ", col)
                enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
                enc.fit(data[[col]])
                result = enc.transform(data[[col]])
                ohe_columns = [col+"_"+str(x) for x in enc.categories_[0]]
                cat_enc_columns = cat_enc_columns + ohe_columns
                result_df = pd.DataFrame(result, columns=ohe_columns, index=data.index)
                data = pd.concat([data, result_df.reindex(data.index)], axis=1, join='inner')
                cat_encoders[col] = [deepcopy(enc), "ohe"]
            else:
                print("Target encoding of ", col)
                enc = TargetEncoder()
                enc.fit(data[col], y=data['MIS_Status'], handle_unknown='value')
                pickle.dump(enc, open(col+'_trg_'+'pre_processing.p', "wb"))
                new_col_name = col+"_trg"
                data[new_col_name] = enc.transform(data[[col]])
                cat_encoders[col] = [deepcopy(enc), "trg"]
                cat_enc_columns.append(new_col_name)
                
    
    col20 = data.pop(data.columns[20])
    data.insert(1, col20.name, col20)
    
    
    num_scalers = {}

    '''Scaling only original and feature engineered columns'''
    for col in data.columns[2:33]:
        if data[col].dtype != 'object':
            print("Standard scale of ", col)
            scaler = StandardScaler()
            scaler.fit(data[[col]])
            pickle.dump(scaler, open(col+'_sc_'+'pre_processing.p', "wb"))
            data[col+"_sc"] = scaler.transform(data[[col]])
        
            num_scalers[col] = [deepcopy(scaler),"Standard"]
            
            
    #Splitting the dataset into train (60%), validation (20%), and test (20%) sets
    train_val, test = train_test_split(data, test_size=0.2, random_state=182)
    train, val = train_test_split(train_val, test_size=0.25, random_state=182)
    
    X_train = train.drop(columns='MIS_Status')
    y_train = train['MIS_Status']
    X_valid = val.drop(columns='MIS_Status')
    y_valid = val['MIS_Status']
    X_test = test.drop(columns='MIS_Status')
    y_test = test['MIS_Status']
    X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape
    
    X_trn = X_train.iloc[:,33:]
    X_vld = X_valid.iloc[:,33:]
    X_tst = X_test.iloc[:,33:]
    
    
    filepath = os.path.join(dirname, '../artifacts/artifacts_dict_file.pkl')
    # working with the artifact file to import the model and threshold
    artifacts_dict_file = open(filepath, "rb")
    artifacts_dict = pickle.load(file=artifacts_dict_file)
    artifacts_dict_file.close()
    logreg = artifacts_dict["model"]
    threshold = artifacts_dict["threshold"]
    
    y_pred_prob = logreg.predict_proba(X_trn)
    y_pred = (y_pred_prob[:,0] < threshold).astype(np.int16)
    answer_dataframe = {"index":X_trn["index"],
         "label":y_pred,
         "probability_0":y_pred_prob[:,0],
         "probability_1":y_pred_prob[:,1]}
    

    return pd.DataFrame(answer_dataframe)

#reading the data and keeping the new test data file in the same folder
datafilepath = os.path.join(dirname, './notebooks/SBA_loans_project_1.zip')
data = pd.read_csv(datafilepath)
# calling the function
answer=project_1_scoring(data)
# printing the answer
print(answer)




Target encoding of  City
Target encoding of  State
Target encoding of  Bank
Target encoding of  BankState
Target encoding of  RevLineCr
One-hot encoding of  LowDoc




Standard scale of  Zip
Standard scale of  NAICS
Standard scale of  Term
Standard scale of  NoEmp
Standard scale of  NewExist
Standard scale of  CreateJob
Standard scale of  RetainedJob
Standard scale of  FranchiseCode
Standard scale of  UrbanRural
Standard scale of  DisbursementGross
Standard scale of  BalanceGross
Standard scale of  GrAppv
Standard scale of  SBA_Appv
Standard scale of  Is_urban
Standard scale of  Retained_Created_Job_Ratio
Standard scale of  Is_low_doc
Standard scale of  Is_rev_line
Standard scale of  Loan_Gross_Ratio
Standard scale of  SBA_Loan_Gross_Ratio
Standard scale of  Is_franchise
Standard scale of  LogDisbursementGross
Standard scale of  LogSBAApprovalAmount
Standard scale of  BankOriginatedLoan
Standard scale of  LoanToIncomeRatio
Standard scale of  LoanToOwnerRatio


KeyError: 'index'