In [1]:
# importing required libraries
import warnings
warnings.simplefilter('ignore')
import boto3, io, os
import sagemaker

import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# reading the dataset
# The raw loan CSV is fetched from S3 with boto3 and parsed in memory.
# `bucket` is reused by later cells for uploads and for the training locations.
bucket='linearlearnerbucket'
file_key = 'csv_files/loan_data.csv'
s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket, Key=file_key)
# the whole object body is read and wrapped in a BytesIO so pandas can parse it
dataset = pd.read_csv(io.BytesIO(obj['Body'].read()))

In [3]:
# preview the first rows of the raw table
dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# count the missing entries in every column of the raw table
dataset.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
# filling missing values of categorical variables with mode
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on `dataset[col]`: that chained in-place form acts
# on an intermediate object and leaves `dataset` untouched under pandas
# Copy-on-Write, and the warning about it is hidden by the global
# warnings filter set in the first cell.
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed',
            'Loan_Amount_Term', 'Credit_History']:
    # mode() returns the most frequent value(s); [0] takes the first of them
    dataset[col] = dataset[col].fillna(dataset[col].mode()[0])

In [6]:
# filling missing values of continuous variables with mean
# Assigned back rather than filled with inplace=True on the column selection,
# which would not modify `dataset` under pandas Copy-on-Write.
dataset['LoanAmount'] = dataset['LoanAmount'].fillna(dataset['LoanAmount'].mean())

In [7]:
# re-count the missing entries per column now that imputation has run
dataset.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [8]:
# converting the categories into numbers: one lookup table per column,
# applied with Series.map
category_codes = {
    'Gender': {'Male': 0, 'Female': 1},
    'Married': {'No': 0, 'Yes': 1},
    'Dependents': {'0': 0, '1': 1, '2': 2, '3+': 3},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Education': {'Not Graduate': 0, 'Graduate': 1},
    'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2},
    'Loan_Status': {'N': 0, 'Y': 1},
}
for column_name, codes in category_codes.items():
    dataset[column_name] = dataset[column_name].map(codes)

In [9]:
# bringing the variables in the range of 0 to 1
# NOTE(review): min and max are taken over the full table, i.e. before the
# train/validation/test split performed further down, so held-out rows
# influence the scaling. Confirm this is acceptable.
for i in dataset.columns[1:]:  # the first column is left unscaled
    col_min = dataset[i].min()
    col_range = dataset[i].max() - col_min
    shifted = dataset[i] - col_min
    # A constant column has zero range; dividing would turn it into NaN (0/0),
    # which would then be written into the training CSVs. Keep it at 0 instead.
    dataset[i] = shifted / col_range if col_range != 0 else shifted.astype(float)

In [10]:
# preview the first rows after encoding and scaling
dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,0.0,0.0,0.0,1.0,0.0,0.070489,0.0,0.19886,0.74359,1.0,1.0,1.0
1,LP001003,0.0,1.0,0.333333,1.0,0.0,0.05483,0.036192,0.172214,0.74359,1.0,0.0,0.0
2,LP001005,0.0,1.0,0.0,1.0,1.0,0.03525,0.0,0.082489,0.74359,1.0,1.0,1.0
3,LP001006,0.0,1.0,0.0,0.0,0.0,0.030093,0.056592,0.160637,0.74359,1.0,1.0,1.0
4,LP001008,0.0,0.0,0.0,1.0,0.0,0.072356,0.0,0.191027,0.74359,1.0,1.0,1.0


In [11]:
# uploading normalized data into bucket
dataset.to_csv('normalized_loan.csv', index=False)
# S3 keys always use '/' as separator, so the key is written literally (the
# same key the next cell reads back) instead of being built with os.path.join,
# whose separator depends on the local operating system.
boto3.Session().resource('s3').Bucket(bucket).Object('csv_files/normalized_loan.csv').upload_file('normalized_loan.csv')

In [12]:
# reading normalized data
# Same download-and-parse steps as for the raw file, now pointing at the
# normalized CSV key; file_key, s3_client and obj are rebound here.
file_key = 'csv_files/normalized_loan.csv'
s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket, Key=file_key)
data = pd.read_csv(io.BytesIO(obj['Body'].read()))

In [13]:
# drop the Loan_ID column, then show the remaining (rows, columns)
data = data.drop(columns=['Loan_ID'])
data.shape

(614, 12)

In [14]:
# target (dependent variable): the Loan_Status column
y = data['Loan_Status']
# features (independent variables): every other column
X = data.drop(columns=['Loan_Status'])

In [15]:
# data is split into training set and test set
# 20% of the rows are held out as the test set; random_state fixes the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# rows of X selected by the training index (rebound again in the next cell)
X_train_display = X.loc[X_train.index]

In [16]:
# training data is split into train set and validation set
# 25% of the remaining 80% is held out, i.e. roughly a 60/20/20
# train/validation/test split overall. X_train and y_train are rebound here.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
# rows of X selected by the final train / validation indices
X_train_display = X.loc[X_train.index]
X_val_display = X.loc[X_val.index]

In [17]:
def _label_first(labels, features):
    """Return a frame with the integer-cast labels as first column, then the features."""
    # presumably the label goes first because the training algorithm reads the
    # first CSV column as the target; confirm against the algorithm docs
    label_column = pd.Series(labels, index=features.index, dtype=int)
    return pd.concat([label_column, features], axis=1)


train = _label_first(y_train, X_train)
validation = _label_first(y_val, X_val)
test = _label_first(y_test, X_test)

In [18]:
# Write each split to a local CSV file, without index and without header row
for split_frame, csv_name in (
    (train, 'train.csv'),
    (validation, 'validation.csv'),
    (test, 'test.csv'),
):
    split_frame.to_csv(csv_name, index=False, header=False)

In [19]:
# uploading the data to s3
prefix = "linear-loan-prediction"

# One bucket handle is shared by both uploads. The object keys are joined with
# '/' explicitly (S3 keys are not filesystem paths, and os.path.join would use
# the local OS separator), so they always match the "s3://bucket/prefix/data/..."
# locations the training inputs are built from.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for filename in ('train.csv', 'validation.csv'):
    s3_bucket.Object('{}/data/{}'.format(prefix, filename)).upload_file(filename)

In [20]:
# region of the current SageMaker session; used below to look up the algorithm image
region = sagemaker.Session().boto_region_name
print("AWS Region: {}".format(region))

# execution role that is later passed to the estimator
role = sagemaker.get_execution_role()
print("RoleArn: {}".format(role))

AWS Region: us-east-1
RoleArn: arn:aws:iam::642456856604:role/c40110a511491l1249198t1w642-SageMakerExecutionRole-26OKRLQKHUOI


In [21]:
# NOTE(review): Rule and rule_configs are not used in the visible cells.
from sagemaker.debugger import Rule, rule_configs
from sagemaker.session import TrainingInput

# S3 location handed to the estimator as its output_path
s3_output_location='s3://{}/{}/{}'.format(bucket, prefix, 'linear_linear_model')

# image URI of the "linear-learner" container for this region (printed below)
container=sagemaker.image_uris.retrieve("linear-learner", region, "1")
print(container)

# generic Estimator wired to that container: one ml.c4.xlarge instance,
# running under the execution role obtained above
linear_model=sagemaker.estimator.Estimator(
                   image_uri=container,
                   role=role, 
                   instance_count = 1, 
                   instance_type = 'ml.c4.xlarge',
                   output_path = s3_output_location,
                   sagemaker_session = sagemaker.Session()
)

382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:1


In [22]:
# hyperparameters for the training job: binary classification with logistic
# loss, model selection by loss function, 15 epochs, mini_batch_size 200
linear_model.set_hyperparameters(
    predictor_type = "binary_classifier",
    binary_classifier_model_selection_criteria = "loss_function",
    epochs = 15,
    mini_batch_size = 200,
    loss = "logistic"
)

In [23]:
# training channel: the uncompressed train.csv uploaded under <prefix>/data/
train_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket, prefix, "data/train.csv"),
    content_type="text/csv", 
    compression=None
)
# validation channel: the uncompressed validation.csv uploaded under <prefix>/data/
validation_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket, prefix, "data/validation.csv"),
    content_type="text/csv",
    compression=None
)

In [24]:
# launch the training job on the "train" and "validation" channels defined above
linear_model.fit({"train": train_input, "validation": validation_input}, wait=True)

2022-02-08 14:29:35 Starting - Starting the training job...
2022-02-08 14:29:37 Starting - Launching requested ML instancesProfilerReport-1644330575: InProgress
......
2022-02-08 14:30:50 Starting - Preparing the instances for training............
2022-02-08 14:33:06 Downloading - Downloading input data...
2022-02-08 14:33:28 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m

2022-02-08 14:33:59 Uploading - Uploading generated training model[34m[02/08/2022 14:33:54 INFO 140634799998784] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_met

In [25]:
# deploy the trained model to an endpoint backed by one ml.m4.xlarge instance
# NOTE(review): no endpoint clean-up is visible in this section; confirm the
# endpoint is deleted once the evaluation is finished.
linear_predictor = linear_model.deploy(initial_instance_count = 1,
                                          instance_type = 'ml.m4.xlarge')

----------!

In [35]:
# name of the deployed endpoint, used later when invoking it through the runtime client
linear_endpoint = linear_predictor.endpoint_name
def np2csv(arr):
    """Serialize an array-like to comma-separated text.

    Each row is written on its own line with values formatted using "%g";
    trailing whitespace (including the final newline) is stripped.
    """
    with io.BytesIO() as buffer:
        np.savetxt(buffer, arr, delimiter=",", fmt="%g")
        text = buffer.getvalue().decode()
    return text.rstrip()

In [37]:
import json
# runtime client used to call the deployed endpoint
runtime = boto3.client("runtime.sagemaker")

# the test features (labels excluded) are sent as a single CSV payload
payload = np2csv(X_test)
response = runtime.invoke_endpoint(
    EndpointName=linear_endpoint, ContentType="text/csv", Body=payload
)
# the response body is parsed as JSON and the "score" of every entry under
# "predictions" is collected into an array
result = json.loads(response["Body"].read().decode())
test_pred = np.array([r["score"] for r in result["predictions"]])

In [38]:
# mean absolute error of the endpoint scores against the test labels
abs_err_linear = np.abs(y_test - test_pred)
test_mae_linear = np.mean(abs_err_linear)

# baseline predictor: always answer with the median of the training labels
abs_err_baseline = np.abs(y_test - np.median(y_train))
test_mae_baseline = np.mean(abs_err_baseline)

print("Test MAE Baseline :", round(test_mae_baseline, 3))
print("Test MAE Linear:", round(test_mae_linear, 3))

Test MAE Baseline : 0.317
Test MAE Linear: 0.319


In [39]:
# scores above 0.5 become class 1, everything else class 0
test_pred_class = (test_pred > 0.5).astype(int)
# baseline: the training-label median repeated once per test row
test_pred_baseline = np.full(len(y_test), np.median(y_train))

# share of test rows, in percent, where the prediction equals the label
prediction_accuracy = 100 * np.mean(y_test == test_pred_class)
baseline_accuracy = 100 * np.mean(y_test == test_pred_baseline)

print("Prediction Accuracy:", round(prediction_accuracy, 1), "%")
print("Baseline Accuracy:", round(baseline_accuracy, 1), "%")

Prediction Accuracy: 78.9 %
Baseline Accuracy: 68.3 %
