In [44]:
import os
import boto3
import pandas as pd
import numpy as np
from collinearity import SelectNonCollinear
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
import pickle
import json

In [45]:
# import sys
# !{sys.executable} -m pip install collinearity



In [3]:
bucket = 'loandefaultbucket'
dataset_filename = 'dataset.csv'
subfolder = ''

In [4]:
from sagemaker import get_execution_role
role = get_execution_role()

In [42]:
bucket='loandefaultbucket'
data_key = 'dataset.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

In [43]:
data_location

's3://loandefaultbucket/dataset.csv'

In [None]:
's3://sagemaker-us-east-1-590353062014/data'

In [5]:
conn = boto3.client('s3')
contents = conn.list_objects(Bucket=bucket, Prefix=dataset_filename)['Contents']
for f in contents:
    print(f['Key'])
    
# response = conn.get_object(Bucket=bucket, Key=dataset_filename)
# body = response['Body']
# print(body)

# df_data = pd.read_csv(response.get("Body"), sep=";")

dataset.csv


In [6]:
config_file = 'model_config.json'
result = conn.get_object(Bucket=bucket, Key=config_file) 
model_config = result["Body"].read().decode()
model_config_json = json.loads(model_config)
print(model_config_json['target'])

default


In [7]:
def load_data(conn, bucket, dataset_filename):
    """Load data from s3"""

    response = conn.get_object(Bucket=bucket, Key=dataset_filename)
    df_data = pd.read_csv(response.get("Body"), sep=";")

    df_train = df_data[~df_data['default'].isna()]
    df_predict = df_data[df_data['default'].isna()]

    return df_train, df_predict

In [8]:
def save_pickle(bucket, key, var):
    
    pickle_byte_obj = pickle.dumps(var) 
    s3_resource = boto3.resource('s3')
    s3_resource.Object(bucket,key).put(Body=pickle_byte_obj)

In [9]:
# bucket='your_bucket_name'
# key='your_pickle_filename.pkl'
# pickle_byte_obj = pickle.dumps([var1, var2, ..., varn]) 
# s3_resource = boto3.resource('s3')
# s3_resource.Object(bucket,key).put(Body=pickle_byte_obj)

In [10]:
def data_preprosessing_missing(data):
    """returns dataframe without missing values"""

    # missing values
    percent_missing = round(data.isnull().sum()/len(data) * 100, 2)
    missing_value_df = pd.DataFrame({'column_name': data.columns,
                                     'percent_missing': percent_missing})
    # delete columns where missing data > 50%
    col_to_delete = missing_value_df[missing_value_df['percent_missing'] > 49].index.to_list()

#     np.savetxt(os.path.join("modelling", "removed_col_50_pct.csv"),
#                col_to_delete,
#                delimiter=",",
#                fmt='% s')

    data = data.drop(col_to_delete, axis=1)
    print("Removed columns with missing data > 50%:{}".format(col_to_delete))

    missing_30_pct = missing_value_df[(missing_value_df['percent_missing'] > 0) & (
        missing_value_df['percent_missing'] < 30)].index.to_list()

    # delete entries where missing data < 30%
    # data = data.dropna()

    # imput median where missing data <30%
    for col in missing_30_pct:
        data[col].fillna((data[col].median()), inplace=True)

    return data


def data_preprosessing_collinearity(X, y, corr_threshold):
    """returns dataframe with eliminated correlated features"""

    selector = SelectNonCollinear(correlation_threshold=corr_threshold)

    features = X.select_dtypes(include=["number"]).columns.to_list()

    X_arr = X.select_dtypes(include=["number"]).to_numpy()

    selector.fit(X_arr, np.ravel(y))
    mask = selector.get_support()

    remove_corr_col = list(
        set(X.select_dtypes(include=["number"]))-set(np.array(features)[mask]))

    # remove the highly correlated columns
    X.drop(remove_corr_col, axis=1, inplace=True)

#     np.savetxt(os.path.join("modelling", "removed_col_collinearity.csv"),
#                remove_corr_col,
#                delimiter=",",
#                fmt='% s')

    print("Removed highly correlated columns:{}".format(remove_corr_col))

    return X, y



def data_preprosessing_onehot(X):
    """returns dataframe with Onehot encoded categorical variables"""

    categorical_col = X.select_dtypes(include=["object"]).columns.to_list()
    X_categorical = X[categorical_col]

    encoder = OneHotEncoder(drop='first')
    encoder.fit(X_categorical)
    X_onehot = encoder.transform(X_categorical)

    X_onehot_df = pd.DataFrame(X_onehot.toarray())

#     pickle.dump(encoder, open(os.path.join(
#         "modelling", "one_hot_encoder.pkl"), 'wb'))
    save_pickle(bucket, "one_hot_encoder.pkl", encoder)

    X_last = pd.concat([X.drop(["merchant_category", "merchant_group",
                       "name_in_email"], axis=1), X_onehot_df.set_index(X.index)], axis=1)

    return X_last


def data_preprosessing_oversampling(X, y):
    """returns oversampled dataset and balanced target feature labels"""

    # transform the dataset
    oversample = SMOTE(random_state=101)
    X, y = oversample.fit_resample(X, y)

    return X, y


def train_model_rf(X, y, bucket):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=0.3, stratify=y)

    # train a randomforest classifier
    rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
    rf_clf.fit(X_train, np.ravel(y_train))

    # save trained model
#     pickle.dump(rf_clf, open(os.path.join(
#         "modelling", "rf_clf_model.pkl"), 'wb'))
    save_pickle(bucket, "rf_clf_model.pkl", rf_clf)

    return rf_clf


def data_preprosessing(data, corr_threshold):
    """prepares data for input to model, returns transformend dataframe"""

    # missing data
    data = data_preprosessing_missing(data)

    X = data.drop(["uuid", "default"], axis=1)
    y = data[["default"]]

    # remove collinearity
    X, y = data_preprosessing_collinearity(X, y, corr_threshold)
    # onehot encoder
    X = data_preprosessing_onehot(X)
    # oversampling
    X, y = data_preprosessing_oversampling(X, y)

    return X, y


def data_prep_predict(data, target, selected_features, categorical_features, bucket):
    """function to prepare data for input to model for prediction"""

    # check for default column in test data
    if target in data.columns:
        data = data[data["default"].isna()].drop(["default"], axis=1).reset_index(drop=True)

    # check for all required features
    for col_name in selected_features:
        if col_name not in data.columns:
            raise Exception('Required column  is missing:{}', format(col_name))

#     print('Writing the data to csv file where required column values are missing')

#     data[data[selected_features].isnull().any(axis=1)].to_csv(os.path.join(
#         "predict", 'required_columns_values_missing.csv'))

    # data.dropna(subset=selected_features, inplace=True)

    # input median where missing data <30%
    for col in selected_features:
        if data[col].isnull().sum() != 0:
            data[col] = data[col].fillna(data[col].median())
    
    selected_features.append('uuid')
    
    # filter selected features
    data = data[selected_features]

    # onehot encoding
    df_categorical = data[categorical_features]

#     encoder = pickle.load(open(os.path.join("modelling", "one_hot_encoder.pkl"), 'rb'))

    encoder = load_pickle(bucket, "one_hot_encoder.pkl")
    data_one_hot = encoder.transform(df_categorical)
    df_data_one_hot = pd.DataFrame(data_one_hot.toarray())
    data_last = pd.concat([data.drop(categorical_features, axis = 1), 
                            df_data_one_hot.set_index(data.index)],axis=1)

    return data_last


def predict_default(data_last):
    """function to predict the probability of default and write the result to prediction_default.csv file"""

    # load trained model
#     rf_clf = pickle.load(open(os.path.join("modelling", "rf_clf_model.pkl"), 'rb'))
    rf_clf = load_pickle(bucket, "rf_clf_model.pkl")

    if 'uuid' in data_last.columns:

        data_last['pd_prediction'] = rf_clf.predict(data_last.drop(["uuid"], axis=1))
        df_predicted = data_last[['uuid', 'pd_prediction']]
    else:
        data_last['pd_prediction'] = rf_clf.predict(data_last)
        df_predicted = data_last['pd_prediction']

#     df_predicted.to_csv(os.path.join("predict",'prediction_default.csv'))

#     print('Check predicted pd in: ../predict/prediction_default.csv')
    
    return df_predicted
    

In [11]:
def load_pickle(bucket, pickle_name):
    s3 = boto3.resource('s3')
    pickle_content = pickle.loads(s3.Bucket(bucket).Object(pickle_name).get()['Body'].read())
    
    return pickle_content

In [13]:
target = model_config_json["target"]
selected_features = model_config_json["selected_features"]
categorical_features = model_config_json["categorical_features"]
corr_threshold = model_config_json["corr_threshold"]

In [14]:
# 1.Load Data
df_train, df_predict = load_data(conn, bucket, dataset_filename)
# print(df_train)

In [15]:
df_predict.shape

(10000, 43)

In [16]:
# use part  2 and 3 when training the model
# 2.Data preprosessing
X, y = data_preprosessing(df_train, corr_threshold)

Removed columns with missing data > 50%:['account_incoming_debt_vs_paid_0_24m', 'account_status', 'account_worst_status_0_3m', 'account_worst_status_12_24m', 'account_worst_status_3_6m', 'account_worst_status_6_12m', 'avg_payment_span_0_3m', 'worst_status_active_inv']
Removed highly correlated columns:['status_max_archived_0_12_months', 'max_paid_inv_0_12m', 'num_arch_ok_12_24m']


In [17]:
# 3.Train model
train_model_rf(X, y, bucket)

RandomForestClassifier(n_jobs=-1, random_state=0)

In [18]:
# use part 4 to make prediction
# 4.Prediction
X_prep = data_prep_predict(df_predict, target, selected_features, categorical_features, bucket)

In [19]:
df_predicted = predict_default(X_prep)

In [21]:
df_predicted.head()

Unnamed: 0,uuid,pd_prediction
0,6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7,0.0
1,f6f6d9f3-ef2b-4329-a388-c6a687f27e70,0.0
2,e9c39869-1bc5-4375-b627-a2df70b445ea,0.0
3,6beb88a3-9641-4381-beb6-c9a208664dd0,0.0
4,bb89b735-72fe-42a4-ba06-d63be0f4ca36,0.0


In [22]:
import sagemaker

bucket = sagemaker.session.Session().default_bucket()

In [23]:
bucket

'sagemaker-us-east-1-590353062014'

In [24]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker.session import s3_input, Session
            
# initialize hyperparameters
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"reg:squarederror",
        "num_round":"50"}

In [31]:
# set an output path where the trained model will be saved
bucket = sagemaker.Session().default_bucket()
prefix = 'DEMO-xgboost-as-a-built-in-algo'
output_path = 's3://{}/{}/{}/output'.format(bucket, prefix, 'abalone-xgb-built-in-algo')

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.

# xgboost_container = get_image_uri(boto3.Session().region_name,
#                           'xgboost', 
#                           repo_version='1.2-2')

xgboost_container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, "latest")
display(xgboost_container)

'811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'

In [32]:
# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_name=xgboost_container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.t2.medium', 
                                          train_volume_size=1, # 5 GB 
                                          output_path=output_path)

TypeError: __init__() missing 1 required positional argument: 'image_uri'

In [33]:
prefix = "study-case-xgboost-churn"

In [34]:
xgboost_container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, "latest")
display(xgboost_container)

'811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'

In [None]:
# s3_input_train = TrainingInput(
#     s3_data="s3://{}/{}/train".format(bucket, prefix), content_type="csv"
# )
# s3_input_validation = TrainingInput(
#     s3_data="s3://{}/{}/validation/".format(bucket, prefix), content_type="csv"
# )

In [36]:
sess = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(
    xgboost_container,
    role,
    instance_count=1,
    instance_type="ml.t2.medium",
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=sess,
)
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective="binary:logistic",
    num_round=100,
)


In [37]:
xgb.fit({"train": X, "validation": y})

ValueError: Cannot format input         account_amount_added_12_24m  account_days_in_dc_12_24m  \
0                                 0                        0.0   
1                                 0                        0.0   
2                                 0                        0.0   
3                                 0                        0.0   
4                                 0                        0.0   
...                             ...                        ...   
177371                            0                        0.0   
177372                        32058                        0.0   
177373                        19151                        0.0   
177374                            0                        0.0   
177375                        96619                        0.0   

        account_days_in_rem_12_24m  account_days_in_term_12_24m  age  \
0                         0.000000                          0.0   20   
1                         0.000000                          0.0   50   
2                         0.000000                          0.0   22   
3                         0.000000                          0.0   36   
4                         0.000000                          0.0   25   
...                            ...                          ...  ...   
177371                    0.000000                          0.0   33   
177372                   68.604513                          0.0   23   
177373                  105.973431                          0.0   28   
177374                    0.000000                          0.0   23   
177375                   79.020063                          0.0   46   

        avg_payment_span_0_12m  has_paid  max_paid_inv_0_24m  \
0                    12.692308      True        31638.000000   
1                    25.833333      True        13749.000000   
2                    20.000000      True        29890.000000   
3                     4.687500      True        40040.000000   
4                    13.000000      True         7100.000000   
...                        ...       ...                 ...   
177371               83.770783      True        10652.112296   
177372               98.726862      True         7935.794445   
177373               14.904762      True         1231.584178   
177374               12.772537      True         7545.690910   
177375               56.736887      True         7546.061458   

        num_active_div_by_paid_inv_0_12m  num_active_inv  ...   64   65  \
0                               0.153846               2  ...  0.0  0.0   
1                               0.000000               0  ...  0.0  0.0   
2                               0.071429               1  ...  0.0  0.0   
3                               0.031250               1  ...  0.0  0.0   
4                               0.000000               0  ...  0.0  0.0   
...                                  ...             ...  ...  ...  ...   
177371                          0.000000               0  ...  0.0  0.0   
177372                          0.000000               0  ...  0.0  0.0   
177373                          0.000000               1  ...  0.0  0.0   
177374                          0.000000               0  ...  0.0  0.0   
177375                          0.000000               0  ...  0.0  0.0   

              66        67   68   69        70   71        72        73  
0       0.000000  0.000000  0.0  0.0  0.000000  0.0  0.000000  1.000000  
1       0.000000  1.000000  0.0  0.0  0.000000  0.0  0.000000  0.000000  
2       0.000000  0.000000  0.0  0.0  0.000000  1.0  0.000000  0.000000  
3       0.000000  0.000000  1.0  0.0  0.000000  0.0  0.000000  0.000000  
4       0.000000  1.000000  0.0  0.0  0.000000  0.0  0.000000  0.000000  
...          ...       ...  ...  ...       ...  ...       ...       ...  
177371  0.000000  0.147506  0.0  0.0  0.000000  0.0  0.000000  0.852494  
177372  0.000000  0.000000  0.0  0.0  0.000000  0.0  0.000000  0.980064  
177373  0.000000  0.447360  0.0  0.0  0.000000  0.0  0.000000  0.552640  
177374  0.049064  0.000000  0.0  0.0  0.049064  0.0  0.950936  0.000000  
177375  0.000000  1.000000  0.0  0.0  0.000000  0.0  0.000000  0.000000  

[177376 rows x 101 columns]. Expecting one of str, TrainingInput, file_input or FileSystemInput