# Program to run Logistic regression on Adult dataset
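### The Adult (Census Income) dataset from the UCI Machine Learning Repository: each row describes one person, and the task is to predict whether their income is `<=50K` or `>50K`.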

## Importing Libraries

In [1]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Resolve the notebook's IAM execution role and look up the XGBoost image for
# the current region. (The original author marked this XGBoost setup for
# removal once the code has been carefully analysed.)
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'

# Each region has its own XGBoost container image.
xgboost_images = [
    ('us-west-2', '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest'),
    ('us-east-1', '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'),
    ('us-east-2', '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest'),
    ('eu-west-1', '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'),
]
containers = dict(xgboost_images)

my_region = boto3.session.Session().region_name # region of the current boto3 session
regional_image = containers[my_region]  # raises KeyError for a region not listed above
print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + regional_image + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the us-east-2 region. You will use the 825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


## Creating Bucket

In [2]:
bucket_name = 'big-data-1' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
s3 = boto3.resource('s3')

# Create the bucket; any failure (including "bucket already exists") is
# reported by printing the error instead of stopping the notebook.
try:
    create_args = {'Bucket': bucket_name}
    # us-east-1 is created without a location constraint; every other region
    # passes its own name as the LocationConstraint.
    if my_region != 'us-east-1':
        create_args['CreateBucketConfiguration'] = {'LocationConstraint': my_region}
    s3.create_bucket(**create_args)
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 error:  An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


## Loading Train data

In [3]:
# Read the training CSV from the working directory. A failure is printed
# rather than raised, so `train_data` stays undefined if the load fails.
try:
    train_data = pd.read_csv('adult_df.csv')
except Exception as e:
    print('Data load error: ',e)
else:
    print('Success: Data loaded into dataframe.')

Success: Data loaded into dataframe.


In [4]:
# (rows, columns) of the training frame as loaded from the CSV.
train_data.shape

(30162, 17)

In [5]:
# Preview the first row to check the column names and value formats.
train_data.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,hours_w,native_region
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,between_40_and_45,United-States


## Loading test data

In [6]:
# Read the test CSV from the working directory. A failure is printed rather
# than raised, so `model_data` stays undefined if the load fails.
try:
    model_data = pd.read_csv('test_df.csv')
except Exception as e:
    print('Data load error: ',e)
else:
    print('Success: Test Data loaded into dataframe.')

Success: Test Data loaded into dataframe.


In [7]:
# (rows, columns) of the test frame as loaded from the CSV.
model_data.shape

(15060, 19)

In [8]:
# Preview the first two rows of the test frame.
model_data.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,hours_w,native_region,cap_gain,cap_loss
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K,between_40_and_45,United-States,Low,Low
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K,between_45_and_60,United-States,Low,Low


## Generating dummy values for Train data
### Developing labels: '0' = <=50K, '1' = >50K

In [9]:
# One-hot encode the categorical feature columns of the training frame.
train_categorical_columns = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
]
train_data = pd.get_dummies(train_data, columns=train_categorical_columns)

In [10]:
# Shape after one-hot encoding the categorical columns.
train_data.shape

(30162, 67)

In [11]:
# Keep the raw income column as the training target.
label=train_data['income']

In [12]:
# Remove the target column and the columns that are not used as model inputs.
# NOTE(review): the 'Unnamed: 0' column shown by head() is not dropped, so it
# stays in the feature matrix. It looks like a saved row index - confirm
# whether it should really be a feature.
train_columns_to_drop = ['income','hours_w','native_region', "native_country"]
train_data = train_data.drop(columns=train_columns_to_drop)

In [13]:
# Shape after dropping the target and the unused columns.
train_data.shape

(30162, 63)

In [14]:
# Encode the income column as binary targets: 0.0 for '<=50K', 1.0 for '>50K'.
#
# The positive class is matched explicitly. The previous version treated
# "whatever value is in row 0" as class 0, which silently inverts the labels
# if the first row happens to be a '>50K' record (or if index label 0 is
# missing). Surrounding whitespace and a trailing '.' are stripped defensively
# so minor formatting variants of the same label still match.
income_text = label.astype(str).str.strip().str.rstrip('.')
labels = np.asarray(income_text == '>50K', dtype='float32')

# Convert the feature frame to a dense float32 matrix for the RecordIO writer.
train_data=np.array(train_data).astype('float32')

## Generating dummy values for Test data

In [15]:
# Prepare the test frame the same way as the training frame: one-hot encode
# the categorical columns, keep the income column as the target, then drop the
# target and the columns that are not model inputs.
# NOTE(review): the dummy columns are derived from the test data on their own;
# presumably they line up with the training columns - confirm names and order.
test_categorical_columns = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
]
test_columns_to_drop = ['income','hours_w','native_region','cap_gain','cap_loss',"native_country"]

model_data = pd.get_dummies(model_data, columns=test_categorical_columns)
test_label = model_data['income']
model_data = model_data.drop(columns=test_columns_to_drop)


In [16]:
# Shape of the encoded test features.
model_data.shape

(15060, 63)

In [17]:
# Convert the test feature frame to a dense float32 matrix.
model_data = model_data.values.astype('float32')

## Training Process

In [18]:
import sagemaker.amazon.common as smac
import io

In [19]:
# Serialize the training matrix and labels with write_numpy_to_dense_tensor
# and upload the resulting buffer to S3 as the 'train' input for the job.
key='linearlearner'
sess=sagemaker.Session()
prefix="sagemaker/Income"

buf=io.BytesIO()
smac.write_numpy_to_dense_tensor(buf,train_data,labels)
buf.seek(0)  # rewind so the upload reads the buffer from the beginning
print(type(buf))

# S3 object keys always use '/' as the separator, so the key is built
# explicitly instead of with os.path.join (which uses the OS path separator).
# The same key is reused for the URI so the upload target and the training
# channel cannot drift apart.
train_object_key = '{}/train/{}'.format(prefix, key)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_object_key).upload_fileobj(buf)
s3_train_data = 's3://{}/{}'.format(bucket_name, train_object_key)
print('upoaded training data location : {}'.format(s3_train_data))

output_location='s3://{}/{}/output'.format(bucket_name,prefix)
print('training artifacts will be uploaded to : {}'.format(output_location))

<class '_io.BytesIO'>
upoaded training data location : s3://big-data-1/sagemaker/Income/train/linearlearner
training artifacts will be uploaded to : s3://big-data-1/sagemaker/Income/output


In [20]:
# REMOVE THIS IF XGBOOST IS NOT REQUIRED
# sess = sagemaker.Session()
# xgb = sagemaker.estimator.Estimator(containers[my_region],role, train_instance_count=1, train_instance_type='ml.m4.xlarge',output_path='s3://{}/{}/output'.format(bucket_name, prefix),sagemaker_session=sess)
# xgb.set_hyperparameters(max_depth=5,eta=0.2,gamma=4,min_child_weight=6,subsample=0.8,silent=0,objective='binary:logistic',num_round=100)

In [26]:
# xgb.fit({'train': s3_input_train})

## Defining Containers

In [21]:
# Linear Learner image for each supported region. This rebinds `containers`,
# replacing the XGBoost mapping defined earlier in the notebook.
linear_learner_images = [
    ('us-west-2', '174872318107.dkr.ecr.us-west-2.amazonaws.com/linear-learner:latest'),
    ('us-east-1', '382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:latest'),
    ('us-east-2', '404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:latest'),
    ('eu-west-1', '438346466558.dkr.ecr.eu-west-1.amazonaws.com/linear-learner:latest'),
    ('ap-northeast-1', '351501993468.dkr.ecr.ap-northeast-1.amazonaws.com/linear-learner:latest'),
]
containers = dict(linear_learner_images)

In [22]:
# Image URI selected for the current boto3 session's region.
containers[boto3.Session().region_name]

'404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:latest'

## Defining Model

In [23]:
from sagemaker import get_execution_role

# Build the estimator for the Linear Learner image of the current region.
# Training runs on a single ml.c4.xlarge instance and writes its artifacts
# to `output_location`.
role = get_execution_role()
linear_learner_image = containers[boto3.Session().region_name]
estimator_options = dict(
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    output_path=output_location,
    sagemaker_session=sess,
)
linear = sagemaker.estimator.Estimator(linear_learner_image, role, **estimator_options)

## Setting Hyperparameters

In [24]:
# Configure the Linear Learner training job.
#
# feature_dim is read from the training matrix instead of being hard-coded to
# 63, so the job stays consistent with the data if the feature columns change.
#
# NOTE(review): loss='hinge_loss' does not match the notebook title
# ("Logistic regression"); confirm whether loss='logistic' was intended.
# NOTE(review): normalize_data=False leaves the raw feature scales as-is;
# confirm this is deliberate.
linear.set_hyperparameters(feature_dim=train_data.shape[1],
                           predictor_type='binary_classifier',loss='hinge_loss',
                           normalize_data=False)

## Training the Model

In [25]:
# Train on the uploaded 'train' channel, then host the trained model behind a
# real-time endpoint served by a single ml.m4.xlarge instance.
training_channels = {'train': s3_train_data}
linear.fit(training_channels)

linear_predictor = linear.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

2019-12-10 16:35:43 Starting - Starting the training job...
2019-12-10 16:35:44 Starting - Launching requested ML instances...
2019-12-10 16:36:39 Starting - Preparing the instances for training......
2019-12-10 16:37:32 Downloading - Downloading input data...
2019-12-10 16:38:14 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34m[12/10/2019 16:38:17 INFO 139764330137408] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_method': u'uniform', u'init_sigma': u'

In [26]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests are sent to the endpoint as CSV text; responses are parsed as JSON.
linear_predictor.serializer = csv_serializer
linear_predictor.content_type = 'text/csv'
linear_predictor.deserializer = json_deserializer

In [65]:
# Small test to check the code
# result = linear_predictor.predict(model_data[0])
# print(result)

## Testing 

In [27]:
# Score the test set against the deployed endpoint.
#
# Rows are sent in batches instead of one HTTP request per row: the CSV
# serializer turns a 2-D array into one CSV line per row, and the response
# carries one entry in result['predictions'] per input row, so the collected
# labels and scores keep the same order as model_data.
PREDICT_BATCH_ROWS = 500  # rows per request; keeps each payload small

predictions = []
scores = []
for batch_start in range(0, len(model_data), PREDICT_BATCH_ROWS):
    batch = model_data[batch_start:batch_start + PREDICT_BATCH_ROWS]
    result = linear_predictor.predict(batch)
    predictions += [r['predicted_label'] for r in result['predictions']]
    scores += [r['score'] for r in result['predictions']]

# `scores` stays a plain list; `predictions` becomes an array for comparison
# against the true labels.
predictions = np.array(predictions)

In [66]:
# morethan_list=[]
# lessthan_list=[]
# for i in range(0,len(scores)):
#     if predictions[i]==0:
#         lessthan_list.append(scores[i])
#     else:
#         morethan_list.append(scores[i])
# max(lessthan_list)
# min(morethan_list)

## Preparing test labels
### Developing labels: '0' = <=50K, '1' = >50K

In [28]:
# Encode the test income column as binary targets: 0.0 for '<=50K', 1.0 for '>50K'.
#
# The positive class is matched explicitly. The previous version treated
# "whatever value is in row 0" as class 0, which silently inverts the labels
# if the first test row happens to be a '>50K' record. Surrounding whitespace
# and a trailing '.' are stripped defensively so minor formatting variants of
# the same label still match.
# Note: this rebinds `labels`, which previously held the training targets.
test_income_text = test_label.astype(str).str.strip().str.rstrip('.')
labels = np.asarray(test_income_text == '>50K', dtype='float32')

In [29]:
# Count the test rows whose predicted label equals the true label.
correct = sum(
    1
    for row in range(labels.shape[0])
    if labels[row] == predictions[row]
)

In [30]:
# Number of test rows whose predicted label matches the true label.
correct

11876

## Fraction of correctly predicted labels (accuracy)

In [31]:
# Accuracy as a fraction of the test rows (multiply by 100 for a percentage).
correct/labels.shape[0]

0.7885790172642763

## Probabilities 

In [94]:
# Per-row 'score' values collected from the endpoint responses.
# NOTE(review): this renders the entire list; a slice such as scores[:10]
# would keep the notebook output short.
scores

[0.021438121795654297,
 0.05326090753078461,
 6.050603133189725e-06,
 0.0008763669757172465,
 0.038076288998126984,
 0.7235231995582581,
 3.1665356448051796e-10,
 0.13206003606319427,
 0.9999964237213135,
 0.9997689127922058,
 0.9242222309112549,
 0.10236988961696625,
 2.713660933295614e-06,
 0.00039460044354200363,
 9.376765319757396e-07,
 7.23431554661147e-08,
 2.468503907948616e-06,
 0.0006980500766076148,
 0.03615233674645424,
 0.9994522929191589,
 4.238630992858816e-07,
 1.2619689186976757e-06,
 0.47122231125831604,
 0.00012470853107515723,
 0.9886903166770935,
 3.576686538053764e-07,
 0.14343968033790588,
 0.018028369173407555,
 0.0005860200617462397,
 3.657900379039347e-06,
 0.9900798797607422,
 0.00011033770715584978,
 1.5719297152827494e-06,
 8.214288754970767e-06,
 1.865900799202791e-06,
 3.3775831980165094e-06,
 0.6101195812225342,
 0.00011559217819012702,
 0.8873129487037659,
 0.34352925419807434,
 5.51465927856043e-06,
 4.3556150330914534e-07,
 0.9941415190696716,
 0.01155