# Imports

In [3]:
from sklearn.utils import shuffle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd
import os
import sagemaker

# Model Training 

The model will be defined and trained using the data uploaded to S3.

In [4]:
from sagemaker import get_execution_role

# Current execution role. It is passed to the estimator below because the
# training and inference code need it to access the model artifacts.
role = get_execution_role()

In [16]:
# Version tag of the prepared data set. It is interpolated into the S3 prefix,
# the keys of data/s3_folders.json and the local data_prepared_<version> path,
# so stray whitespace or an empty answer would only surface later as an obscure
# KeyError / missing-file error. Normalise and validate it up front instead.
version = input("What version of the prepared data is it?:").strip()
if not version:
    raise ValueError("A data version is required (for example: 1).")

What version of the prepared data is it?: 1


In [6]:
import sagemaker

# SageMaker session, reused below for the region lookup, the default bucket
# and as the estimator's session.
session = sagemaker.Session()

# S3 key prefix (folder) for this data version; the estimator's output path is built from it.
prefix = f'twitter_sentiment_{version}'

In [11]:
# Look up the URI of the Amazon-provided XGBoost container image for the
# region of the current session. The same image serves both training and inference.
# NOTE(review): version "1" looks like the legacy built-in XGBoost image — confirm it is intended.
from sagemaker.image_uris import retrieve

container = retrieve(
    framework='xgboost',
    region=session.boto_region_name,
    version="1",
)

In [14]:
# Model artifacts are written to <default bucket>/<prefix>/output.
output_location = 's3://{}/{}/output'.format(session.default_bucket(), prefix)

# Generic SageMaker estimator wrapping the XGBoost container retrieved above.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,            # Container image to run
    role=role,                      # IAM role used by the job
    instance_count=1,               # Number of compute instances
    instance_type='ml.m4.xlarge',   # Kind of compute instance
    output_path=output_location,
    sagemaker_session=session,
)

# Algorithm-specific hyperparameters, kept in one mapping so they are easy to tune.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'num_round': 500,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

### Fit the model

Point SageMaker at the training and validation data sets stored on S3.

In [22]:
import json

# Load the manifest that maps each prepared-data version to its S3 locations.
with open("data/s3_folders.json", "r") as manifest_file:
    s3_folder = json.load(manifest_file)

In [23]:
# Look up the entry for the selected data version once, then pull out
# the S3 location of each split.
model_locations = s3_folder[f"model_{version}"]
test_location = model_locations["test"]
val_location = model_locations["val"]
train_location = model_locations["train"]

In [29]:
# Wrap the training and validation S3 locations as CSV input channels.
s3_input_train, s3_input_validation = (
    sagemaker.inputs.TrainingInput(s3_data=location, content_type='csv')
    for location in (train_location, val_location)
)

In [30]:
# Channel name -> input definition handed to the training job.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
xgb.fit(training_channels)

2022-03-30 15:41:01 Starting - Starting the training job...
2022-03-30 15:41:27 Starting - Preparing the instances for trainingProfilerReport-1648654860: InProgress
.........
2022-03-30 15:42:53 Downloading - Downloading input data......
2022-03-30 15:43:54 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2022-03-30:15:43:57:INFO] Running standalone xgboost training.[0m
[34m[2022-03-30:15:43:57:INFO] File size need to be processed in the node: 292.66mb. Available memory size in the node: 8489.82mb[0m
[34m[2022-03-30:15:43:57:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:43:57] S3DistributionType set as FullyReplicated[0m
[34m[15:44:00] 20456x6000 matrix with 122736000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-03-30:15:44:00:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:44:00] S3DistributionType set as FullyReplicated[0m
[34m[15:44:01] 5113x6000 

### Test the model

In [33]:
# Build a batch transformer from the trained estimator.
xgb_transformer = xgb.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
)

In [34]:
# Run batch inference over the test set: CSV input, split into records line by line.
xgb_transformer.transform(
    test_location,
    content_type='text/csv',
    split_type='Line',
)

..............................[34mArguments: serve[0m
[34m[2022-03-30 16:36:48 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2022-03-30 16:36:48 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2022-03-30 16:36:48 +0000] [1] [INFO] Using worker: gevent[0m
[35mArguments: serve[0m
[35m[2022-03-30 16:36:48 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[35m[2022-03-30 16:36:48 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[35m[2022-03-30 16:36:48 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2022-03-30 16:36:48 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2022-03-30 16:36:48 +0000] [22] [INFO] Booting worker with pid: 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-30:16:36:48:INFO] Model loaded successfully for worker : 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-30:16:36:48:INFO] Model loaded successfully for worker : 22[0m
[34m[2022-03-30 16:36:48 +0000] [23] [INFO] Booting worker with pid: 23

In [35]:
# Wait for the batch transform job to finish before its output is downloaded below.
xgb_transformer.wait()

[34mArguments: serve[0m
[34m[2022-03-30 16:36:48 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2022-03-30 16:36:48 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2022-03-30 16:36:48 +0000] [1] [INFO] Using worker: gevent[0m
[35mArguments: serve[0m
[35m[2022-03-30 16:36:48 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[35m[2022-03-30 16:36:48 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[35m[2022-03-30 16:36:48 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2022-03-30 16:36:48 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2022-03-30 16:36:48 +0000] [22] [INFO] Booting worker with pid: 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-30:16:36:48:INFO] Model loaded successfully for worker : 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-30:16:36:48:INFO] Model loaded successfully for worker : 22[0m
[34m[2022-03-30 16:36:48 +0000] [23] [INFO] Booting worker with pid: 23[0m
  monkey.patch_all(subpro

In [38]:
# Local directory that the batch transform output is copied into and read back from.
data_dir = "results"

In [39]:
# Copy everything under the transform job's S3 output path into the local data_dir.
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-us-east-2-730413480526/xgboost-2022-03-30-16-31-55-849/test.csv.out to results/test.csv.out


In [40]:
# Read the raw batch-transform output (no header row).
predictions_raw = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)

# Flatten the values whichever way they are laid out (one per row or one per
# column) and round each to the nearest integer label. ravel() always yields a
# 1-D array, whereas squeeze() collapses a single prediction to a scalar that
# has no `.values` attribute and would raise AttributeError.
predictions = [round(num) for num in predictions_raw.values.ravel()]

In [61]:
# Ground-truth labels for the test split: first column of the header-less CSV, as a list.
test_labels_frame = pd.read_csv(f"data_prepared_{version}/test_y.csv", header=None)
test_y = list(test_labels_frame[0])

In [66]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Report each metric for the rounded predictions against the true test labels.
print(f"Model version: {version}")
for metric_label, metric_fn in (
    ("accuracy: ", accuracy_score),
    ("precision: ", precision_score),
    ("recall: ", recall_score),
):
    print(metric_label, metric_fn(test_y, predictions))

accuracy:  0.9416549350852494
precision:  0.8613861386138614
recall:  0.19506726457399104


In [65]:
from sklearn.metrics import confusion_matrix

# True labels first, predictions second; all other options are left at the library defaults.
confusion_matrix(test_y, predictions)

array([[5933,   14],
       [ 359,   87]])