# Imports

In [3]:
from sklearn.utils import shuffle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd
import os
import sagemaker

In [2]:
from data import s3_folders

# Expose each dataset variant's S3 locations under a short module-level name.
unbalanced, underSample, overSample, combined = (
    s3_folders.unbalanced,
    s3_folders.underSample,
    s3_folders.overSample,
    s3_folders.combined,
)

# Model Training 

The model will be defined and trained using the data uploaded to S3.

In [4]:
import sagemaker
from sagemaker import get_execution_role

# The execution role of this notebook is handed to the estimator below so that
# the training and inference code can access the model artifacts.
role = get_execution_role()

# Keep a handle on the current SageMaker session for later cells.
session = sagemaker.Session()

In [11]:
# Amazon provides a ready-made XGBoost container; look up its location for the
# region of the current session. For convenience the same container is used
# for both training and inference.
from sagemaker.image_uris import retrieve

container = retrieve(
    framework='xgboost',
    region=session.boto_region_name,
    version="1",
)

## 1. Unbalanced. 

We set the `version` variable; it is used below to build the S3 prefix and the local folder names, which helps with file management.

In [16]:
# Tag identifying this run; later cells interpolate it into the S3 prefix and
# the local results / prepared-data folder names.
version = 'unbalanced'

What version of the prepared data is it?: 1


Here we load the location of the files on S3 that were uploaded for this model. 

In [None]:
# The S3 locations come from the .py file created in the data preparation step.
train_location, val_location, test_location = (
    unbalanced["train"],
    unbalanced["val"],
    unbalanced["test"],
)

In [6]:
# Folder (key prefix) inside the bucket that this model version will use.
prefix = 'twitter_sentiment_{}'.format(version)

In [14]:
# Build the SageMaker estimator that will run the training job.
xgb_unbalanced = sagemaker.estimator.Estimator(
    image_uri=container,             # location of the container to use
    role=role,                       # current IAM role
    instance_count=1,                # number of compute instances
    instance_type='ml.m4.xlarge',    # kind of compute instance
    output_path=f's3://{session.default_bucket()}/{prefix}/output',
    sagemaker_session=session,
)

# Algorithm-specific hyperparameters, passed through unchanged to XGBoost.
xgb_unbalanced.set_hyperparameters(**{
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'num_round': 500,
})

### Fit the model

Set the training and validation datasets on S3 to be used by SageMaker. These variables tell the estimator where to find the data in S3 that will be used to fit the model.

In [29]:
# Wrap the S3 locations as CSV input channels for the training job.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=train_location,
    content_type='csv',
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data=val_location,
    content_type='csv',
)

In [30]:
# Launch the training job with the training and validation channels.
xgb_unbalanced.fit(
    {
        'train': s3_input_train,
        'validation': s3_input_validation,
    }
)

2022-03-30 15:41:01 Starting - Starting the training job...
2022-03-30 15:41:27 Starting - Preparing the instances for trainingProfilerReport-1648654860: InProgress
.........
2022-03-30 15:42:53 Downloading - Downloading input data......
2022-03-30 15:43:54 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2022-03-30:15:43:57:INFO] Running standalone xgboost training.[0m
[34m[2022-03-30:15:43:57:INFO] File size need to be processed in the node: 292.66mb. Available memory size in the node: 8489.82mb[0m
[34m[2022-03-30:15:43:57:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:43:57] S3DistributionType set as FullyReplicated[0m
[34m[15:44:00] 20456x6000 matrix with 122736000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-03-30:15:44:00:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:44:00] S3DistributionType set as FullyReplicated[0m
[34m[15:44:01] 5113x6000 

### Test the model

For this part of the process a transformer object will be created. This object can be understood as a function that takes the artifacts (the fitted model parameters) created by the trained model and uses them to make predictions on a new dataset. 
The test dataset will be given to the transformer and the results will be compared to the actual labels that were reserved for the test. 

In [33]:
# Create a transformer object from the trained estimator. It uses the artifacts
# produced by training to transform (i.e. predict on) the testing dataset.
# The estimator in this notebook is `xgb_unbalanced`; no variable named `xgb`
# is defined, so the original call raised NameError on a fresh kernel.
xgb_unbalanced_transformer = xgb_unbalanced.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
)

In [34]:
# Hand the location of the test set to the transformer so it can run the
# transformation (prediction) over it.
xgb_unbalanced_transformer.transform(
    test_location,
    content_type='text/csv',
    split_type='Line',
)

..............................[34mArguments: serve[0m
[34m[2022-03-30 16:36:48 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2022-03-30 16:36:48 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2022-03-30 16:36:48 +0000] [1] [INFO] Using worker: gevent[0m
[35mArguments: serve[0m
[35m[2022-03-30 16:36:48 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[35m[2022-03-30 16:36:48 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[35m[2022-03-30 16:36:48 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2022-03-30 16:36:48 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2022-03-30 16:36:48 +0000] [22] [INFO] Booting worker with pid: 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-30:16:36:48:INFO] Model loaded successfully for worker : 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-30:16:36:48:INFO] Model loaded successfully for worker : 22[0m
[34m[2022-03-30 16:36:48 +0000] [23] [INFO] Booting worker with pid: 23

In [35]:
# Block until the transform job has finished.
xgb_unbalanced_transformer.wait()

[34mArguments: serve[0m
[34m[2022-03-30 16:36:48 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2022-03-30 16:36:48 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2022-03-30 16:36:48 +0000] [1] [INFO] Using worker: gevent[0m
[35mArguments: serve[0m
[35m[2022-03-30 16:36:48 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[35m[2022-03-30 16:36:48 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[35m[2022-03-30 16:36:48 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2022-03-30 16:36:48 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2022-03-30 16:36:48 +0000] [22] [INFO] Booting worker with pid: 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-30:16:36:48:INFO] Model loaded successfully for worker : 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-03-30:16:36:48:INFO] Model loaded successfully for worker : 22[0m
[34m[2022-03-30 16:36:48 +0000] [23] [INFO] Booting worker with pid: 23[0m
  monkey.patch_all(subpro

After the transformation is done, we specify a local folder into which the results (which are created in an S3 folder) will be downloaded.

In [38]:
# Local folder that will receive the downloaded transform results.
data_dir = "results_{}".format(version)

Next, we use the following command to download the predictions made by the transformer object into the local folder.

In [39]:
!aws s3 cp --recursive $xgb_unbalanced_transformer.output_path $data_dir

download: s3://sagemaker-us-east-2-730413480526/xgboost-2022-03-30-16-31-55-849/test.csv.out to results/test.csv.out


Now the predictions are read with pandas into a dataframe.

In [40]:
# header=None because the first line of the file is already a prediction,
# not a column name.
raw_predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)

# Round every score and collect the results in a plain list, which is easier
# to compare against the real labels when computing metrics.
predictions = [round(score) for score in raw_predictions.squeeze().values]

Then we read the test labels that were set aside during data preparation and stored in a local folder.

In [61]:
# The labels file has no header row; column 0 (the first data column, not the
# index) holds the labels.
test_labels_frame = pd.read_csv(f"data_prepared_{version}/test_y.csv", header=None)

# Turn that column into a list so it can be compared with the predictions.
test_y = list(test_labels_frame[0])

Now the results are compared by creating a confusion matrix of the predictions vs. the real labels. 
From the confusion matrix we can calculate the metrics for classification models. We use the tools in the sklearn module.

In [65]:
from sklearn.metrics import confusion_matrix

# True labels first, predictions second; all optional arguments keep their
# defaults (no label subset, no sample weights, no normalization).
cm = confusion_matrix(test_y, predictions)
cm

array([[5933,   14],
       [ 359,   87]])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the confusion matrix as an annotated heatmap.
ax = plt.subplot()
# annot=True writes the count in each cell; fmt='g' disables scientific notation.
sns.heatmap(cm, annot=True, fmt='g', ax=ax)

# Labels, title and ticks.
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
# The title must be an f-string, otherwise the literal text "{version}" is shown.
ax.set_title(f'Confusion Matrix. Model : {version}')
ax.xaxis.set_ticklabels(['Normal', 'Violent'])
ax.yaxis.set_ticklabels(['Normal', 'Violent']);  # trailing ';' hides the repr of the tick labels

In [66]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Report the classification metrics for this model version, one per line.
print(f"Model version: {version}")
for metric_label, metric_fn in (
    ("Accuracy: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
):
    print(metric_label, metric_fn(test_y, predictions))

accuracy:  0.9416549350852494
precision:  0.8613861386138614
recall:  0.19506726457399104
