# Imports

In [None]:
from sklearn.utils import shuffle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd
import os
import sagemaker
import re

In [None]:
# s3_folders is the .py file created in the data preparation step; it holds one
# collection of S3 locations per dataset variant.
from data import s3_folders

# Each of these is indexed further down with the keys "train", "val" and "test".
unbalanced = s3_folders.unbalanced
underSample = s3_folders.underSample
overSample = s3_folders.overSample
combined = s3_folders.combined

# Model Training 

The model will be defined and trained using the data uploaded to S3.

In [None]:
import sagemaker
from sagemaker import get_execution_role

# Our current execution role is required when creating the model as the training
# and inference code will need to access the model artifacts.
role = get_execution_role()

# Store the current SageMaker session. It is reused below for the region name,
# the default bucket and as the session of every estimator.
session = sagemaker.Session()

In [None]:
# We need to retrieve the location of the container which is provided by Amazon for using XGBoost.
# As a matter of convenience, the training and inference code both use the same container.
from sagemaker.image_uris import retrieve

# The image is looked up for the region of the current session.
# NOTE(review): version "1" presumably selects the legacy built-in XGBoost image - confirm
# against the sagemaker.image_uris documentation before changing hyperparameters.
container = retrieve(framework = 'xgboost',region = session.boto_region_name, version = "1")

## 1. Unbalanced. 

We set the version variable for it will help us with file management.

In [None]:
# Dataset variant of this section; it is interpolated into the S3 prefix and the local folder names below.
version = "unbalanced"

Here we load the location of the files on S3 that were uploaded for this model. 

In [None]:
# S3 locations of the three splits of the unbalanced dataset.
test_location = unbalanced["test"] #the information is loaded from the .py file created in the data preparation step
val_location = unbalanced["val"]     # used as the 'validation' channel when fitting
train_location = unbalanced["train"] # used as the 'train' channel when fitting

In [None]:
# S3 prefix: the folder inside the session's default bucket where this model's output is written.
prefix = f'twitter_sentiment_{version}'

### Set the model

In [None]:
# Build the SageMaker estimator that runs the XGBoost training job for this variant.
xgb_unbalanced = sagemaker.estimator.Estimator(
    container,                     # location of the container to use
    role,                          # current IAM role
    instance_count=1,              # number of compute instances
    instance_type='ml.m4.xlarge',  # kind of compute instance
    # Training output goes under this variant's prefix in the default bucket.
    output_path=f's3://{session.default_bucket()}/{prefix}/output',
    sagemaker_session=session,
)

# Algorithm-specific parameters, handed to the estimator as keyword arguments.
xgb_unbalanced.set_hyperparameters(**{
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'num_round': 500,
})

### Fit the model

Define the training and validation data sets on S3 that SageMaker will use. These variables tell the model where to find the data in S3 that will be used to fit it.

In [None]:
# Wrap the S3 locations of the train and validation splits as CSV inputs for the training job.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv')

In [None]:
# Start training, passing the two inputs under the 'train' and 'validation' channel names.
xgb_unbalanced.fit({'train': s3_input_train, 'validation': s3_input_validation})

### Test the model

For this part of the process a transformer object will be created. This object can be understood as a function that takes the artifacts created during training (the fitted model parameters) and uses them to make predictions on a new data set.
The test data set will be given to the transformer and the results will be compared to the actual labels that were reserved for the test.

In [None]:
# Create a transformer object. It uses the artifacts created by the estimator to
# transform (i.e. create predictions for) the testing dataset.
xgb_unbalanced_transformer = xgb_unbalanced.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [None]:
# The location of the test set is passed to the transformer to perform the transformation (predict).
# The input is declared as CSV and is split line by line.
xgb_unbalanced_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [None]:
# Wait until the transform job is done before its output is downloaded below.
xgb_unbalanced_transformer.wait()

After the transformation is done, we specify a local folder into which the results (which are created in an S3 folder) can be downloaded from S3.

In [None]:
# Local folder that receives this variant's predictions downloaded from S3.
data_dir = f"results_{version}"
# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the folder and creating it.
os.makedirs(data_dir, exist_ok=True)

Next, using the next command we download the predictions made by the transformer object into the local folder.

In [None]:
# Shell command (AWS CLI): recursively copy the transformer's S3 output path into the local results folder.
!aws s3 cp --recursive $xgb_unbalanced_transformer.output_path $data_dir

Now the predictions are read with pandas into a dataframe.

In [None]:
# The output file has no header row: its first line is already a prediction.
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
# Pick the single column explicitly instead of squeeze(): squeeze() collapses a
# one-row file to a scalar, which has no `.values`. Scores are rounded to the
# nearest integer class label and returned as a plain list for the metrics below.
predictions = predictions.iloc[:, 0].round().astype(int).tolist()

Then we read the labels that were reserved in the data preparation in a local folder for the test.

In [None]:
# Read the reserved test labels (file without header) and keep the first column
# as a plain list so it can be compared with the predictions.
test_y = pd.read_csv(f"data_prepared_{version}/test_y.csv", header=None)[0].tolist()

Now the results are compared by creating a confusion matrix from the predictions and the real labels.
From the confusion matrix we can calculate the metrics for classification models. We use the tools in the sklearn module.

In [None]:
from sklearn.metrics import confusion_matrix

# True labels first, predictions second; every optional argument keeps its default.
cm = confusion_matrix(test_y, predictions)
cm

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

ax = plt.subplot()
# annot=True writes the count in each cell; fmt='g' disables scientific notation.
sns.heatmap(cm, annot=True, fmt='g', ax=ax)

# Axis labels and title in one call, then the class names on both axes.
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title=f'Confusion Matrix. Model : {version}',
)
ax.xaxis.set_ticklabels(['Normal', 'Violent'])
ax.yaxis.set_ticklabels(['Normal', 'Violent'])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Report each metric for the current variant, always comparing labels against predictions.
print(f"Model version: {version}")
for label, scorer in (
    ("Accuracy: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
):
    print(label, scorer(test_y, predictions))

## 2. UnderSampled

We set the version variable for it will help us with file management.

In [None]:
# Dataset variant of this section; it is interpolated into the S3 prefix and the local folder names below.
version = "underSample"

Here we load the location of the files on S3 that were uploaded for this model. 

In [None]:
# S3 locations of the three splits of the under-sampled dataset.
test_location = underSample["test"] #the information is loaded from the .py file created in the data preparation step
val_location = underSample["val"]     # used as the 'validation' channel when fitting
train_location = underSample["train"] # used as the 'train' channel when fitting

In [None]:
# S3 prefix: the folder inside the session's default bucket where this model's output is written.
prefix = f'twitter_sentiment_{version}'

### Set the model

In [None]:
# Build the SageMaker estimator that runs the XGBoost training job for this variant.
xgb_underSample = sagemaker.estimator.Estimator(
    container,                     # location of the container to use
    role,                          # current IAM role
    instance_count=1,              # number of compute instances
    instance_type='ml.m4.xlarge',  # kind of compute instance
    # Training output goes under this variant's prefix in the default bucket.
    output_path=f's3://{session.default_bucket()}/{prefix}/output',
    sagemaker_session=session,
)

# Algorithm-specific parameters, handed to the estimator as keyword arguments.
xgb_underSample.set_hyperparameters(**{
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'num_round': 500,
})

### Fit the model

Define the training and validation data sets on S3 that SageMaker will use. These variables tell the model where to find the data in S3 that will be used to fit it.

In [None]:
# Wrap the S3 locations of the train and validation splits as CSV inputs for the training job.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv')

In [None]:
# Start training, passing the two inputs under the 'train' and 'validation' channel names.
xgb_underSample.fit({'train': s3_input_train, 'validation': s3_input_validation})

### Test the model

For this part of the process a transformer object will be created. This object can be understood as a function that takes the artifacts created during training (the fitted model parameters) and uses them to make predictions on a new data set.
The test data set will be given to the transformer and the results will be compared to the actual labels that were reserved for the test.

In [None]:
# Create a transformer object. It uses the artifacts created by the estimator to
# transform (i.e. create predictions for) the testing dataset.
xgb_underSample_transformer = xgb_underSample.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [None]:
# The location of the test set is passed to the transformer to perform the transformation (predict).
# The input is declared as CSV and is split line by line.
xgb_underSample_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [None]:
# Wait until the transform job is done before its output is downloaded below.
xgb_underSample_transformer.wait()

After the transformation is done, we specify a local folder into which the results (which are created in an S3 folder) can be downloaded from S3.

In [None]:
# Local folder that receives this variant's predictions downloaded from S3.
data_dir = f"results_{version}"
# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the folder and creating it.
os.makedirs(data_dir, exist_ok=True)

Next, using the next command we download the predictions made by the transformer object into the local folder.

In [None]:
# Shell command (AWS CLI): recursively copy the transformer's S3 output path into the local results folder.
!aws s3 cp --recursive $xgb_underSample_transformer.output_path $data_dir

Now the predictions are read with pandas into a dataframe.

In [None]:
# The output file has no header row: its first line is already a prediction.
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
# Pick the single column explicitly instead of squeeze(): squeeze() collapses a
# one-row file to a scalar, which has no `.values`. Scores are rounded to the
# nearest integer class label and returned as a plain list for the metrics below.
predictions = predictions.iloc[:, 0].round().astype(int).tolist()

Then we read the labels that were reserved in the data preparation in a local folder for the test.

In [None]:
# Read the reserved test labels (file without header) and keep the first column
# as a plain list so it can be compared with the predictions.
test_y = pd.read_csv(f"data_prepared_{version}/test_y.csv", header=None)[0].tolist()

Now the results are compared by creating a confusion matrix from the predictions and the real labels.
From the confusion matrix we can calculate the metrics for classification models. We use the tools in the sklearn module.

In [None]:
from sklearn.metrics import confusion_matrix

# True labels first, predictions second; every optional argument keeps its default.
cm = confusion_matrix(test_y, predictions)
cm

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

ax = plt.subplot()
# annot=True writes the count in each cell; fmt='g' disables scientific notation.
sns.heatmap(cm, annot=True, fmt='g', ax=ax)

# Axis labels and title in one call, then the class names on both axes.
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title=f'Confusion Matrix. Model : {version}',
)
ax.xaxis.set_ticklabels(['Normal', 'Violent'])
ax.yaxis.set_ticklabels(['Normal', 'Violent'])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Report each metric for the current variant, always comparing labels against predictions.
print(f"Model version: {version}")
for label, scorer in (
    ("Accuracy: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
):
    print(label, scorer(test_y, predictions))

## 3. OverSampled

We set the version variable for it will help us with file management.

In [None]:
# Dataset variant of this section; it is interpolated into the S3 prefix and the local folder names below.
version = "overSample"

Here we load the location of the files on S3 that were uploaded for this model. 

In [None]:
# S3 locations of the three splits of the over-sampled dataset.
test_location = overSample["test"] #the information is loaded from the .py file created in the data preparation step
val_location = overSample["val"]     # used as the 'validation' channel when fitting
train_location = overSample["train"] # used as the 'train' channel when fitting

In [None]:
# S3 prefix: the folder inside the session's default bucket where this model's output is written.
prefix = f'twitter_sentiment_{version}'

### Set the model

In [None]:
# Build the SageMaker estimator that runs the XGBoost training job for this variant.
xgb_overSample = sagemaker.estimator.Estimator(
    container,                     # location of the container to use
    role,                          # current IAM role
    instance_count=1,              # number of compute instances
    instance_type='ml.m4.xlarge',  # kind of compute instance
    # Training output goes under this variant's prefix in the default bucket.
    output_path=f's3://{session.default_bucket()}/{prefix}/output',
    sagemaker_session=session,
)

# Algorithm-specific parameters, handed to the estimator as keyword arguments.
xgb_overSample.set_hyperparameters(**{
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'num_round': 500,
})

### Fit the model

Define the training and validation data sets on S3 that SageMaker will use. These variables tell the model where to find the data in S3 that will be used to fit it.

In [None]:
# Wrap the S3 locations of the train and validation splits as CSV inputs for the training job.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv')

In [None]:
# Start training, passing the two inputs under the 'train' and 'validation' channel names.
xgb_overSample.fit({'train': s3_input_train, 'validation': s3_input_validation})

### Test the model

For this part of the process a transformer object will be created. This object can be understood as a function that takes the artifacts created during training (the fitted model parameters) and uses them to make predictions on a new data set.
The test data set will be given to the transformer and the results will be compared to the actual labels that were reserved for the test.

In [None]:
# Create a transformer object. It uses the artifacts created by the estimator to
# transform (i.e. create predictions for) the testing dataset.
xgb_overSample_transformer = xgb_overSample.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [None]:
# The location of the test set is passed to the transformer to perform the transformation (predict).
# The input is declared as CSV and is split line by line.
xgb_overSample_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [None]:
# Wait until the transform job is done before its output is downloaded below.
xgb_overSample_transformer.wait()

After the transformation is done, we specify a local folder into which the results (which are created in an S3 folder) can be downloaded from S3.

In [None]:
# Local folder that receives this variant's predictions downloaded from S3.
data_dir = f"results_{version}"
# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the folder and creating it.
os.makedirs(data_dir, exist_ok=True)

Next, using the next command we download the predictions made by the transformer object into the local folder.

In [None]:
# Shell command (AWS CLI): recursively copy the transformer's S3 output path into the local results folder.
!aws s3 cp --recursive $xgb_overSample_transformer.output_path $data_dir

Now the predictions are read with pandas into a dataframe.

In [None]:
# The output file has no header row: its first line is already a prediction.
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
# Pick the single column explicitly instead of squeeze(): squeeze() collapses a
# one-row file to a scalar, which has no `.values`. Scores are rounded to the
# nearest integer class label and returned as a plain list for the metrics below.
predictions = predictions.iloc[:, 0].round().astype(int).tolist()

Then we read the labels that were reserved in the data preparation in a local folder for the test.

In [None]:
# Read the reserved test labels (file without header) and keep the first column
# as a plain list so it can be compared with the predictions.
test_y = pd.read_csv(f"data_prepared_{version}/test_y.csv", header=None)[0].tolist()

Now the results are compared by creating a confusion matrix from the predictions and the real labels.
From the confusion matrix we can calculate the metrics for classification models. We use the tools in the sklearn module.

In [None]:
from sklearn.metrics import confusion_matrix

# True labels first, predictions second; every optional argument keeps its default.
cm = confusion_matrix(test_y, predictions)
cm

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

ax = plt.subplot()
# annot=True writes the count in each cell; fmt='g' disables scientific notation.
sns.heatmap(cm, annot=True, fmt='g', ax=ax)

# Axis labels and title in one call, then the class names on both axes.
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title=f'Confusion Matrix. Model : {version}',
)
ax.xaxis.set_ticklabels(['Normal', 'Violent'])
ax.yaxis.set_ticklabels(['Normal', 'Violent'])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Report each metric for the current variant, always comparing labels against predictions.
print(f"Model version: {version}")
for label, scorer in (
    ("Accuracy: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
):
    print(label, scorer(test_y, predictions))

## 4. Combined

We set the version variable for it will help us with file management.

In [None]:
# Dataset variant of this section; it is interpolated into the S3 prefix and the local folder names below.
version = "combined"

Here we load the location of the files on S3 that were uploaded for this model. 

In [None]:
# S3 locations of the three splits of the combined dataset.
test_location = combined["test"] #the information is loaded from the .py file created in the data preparation step
val_location = combined["val"]     # used as the 'validation' channel when fitting
train_location = combined["train"] # used as the 'train' channel when fitting

In [None]:
# S3 prefix: the folder inside the session's default bucket where this model's output is written.
prefix = f'twitter_sentiment_{version}'

### Set the model

In [None]:
# Build the SageMaker estimator that runs the XGBoost training job for this variant.
xgb_combined = sagemaker.estimator.Estimator(
    container,                     # location of the container to use
    role,                          # current IAM role
    instance_count=1,              # number of compute instances
    instance_type='ml.m4.xlarge',  # kind of compute instance
    # Training output goes under this variant's prefix in the default bucket.
    output_path=f's3://{session.default_bucket()}/{prefix}/output',
    sagemaker_session=session,
)

# Algorithm-specific parameters, handed to the estimator as keyword arguments.
xgb_combined.set_hyperparameters(**{
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'num_round': 500,
})

### Fit the model

Define the training and validation data sets on S3 that SageMaker will use. These variables tell the model where to find the data in S3 that will be used to fit it.

In [None]:
# Wrap the S3 locations of the train and validation splits as CSV inputs for the training job.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv')

In [None]:
# Start training, passing the two inputs under the 'train' and 'validation' channel names.
xgb_combined.fit({'train': s3_input_train, 'validation': s3_input_validation})

### Test the model

For this part of the process a transformer object will be created. This object can be understood as a function that takes the artifacts created during training (the fitted model parameters) and uses them to make predictions on a new data set.
The test data set will be given to the transformer and the results will be compared to the actual labels that were reserved for the test.

In [None]:
# Create a transformer object. It uses the artifacts created by the estimator to
# transform (i.e. create predictions for) the testing dataset.
xgb_combined_transformer = xgb_combined.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [None]:
# The location of the test set is passed to the transformer to perform the transformation (predict).
# The input is declared as CSV and is split line by line.
xgb_combined_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [None]:
# Wait until the transform job is done before its output is downloaded below.
xgb_combined_transformer.wait()

After the transformation is done, we specify a local folder into which the results (which are created in an S3 folder) can be downloaded from S3.

In [None]:
# Local folder that receives this variant's predictions downloaded from S3.
data_dir = f"results_{version}"
# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the folder and creating it.
os.makedirs(data_dir, exist_ok=True)

Next, using the next command we download the predictions made by the transformer object into the local folder.

In [None]:
# Shell command (AWS CLI): recursively copy the transformer's S3 output path into the local results folder.
!aws s3 cp --recursive $xgb_combined_transformer.output_path $data_dir

Now the predictions are read with pandas into a dataframe.

In [None]:
# The output file has no header row: its first line is already a prediction.
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
# Pick the single column explicitly instead of squeeze(): squeeze() collapses a
# one-row file to a scalar, which has no `.values`. Scores are rounded to the
# nearest integer class label and returned as a plain list for the metrics below.
predictions = predictions.iloc[:, 0].round().astype(int).tolist()

Then we read the labels that were reserved in the data preparation in a local folder for the test.

In [None]:
# Read the reserved test labels (file without header) and keep the first column
# as a plain list so it can be compared with the predictions.
test_y = pd.read_csv(f"data_prepared_{version}/test_y.csv", header=None)[0].tolist()

Now the results are compared by creating a confusion matrix from the predictions and the real labels.
From the confusion matrix we can calculate the metrics for classification models. We use the tools in the sklearn module.

In [None]:
from sklearn.metrics import confusion_matrix

# True labels first, predictions second; every optional argument keeps its default.
cm = confusion_matrix(test_y, predictions)
cm

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

ax = plt.subplot()
# annot=True writes the count in each cell; fmt='g' disables scientific notation.
sns.heatmap(cm, annot=True, fmt='g', ax=ax)

# Axis labels and title in one call, then the class names on both axes.
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title=f'Confusion Matrix. Model : {version}',
)
ax.xaxis.set_ticklabels(['Normal', 'Violent'])
ax.yaxis.set_ticklabels(['Normal', 'Violent'])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Report each metric for the current variant, always comparing labels against predictions.
print(f"Model version: {version}")
for label, scorer in (
    ("Accuracy: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
):
    print(label, scorer(test_y, predictions))

#### The combined approach is the one that maximizes the Recall metric. Therefore, we will choose this model.
We will save the name of the training job so it can be accessed later in the deployment.

In [None]:
# Read the job name through the estimator's public handle to its latest
# training job instead of the private `_current_job_name` attribute.
training_job_name = xgb_combined.latest_training_job.name
# Show the name itself (not just its type), since this is the value that gets saved.
print(training_job_name)

In [None]:
import json

# Persist the name of the chosen training job so it can be read back at deployment time.
job_record = {"training_job": training_job_name}
with open("data/training_job_AWS.json", "w") as job_file:
    json.dump(job_record, job_file)