####  XGBoost SageMaker (Deploy)
Práctica Churn

In [None]:
import pandas as pd
import sagemaker
from sagemaker.predictor import csv_serializer
import boto3
import matplotlib.pyplot as plt
import os
import sklearn.model_selection
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PowerTransformer,LabelEncoder
import numpy as np



Uso de boto3 para acceder a los servicios de AWS

In [None]:
# High-level boto3 handle for S3; reused further down to read and upload objects.
s3 = boto3.resource('s3')

# Print the name of every bucket the current credentials can list.
for bucket_name in (bucket.name for bucket in s3.buckets.all()):
    print(bucket_name)

Creamos un bucket en S3 llamado **ai6deploy**

Subimos el dataset al bucket

Con boto3 accedemos al dataset

In [None]:
# Full S3 URI of the raw dataset (informational; the read below goes through boto3).
s3_path = 's3://ai6deploy/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Fetch the object and parse its streaming body directly as CSV.
dataset_bucket = s3.Bucket('ai6deploy')
s3_object = dataset_bucket.Object('WA_Fn-UseC_-Telco-Customer-Churn.csv').get()
df = pd.read_csv(s3_object['Body'])
df.head()

Pasamos los datos categóricos a numéricos

In [None]:
# Drop the identifier and the two columns left out of the model; copy so `df` stays untouched.
df1 = df.drop(['customerID', 'gender', 'PhoneService'], axis=1).copy()

# Encode the target column as integer class labels.
le = LabelEncoder()
df1['Churn'] = le.fit_transform(df1['Churn'])

# Blank TotalCharges entries (' ') become NaN so the column can be cast to float.
df1['TotalCharges'] = df1['TotalCharges'].replace(' ', np.nan).astype(float)

# Fold 'No internet service' into 'No' so these columns end up with fewer dummy levels.
internet_addon_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                       'TechSupport', 'StreamingTV', 'StreamingMovies']
df1[internet_addon_cols] = df1[internet_addon_cols].replace('No internet service', 'No')

# One-hot encode the remaining categorical columns directly as 0/1 integers
# (no separate True/False -> 1/0 replacement needed afterwards).
df1 = pd.get_dummies(df1, dtype=int)

# Take the target BEFORE removing it from the feature frame. The previous order
# dropped 'Churn' first (KeyError on the lookup) and assigned the None returned by
# replace(..., inplace=True) to X.
y = df1['Churn']
df1 = df1.drop('Churn', axis=1)
X = df1


In [None]:
# Fixed seed so the train/validation/test partition is identical on every
# Restart & Run All (the splits were previously re-drawn at random on each run).
RANDOM_STATE = 42

# First hold out 33% of the rows as the test set ...
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.33, random_state=RANDOM_STATE)
# ... then carve 33% of the remaining rows off as the validation set.
X_train, X_val, Y_train, Y_val = sklearn.model_selection.train_test_split(
    X_train, Y_train, test_size=0.33, random_state=RANDOM_STATE)

In [None]:
# Local staging directory for the CSV files that are uploaded to S3 later.
data_dir = './data/churn_pred'
# exist_ok=True makes the cell idempotent without a separate existence check
# (avoids the check-then-create race of `if not os.path.exists(...)`).
os.makedirs(data_dir, exist_ok=True)

In [None]:
# The test features are written on their own (no label column), without header or index.
X_test.to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

# Validation and training files put the label in the first column, followed by the
# features, again without header or index.
for csv_name, labels, features in (('validation.csv', Y_val, X_val),
                                   ('train.csv', Y_train, X_train)):
    labelled = pd.concat([labels, features], axis=1)
    labelled.to_csv(os.path.join(data_dir, csv_name), header=False, index=False)

#### Cargar a S3

In [None]:
# Key prefix ("folder") under which this notebook stores its files in S3.
prefix = 'churn_xgboost'

In [None]:
# Upload each locally written split to the project bucket under `prefix`.
upload_bucket = s3.Bucket('ai6deploy')
for csv_name in ('test.csv', 'validation.csv', 'train.csv'):
    upload_bucket.upload_file(os.path.join(data_dir, csv_name), prefix + '/' + csv_name)

#### Entrenar un modelo XGBoost

#Region name
# SageMaker session, plus the default bucket and region derived from the environment.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

# IAM role this notebook runs under; it is handed to the estimator below.
role = sagemaker.get_execution_role()
print(role)

In [None]:
# Resolve the URI of the built-in XGBoost training image for the current region.
# NOTE(review): version "latest" is not pinned — consider an explicit version; confirm
# against the SageMaker docs before changing, since hyperparameter support differs.
xgboost_image_args = dict(framework='xgboost', region=region, version="latest")
container = sagemaker.image_uris.retrieve(**xgboost_image_args)

In [None]:
# Model artifacts are written under the session's default bucket, below `prefix`.
model_output_path = f's3://{sagemaker_session.default_bucket()}/{prefix}/output'

xgb = sagemaker.estimator.Estimator(
    container,                            # training container image URI
    role,                                 # IAM role used by the training job
    instance_count=1,                     # number of training instances
    instance_type='ml.m4.xlarge',         # training instance type
    output_path=model_output_path,        # where the model artifacts are saved
    sagemaker_session=sagemaker_session,
)

In [None]:
# Training hyperparameters for the XGBoost job.
# The target is the label-encoded Churn column (assumed binary — confirm), so a
# classification objective is used: 'binary:logistic' yields probabilities in [0, 1],
# whereas the previous regression objective ('reg:linear') produced unbounded scores.
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective='binary:logistic',
    early_stopping_rounds=10,   # relies on the 'validation' channel passed to fit()
    num_round=200,
)

In [None]:
# Describe the training and validation channels: both are CSV objects uploaded earlier.
s3_input_train = sagemaker.TrainingInput(
    s3_data="s3://ai6deploy/churn_xgboost/train.csv",
    content_type='csv',
)
s3_input_validation = sagemaker.TrainingInput(
    s3_data="s3://ai6deploy/churn_xgboost/validation.csv",
    content_type='csv',
)

In [None]:
# Launch the training job with one input per named channel.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
xgb.fit(training_channels)

### Test the model

In [None]:
# Batch-transform handle built from the trained estimator (one ml.m4.xlarge instance).
xgb_transformer = xgb.transformer(
    instance_type='ml.m4.xlarge',
    instance_count=1,
)

In [None]:
# S3 URI of the unlabeled test features that the batch transform job will score.
test_location = "s3://ai6deploy/churn_xgboost/test.csv"

In [None]:
# Score the test file; the CSV input is split on line boundaries.
xgb_transformer.transform(
    test_location,
    content_type='text/csv',
    split_type='Line',
)

In [None]:
# Show the S3 location the transformer reports for its output.
xgb_transformer.output_path

In [None]:
# Locate the batch-transform result from the transformer itself rather than a
# hard-coded bucket / job name, which goes stale every time the job is re-run.
# The result object is expected at "<output_path>/<input file name>.out".
output_uri = xgb_transformer.output_path
if not output_uri.startswith('s3://'):
    raise ValueError('Unexpected transform output location: {!r}'.format(output_uri))
output_bucket, _sep, output_prefix = output_uri[len('s3://'):].partition('/')
output_name = os.path.basename(test_location) + '.out'
output_key = '/'.join(part for part in (output_prefix.strip('/'), output_name) if part)

s3_object = s3.Bucket(output_bucket).Object(output_key).get()
# The result file has no header row; header=None keeps the first prediction as data
# instead of consuming it as column names.
Y_pred = pd.read_csv(s3_object['Body'], header=None)
Y_pred.head()

### Deploy the trained model

In [None]:
# Stand up a real-time endpoint for the trained model on one ml.m4.xlarge instance.
xgb_predictor = xgb.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

### Use the model

In [None]:
# Inspect the test features as a plain NumPy array.
X_test.to_numpy()

In [None]:
# Serialize the test features as CSV text: one line per row, comma-separated values.
payload = '\n'.join(
    ','.join(str(entry) for entry in row)
    for row in X_test.values
)
payload

In [None]:
# Use the endpoint created by the deploy step above instead of a hard-coded name
# from an earlier run, which would not exist after a fresh Restart & Run All.
endpoint_name = xgb_predictor.endpoint_name

In [None]:
# Invoke the deployed endpoint through the SageMaker *runtime* client. The client is
# taken from `sagemaker_session`, the session object created earlier in this notebook
# (the name `session` was never defined and raised a NameError).
response = sagemaker_session.sagemaker_runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='text/csv',
    Body=payload,
)

# Deserialize the response body: decode it and parse the predictions into a float
# array. Both commas and newlines are accepted as separators between predictions.
result = response['Body'].read().decode("utf-8")
Y_pred = np.array(
    [float(token) for token in result.replace('\n', ',').split(',') if token.strip()]
)

In [None]:
# Display the predictions returned by the endpoint.
Y_pred

### Create Lambda function

```python
# We need to use the low-level library to interact with SageMaker since the SageMaker API
# is not available natively through Lambda.
import boto3

# And we need the regular expression library to do some of the data processing
import re

# Raw strings keep the regex escapes intact; the previous plain literals contained
# invalid string escape sequences (e.g. "\." and "\s") that trigger warnings on
# current Python versions. The matched characters are unchanged.
# Punctuation that is deleted outright: . ; : ! ' ? , " ( ) [ ]
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
# Double <br /> tags, hyphens and slashes are replaced by a single space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(-)|(/)")

def review_to_words(review):
    """Lower-case `review` and normalize its punctuation.

    Characters matched by REPLACE_NO_SPACE are removed; matches of
    REPLACE_WITH_SPACE are replaced by a single space.

    Args:
        review: The raw text to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    words = REPLACE_NO_SPACE.sub("", review.lower())
    words = REPLACE_WITH_SPACE.sub(" ", words)
    return words
    
def bow_encoding(words, vocabulary):
    """Build a bag-of-words count vector for `words`.

    Args:
        words: Whitespace-separated text.
        vocabulary: Container mapping each known word to its position in the
            output vector; words not present in it are ignored.

    Returns:
        A list of length len(vocabulary) where each position holds how many
        times the corresponding word occurs in `words`.
    """
    counts = [0] * len(vocabulary)  # every vocabulary slot starts at zero
    for token in words.split():
        if token not in vocabulary:
            continue  # out-of-vocabulary words contribute nothing
        counts[vocabulary[token]] += 1
    return counts


def lambda_handler(event, context):
    """Lambda entry point: encode the request body, query the endpoint, return 0/1.

    The text in event['body'] is cleaned with review_to_words, turned into a
    bag-of-words vector with bow_encoding, and sent as one CSV row to the
    SageMaker endpoint. The endpoint's reply is parsed as a float and rounded.

    NOTE(review): this handler encodes free text as a bag of words; confirm that
    this matches the input the deployed model expects before using it.

    Args:
        event: Invocation event; its 'body' entry holds the text to score.
        context: Lambda context object (unused).

    Returns:
        A dict with 'statusCode', 'headers' and 'body', where 'body' is the
        rounded prediction as a string.
    """
    vocab = "*** ACTUAL VOCABULARY GOES HERE ***"

    # Clean the incoming text and encode it against the vocabulary.
    cleaned_text = review_to_words(event['body'])
    features = bow_encoding(cleaned_text, vocab)

    # The SageMaker runtime client is what lets us invoke the deployed endpoint.
    sm_runtime = boto3.Session().client('sagemaker-runtime')

    # Send the encoded features as a single comma-separated CSV row.
    csv_row = ','.join(map(str, features)).encode('utf-8')
    endpoint_reply = sm_runtime.invoke_endpoint(
        EndpointName='***ENDPOINT NAME HERE***',  # name of the endpoint we created
        ContentType='text/csv',                   # data format the endpoint expects
        Body=csv_row,
    )

    # The reply body carries the inference result as text.
    raw_prediction = endpoint_reply['Body'].read().decode('utf-8')

    # Round so that the caller only ever receives '1' or '0'.
    predicted_label = round(float(raw_prediction))

    response_headers = {'Content-Type': 'text/plain', 'Access-Control-Allow-Origin': '*'}
    return {
        'statusCode': 200,
        'headers': response_headers,
        'body': str(predicted_label),
    }
```