# Project Description

This is a project of machine learning model development on AWS.

# Create SageMaker Session

In [1]:
bucket = 'insert_your_bucket_name_here'
region_name = 'insert_your_region_here'
role = 'insert_your_sagemaker_arn_here'
FRAMEWORK_VERSION = 'insert_your_desired_sklearn_version'

In [2]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
import warnings

# Suppress warnings if needed
warnings.filterwarnings('ignore')

# Initialize Boto3 and SageMaker session with the correct region
session = boto3.Session(region_name=region_name)
sm_boto = session.client('sagemaker')

# Create a SageMaker session
sm_session = sagemaker.Session(boto_session=session)
region = sm_session.boto_region_name

print(f'Using bucket: {bucket}')
print(f'Using region: {region}')

Using bucket: thisisthebucketforbigdatasagemaker
Using region: ap-southeast-2


# Data Preprocessing

## Data Loading

In [3]:
df = pd.read_csv('../Dataset/lung-cancer-dataset.csv')
df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


## Data Checking

### Check for data shape

In [4]:
df.shape

(309, 16)

In [5]:
df.columns

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'LUNG_CANCER'],
      dtype='object')

### Check for missing values

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [7]:
df.isna().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

### Renaming columns for consistency

In [8]:
df = df.rename(columns = {'CHRONIC DISEASE':'CHRONIC_DISEASE'})
df = df.rename(columns = {'FATIGUE ':'FATIGUE'})
df = df.rename(columns = {'ALLERGY ':'ALLERGY'})
df = df.rename(columns = {'ALCOHOL CONSUMING':'ALCOHOL_CONSUMING'})
df = df.rename(columns = {'SHORTNESS OF BREATH':'SHORTNESS_OF_BREATH'})
df = df.rename(columns = {'SWALLOWING DIFFICULTY':'SWALLOWING_DIFFICULTY'})
df = df.rename(columns = {'CHEST PAIN':'CHEST_PAIN'})
df.columns

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC_DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING',
       'ALCOHOL_CONSUMING', 'COUGHING', 'SHORTNESS_OF_BREATH',
       'SWALLOWING_DIFFICULTY', 'CHEST_PAIN', 'LUNG_CANCER'],
      dtype='object')

## Tidying up the data

Since there is no missing values, we're going to skip missing value handlings

### Replace F to 1 and M to 0

In [9]:
df['GENDER'] = df['GENDER'].replace({'F': 1, 'M': 0})

gender_na_count = df['GENDER'].isna().sum()
gender_unique = df['GENDER'].unique()
print(f'N/A values in Gender: {gender_na_count}')
print(f'Gender values: {gender_unique}')

N/A values in Gender: 0
Gender values: [0 1]


### Replace YES to 1 and NO to 0

In [10]:
df['LUNG_CANCER'] = df['LUNG_CANCER'].replace({'YES': 1, 'NO': 0})

lungcancer_na_count = df['LUNG_CANCER'].isna().sum()
lungcancer_unique = df['LUNG_CANCER'].unique()
print(f'N/A values in Lung Cancer: {lungcancer_na_count}')
print(f'Lung Cancer values: {lungcancer_unique}')

N/A values in Lung Cancer: 0
Lung Cancer values: [1 0]


### Replace the rest of the values as binary

In [11]:
for i in df:
    if i != 'GENDER' and i != 'AGE' and i != 'LUNG_CANCER':
        df[i] = df[i].replace({2: 1, 1: 0})

df.isna().sum() # Recheck for missing values

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC_DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL_CONSUMING        0
COUGHING                 0
SHORTNESS_OF_BREATH      0
SWALLOWING_DIFFICULTY    0
CHEST_PAIN               0
LUNG_CANCER              0
dtype: int64

### Recheck data condition

In [12]:
df.head(10)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
0,0,69,0,1,1,0,0,1,0,1,1,1,1,1,1,1
1,0,74,1,0,0,0,1,1,1,0,0,0,1,1,1,1
2,1,59,0,0,0,1,0,1,0,1,0,1,1,0,1,0
3,0,63,1,1,1,0,0,0,0,0,1,0,0,1,1,0
4,1,63,0,1,0,0,0,0,0,1,0,1,1,0,0,0
5,1,75,0,1,0,0,1,1,1,1,0,1,1,0,0,1
6,0,52,1,0,0,0,0,1,0,1,1,1,1,0,1,1
7,1,51,1,1,1,1,0,1,1,0,0,0,1,1,0,1
8,1,68,1,0,1,0,0,1,0,0,0,0,0,0,0,0
9,0,53,1,1,1,1,1,0,1,0,1,0,0,1,1,1


## Train Test Splitting

### Define features and target

In [13]:
features = list(df.columns)
features

['GENDER',
 'AGE',
 'SMOKING',
 'YELLOW_FINGERS',
 'ANXIETY',
 'PEER_PRESSURE',
 'CHRONIC_DISEASE',
 'FATIGUE',
 'ALLERGY',
 'WHEEZING',
 'ALCOHOL_CONSUMING',
 'COUGHING',
 'SHORTNESS_OF_BREATH',
 'SWALLOWING_DIFFICULTY',
 'CHEST_PAIN',
 'LUNG_CANCER']

In [14]:
target = features.pop(-1)
target

'LUNG_CANCER'

### Splitting features and target

In [15]:
df_input = df[features]
df_output = df[target]

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_input, df_output, test_size = 0.2, random_state = 0)

In [17]:
train = x_train
train[target] = y_train
train

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
7,1,51,1,1,1,1,0,1,1,0,0,0,1,1,0,1
45,0,72,1,1,1,1,1,1,0,1,1,1,1,1,1,1
97,0,59,1,0,0,1,0,0,0,0,1,1,1,0,0,1
92,0,52,1,0,0,0,1,1,1,1,1,0,0,1,1,1
198,1,73,1,1,1,0,1,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,1,64,1,1,1,1,1,1,1,1,0,1,1,1,1,1
192,1,51,1,1,1,1,1,0,0,0,0,0,0,0,0,1
117,1,51,1,1,1,1,0,1,1,0,0,0,1,1,0,1
47,1,64,1,1,0,1,0,1,0,1,1,1,0,1,1,1


In [18]:
test = x_test
test[target] = y_test
test

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
63,1,68,0,0,1,0,1,0,1,1,1,0,0,1,0,1
231,0,64,1,1,1,1,0,1,1,0,1,1,1,0,1,1
167,0,62,0,0,0,0,1,0,1,0,1,1,1,1,1,1
159,0,68,0,0,1,1,1,0,0,0,1,0,0,0,0,0
189,1,67,1,1,1,1,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,0,59,0,1,1,0,0,0,0,0,0,0,0,1,1,0
250,0,69,1,1,1,1,0,1,1,0,0,0,1,1,0,1
33,1,67,1,1,1,0,1,0,0,0,0,0,1,1,1,1
21,1,64,0,1,1,1,0,0,1,1,0,1,0,1,0,1


## Save Splitted Data As CSV

The purpose is to send the splitted train and test data to S3 bucket

In [19]:
train.to_csv('../Dataset/train.csv', index=False)
test.to_csv('../Dataset/test.csv', index=False)

## Upload Data To S3

In [20]:
prefix = 'dataset/lung_cancer'

In [21]:
trainpath = sm_session.upload_data(path='../Dataset/train.csv', bucket=bucket, key_prefix=prefix)
testpath = sm_session.upload_data(path='../Dataset/test.csv', bucket=bucket, key_prefix=prefix)

## Implementation

### Create estimator

In [22]:
from sagemaker.sklearn.estimator import SKLearn

sklearn_estimator = SKLearn(
    entry_point='script.py', 
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    framework_version=FRAMEWORK_VERSION,
    sagemaker_session=sm_session,
    script_mode=True
)

### Train the model asynchronously

Launch training jobs asynchronously using SageMaker

In [23]:
sklearn_estimator.fit({'train': trainpath, 'test': testpath}, wait=True)

Using provided s3_resource


INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2024-07-13-14-20-39-344


2024-07-13 14:20:44 Starting - Starting the training job...
2024-07-13 14:20:59 Starting - Preparing the instances for training...
2024-07-13 14:21:34 Downloading - Downloading the training image......
2024-07-13 14:22:50 Training - Training image download completed. Training in progress...2024-07-13 14:23:01,011 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-07-13 14:23:01,015 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-13 14:23:01,019 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-07-13 14:23:01,040 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-07-13 14:23:01,266 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-13 14:23:01,270 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-07-13 14:23:01,297 sagemaker-training-toolkit INFO   

### Check created model location

In [24]:
sklearn_estimator.latest_training_job.wait(logs='None')
artifact = sm_boto.describe_training_job(TrainingJobName=sklearn_estimator.latest_training_job.name)['ModelArtifacts']['S3ModelArtifacts']
print(f'Model artifact is located at: {artifact}')


2024-07-13 14:23:18 Starting - Preparing the instances for training
2024-07-13 14:23:18 Downloading - Downloading the training image
2024-07-13 14:23:18 Training - Training image download completed. Training in progress.
2024-07-13 14:23:18 Uploading - Uploading generated training model
2024-07-13 14:23:18 Completed - Training job completed
Model artifact is located at: s3://sagemaker-ap-southeast-2-767397980112/sagemaker-scikit-learn-2024-07-13-14-20-39-344/output/model.tar.gz


## Deployment

### Create SKLearn model for deployment

In [25]:
from sagemaker.sklearn.model import SKLearnModel
from time import strftime, localtime

model_name = 'decision-tree'+ strftime('%Y-%m-%d-%H-%M-%S', localtime())
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role=role,
    entry_point='script.py',
    framework_version=FRAMEWORK_VERSION
)

### Endpoint deployment

In [26]:
endpoint_name = 'decision-tree'+ strftime('%Y-%m-%d-%H-%M-%S', localtime())

In [27]:
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m5.large', endpoint_name=endpoint_name)

INFO:sagemaker:Creating model with name: decision-tree2024-07-13-21-23-38
INFO:sagemaker:Creating endpoint-config with name decision-tree2024-07-13-21-23-38
INFO:sagemaker:Creating endpoint with name decision-tree2024-07-13-21-23-38


----------------------------------------------*

UnexpectedStatusException: Error hosting endpoint decision-tree2024-07-13-21-23-38: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint..

### Model prediction testing

In [None]:
import random

for i in range(5):
    print(predictor.predict(test[features][random.randint(0, len(test) - 1)].values.tolist()))

### Delete Endpoint

Only use when needed!

In [None]:
# sm_boto.delete_endpoint(Endpoint_Name=endpoint_name)