In [1]:
## Wine Type Prediction
''' [The Wine Type Prediction dataset](https://archive.ics.uci.edu/ml/datasets/Wine) consists of data related to the chemical properties of various wines and classifies each wine into on1 of 3 possible classes. The columns in the dataset are as follows:

|col name|description|
|:--|:--|
|target| This is the target variable to be predicted. There are three possible classes, class 1, 2 and 3 |
|alcohol| continuous | 
|malic_acid| continuous | 
|ash| continuous | 
|alcalinity_of_ash| continuous |    
|magnesium| continuous | 
|total_phenols| continuous | 
|flavanoids| continuous | 
|nonflavanoid_phenols| continuous | 
|proanthocyanins| continuous | 
|color_intensity| continuous | 
|hue| continuous | 
|od280/od315_of_diluted_wines| continuous | 
|proline| continuous | 


- The goal of this project is to build and tune a model to predict the `target` column using AWS Sagemaker and deploy the model as a `Serverless Inference Endpoint`
'''
## Tips: 
''' You can use the below code to get the S3 bucket to write any artifacts to
    ```
    import sagemaker
    session = sagemaker.Session()
    bucket = session.default_bucket()
    ```
- What ML task is this? Classification? Regression? Clustering?
- What are the data types of the columns? What pre-processing should you apply?
- How to determine the best hyperparameters for the model?
- How to test if the model is deployed successfully?'''

' You can use the below code to get the S3 bucket to write any artifacts to\n    ```\n    import sagemaker\n    session = sagemaker.Session()\n    bucket = session.default_bucket()\n    ```\n- What ML task is this? Classification? Regression? Clustering?\n- What are the data types of the columns? What pre-processing should you apply?\n- How to determine the best hyperparameters for the model?\n- How to test if the model is deployed successfully?'

In [2]:
import pandas as pd

cols = [
    "target",
    "alcohol", 
    "malic_acid", 
    "ash", 
    "alcalinity_of_ash",    
    "magnesium", 
    "total_phenols", 
    "flavanoids", 
    "nonflavanoid_phenols", 
    "proanthocyanins", 
    "color_intensity", 
    "hue", 
    "od280/od315_of_diluted_wines", 
    "proline"
]

In [3]:
wine_df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", names=cols)

print(wine_df.shape)
wine_df.head()

(178, 14)


Unnamed: 0,target,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target variable (y)
X = wine_df.drop(columns=['target'])
y = wine_df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.22, random_state=42, stratify=y)

# Concatenate features and target variable for training and testing sets
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)


In [5]:
train_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
111,12.52,2.43,2.17,21.0,88,2.55,2.27,0.26,1.22,2.00,0.90,2.78,325,2
30,13.73,1.50,2.70,22.5,101,3.00,3.25,0.29,2.38,5.70,1.19,2.71,1285,1
36,13.28,1.64,2.84,15.5,110,2.60,2.68,0.34,1.36,4.60,1.09,2.78,880,1
12,13.75,1.73,2.41,16.0,89,2.60,2.76,0.29,1.81,5.60,1.15,2.90,1320,1
158,14.34,1.68,2.70,25.0,98,2.80,1.31,0.53,2.70,13.00,0.57,1.96,660,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,12.08,1.83,2.32,18.5,81,1.60,1.50,0.52,1.64,2.40,1.08,2.27,480,2
172,14.16,2.51,2.48,20.0,91,1.68,0.70,0.44,1.24,9.70,0.62,1.71,660,3
34,13.51,1.80,2.65,19.0,110,2.35,2.53,0.29,1.54,4.20,1.10,2.87,1095,1
161,13.69,3.26,2.54,20.0,107,1.83,0.56,0.50,0.80,5.88,0.96,1.82,680,3


In [6]:
import sagemaker
session = sagemaker.Session()
bucket = session.default_bucket()
print(bucket)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker-us-east-1-635439539142


In [7]:
test_df.to_json('../data/wine_test_df_json.json',orient='records',lines=True,index=False)
train_df.to_csv('../data/train_data_wine_pred.csv')
test_df.to_csv('../data/test_data_wine_pred.csv')

In [8]:
train_path = session.upload_data(path='../data/train_data_wine_pred.csv', bucket=bucket, key_prefix = 'akash/wine-prediction')
test_path = session.upload_data(path='../data/test_data_wine_pred.csv', bucket=bucket, key_prefix = 'akash/wine-prediction')
print(f'train path: {train_path}')
print(f'test path: {test_path}')

train path: s3://sagemaker-us-east-1-635439539142/akash/wine-prediction/train_data_wine_pred.csv
test path: s3://sagemaker-us-east-1-635439539142/akash/wine-prediction/test_data_wine_pred.csv


In [9]:
train_df = pd.read_csv('s3://sagemaker-us-east-1-635439539142/akash/wine-prediction/train_data_wine_pred.csv')
train_df.drop('Unnamed: 0',axis=1,inplace=True)
train_df['target'] = train_df['target'] - 1
train_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,12.52,2.43,2.17,21.0,88,2.55,2.27,0.26,1.22,2.00,0.90,2.78,325,1
1,13.73,1.50,2.70,22.5,101,3.00,3.25,0.29,2.38,5.70,1.19,2.71,1285,0
2,13.28,1.64,2.84,15.5,110,2.60,2.68,0.34,1.36,4.60,1.09,2.78,880,0
3,13.75,1.73,2.41,16.0,89,2.60,2.76,0.29,1.81,5.60,1.15,2.90,1320,0
4,14.34,1.68,2.70,25.0,98,2.80,1.31,0.53,2.70,13.00,0.57,1.96,660,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,12.08,1.83,2.32,18.5,81,1.60,1.50,0.52,1.64,2.40,1.08,2.27,480,1
134,14.16,2.51,2.48,20.0,91,1.68,0.70,0.44,1.24,9.70,0.62,1.71,660,2
135,13.51,1.80,2.65,19.0,110,2.35,2.53,0.29,1.54,4.20,1.10,2.87,1095,0
136,13.69,3.26,2.54,20.0,107,1.83,0.56,0.50,0.80,5.88,0.96,1.82,680,2


In [10]:
test_df = pd.read_csv('s3://sagemaker-us-east-1-635439539142/akash/wine-prediction/test_data_wine_pred.csv',nrows=30)
test_df.drop('Unnamed: 0',axis=1,inplace=True)
test_df['target'] = test_df['target'] - 1
test_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.1,2.02,2.4,18.8,103,2.75,2.92,0.32,2.38,6.2,1.07,2.75,1060,0
1,12.04,4.3,2.38,22.0,80,2.1,1.75,0.42,1.35,2.6,0.79,2.57,580,1
2,14.38,1.87,2.38,12.0,102,3.3,3.64,0.29,2.96,7.5,1.2,3.0,1547,0
3,13.07,1.5,2.1,15.5,98,2.4,2.64,0.28,1.37,3.7,1.18,2.69,1020,0
4,11.76,2.68,2.92,20.0,103,1.75,2.03,0.6,1.05,3.8,1.23,2.5,607,1
5,13.87,1.9,2.8,19.4,107,2.95,2.97,0.37,1.76,4.5,1.25,3.4,915,0
6,14.06,1.63,2.28,16.0,126,3.0,3.17,0.24,2.1,5.65,1.09,3.71,780,0
7,11.62,1.99,2.28,18.0,98,3.02,2.26,0.17,1.35,3.25,1.16,2.96,345,1
8,12.42,2.55,2.27,22.0,90,1.68,1.84,0.66,1.42,2.7,0.86,3.3,315,1
9,12.51,1.24,2.25,17.5,85,2.0,0.58,0.6,1.25,5.45,0.75,1.51,650,2


In [11]:
from ydata_profiling import ProfileReport
profile = ProfileReport(train_df)
profile.to_file('profile_report_wine.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
train_df.columns

Index(['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
       'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
       'proanthocyanins', 'color_intensity', 'hue',
       'od280/od315_of_diluted_wines', 'proline', 'target'],
      dtype='object')

In [13]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Assuming 'data' contains your features
# Extract the feature columns
X = train_df.drop(columns=['target'])

# Add a constant column to the features (required for VIF calculation)
X_with_const = pd.concat([pd.Series(1, index=X.index, name='const'), X], axis=1)

# Create a DataFrame to store the VIF results
vif_data = pd.DataFrame()
vif_data["feature"] = X_with_const.columns

# Calculate VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X_with_const.values, i) for i in range(X_with_const.shape[1])]

# Print the VIF results
print(vif_data)


                         feature         VIF
0                          const  656.142518
1                        alcohol    2.441252
2                     malic_acid    1.607770
3                            ash    2.114522
4              alcalinity_of_ash    2.213630
5                      magnesium    1.519026
6                  total_phenols    4.347040
7                     flavanoids    7.697181
8           nonflavanoid_phenols    1.820228
9                proanthocyanins    2.091542
10               color_intensity    3.093786
11                           hue    2.750493
12  od280/od315_of_diluted_wines    3.879269
13                       proline    2.739025


In [14]:
#split the data
X_train = train_df.drop(['target'], axis=1)  # drop target and unimportant features
y_train = train_df['target']  # Target variable
X_test = test_df.drop(['target'], axis=1)  # Features
y_test = test_df['target']  # Target variable
print("Train data shape:", X_train.shape, y_train.shape)
print("Test data shape:", X_test.shape, y_test.shape)

Train data shape: (138, 13) (138,)
Test data shape: (30, 13) (30,)


In [15]:
# Separating the numerical and categorical columns
import numpy as np
def data_type(dataset):
    """
    Function to identify the numerical and categorical data columns
    :param dataset: Dataframe
    :return: list of numerical and categorical columns
    """
    numerical = []
    categorical = []
    for i in dataset.columns:
        if dataset[i].dtype == 'int64' or dataset[i].dtype == 'float64':
            numerical.append(i)
        else:
            categorical.append(i)
    return numerical, categorical


numerical, categorical = data_type(X_train)

# Identifying the binary columns and ignoring them from scaling
def binary_columns(df):
    """
    Generates a list of binary columns in a dataframe.
    """
    binary_cols = []
    for col in df.select_dtypes(include=['int', 'float']).columns:
        unique_values = df[col].unique()
        if np.in1d(unique_values, [0, 1]).all():
            binary_cols.append(col)
    return binary_cols

binary_cols = binary_columns(X_train)

# Remove the binary columns from the numerical columns
categorical = [i for i in categorical if i != 'target']

In [16]:
from sklearn.preprocessing import StandardScaler
from category_encoders import CatBoostEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Create a pipeline for preprocessing
ct = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),  # StandardScaler for numeric features
        ('cat', CatBoostEncoder(), categorical)  # CatBoostEncoder for categorical features
    ],
    remainder='passthrough'  # Passthrough any columns not specified
)


In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Define classifiers
rfc = RandomForestClassifier()
logistic = LogisticRegression(penalty='l2', max_iter=1000)
svm_rbf = SVC(kernel='rbf')
xgb = XGBClassifier()
nn = MLPClassifier()

# Create pipelines for classifiers with the updated ColumnTransformer
rfc_pipeline = Pipeline([
    ("Data Transformations", ct),
    ("Random Forest", rfc)
])

logistic_pipeline = Pipeline([
    ("Data Transformations", ct),
    ("Logistic Regression", logistic)
])

svm_rbf_pipeline = Pipeline([
    ("Data Transformations", ct),
    ("SVM with RBF kernel", svm_rbf)
])

xgb_pipeline = Pipeline([
    ("Data Transformations", ct),
    ("XGBoost", xgb)
])

nn_pipeline = Pipeline([
    ("Data Transformations", ct),
    ("Neural Network", nn)
])

# Fit and use the pipelines as before
rfc_pipeline.fit(X_train, y_train)
logistic_pipeline.fit(X_train, y_train)
svm_rbf_pipeline.fit(X_train, y_train)
xgb_pipeline.fit(X_train, y_train)
nn_pipeline.fit(X_train, y_train)

# After fitting, you can use the pipelines for prediction and evaluation
rfc_predictions = rfc_pipeline.predict(X_test)
logistic_predictions = logistic_pipeline.predict(X_test)
svm_rbf_predictions = svm_rbf_pipeline.predict(X_test)
xgb_predictions = xgb_pipeline.predict(X_test)
nn_predictions = nn_pipeline.predict(X_test)



In [18]:
from sklearn import set_config
set_config(display ="diagram")

In [19]:
xgb_train_accuracy= xgb_pipeline.score(X_train, y_train)
print(f"XG boost Training Accuracy:{xgb_train_accuracy:.4f}")
xgb_test_accuracy= xgb_pipeline.score(X_test, y_test)
print(f"XG boost Test Accuracy:{xgb_test_accuracy:.4f}")

XG boost Training Accuracy:1.0000
XG boost Test Accuracy:1.0000


In [20]:
rfc_train_accuracy= rfc_pipeline.score(X_train, y_train)
print(f"Random Forest Training Accuracy:{rfc_train_accuracy:.4f}")
rfc_test_accuracy= rfc_pipeline.score(X_test, y_test)
print(f"Random Forest Test Accuracy:{rfc_test_accuracy:.4f}")

Random Forest Training Accuracy:1.0000
Random Forest Test Accuracy:1.0000


In [21]:
logistic_train_accuracy= logistic_pipeline.score(X_train, y_train)
print(f"Logistic Ridge Training Accuracy:{logistic_train_accuracy:.4f}")
logistic_test_accuracy= logistic_pipeline.score(X_test, y_test)
print(f"Logistic Ridge Test Accuracy:{logistic_test_accuracy:.4f}")

Logistic Ridge Training Accuracy:1.0000
Logistic Ridge Test Accuracy:1.0000


In [22]:
svm_train_accuracy= svm_rbf_pipeline.score(X_train, y_train)
print(f"SVM Training Accuracy:{svm_train_accuracy:.4f}")
svm_test_accuracy= svm_rbf_pipeline.score(X_test, y_test)
print(f"SVM Test Accuracy:{svm_test_accuracy:.4f}")

SVM Training Accuracy:0.9928
SVM Test Accuracy:1.0000


In [23]:
nn_train_accuracy= nn_pipeline.score(X_train, y_train)
print(f"Neural Network Training Accuracy:{nn_train_accuracy:.4f}")
nn_test_accuracy= nn_pipeline.score(X_test, y_test)
print(f"Neural Network Test Accuracy:{nn_test_accuracy:.4f}")

Neural Network Training Accuracy:1.0000
Neural Network Test Accuracy:0.9667


In [24]:
%%writefile train.py
import argparse
import numpy as np
import os
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from category_encoders import CatBoostEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import ast
from sklearn.neural_network import MLPClassifier

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--rfc_n_estimators", type=int, default=100)
    parser.add_argument("--rfc_min_samples_split", type=float, default=0.05)
    parser.add_argument("--rfc_criterion", type=str, default="gini")
    parser.add_argument("--logistic_max_iter", type=int, default=500)
    parser.add_argument("--svm_kernel", type=str, default="rbf")
    parser.add_argument("--xgb_max_depth", type=int, default=3)
    parser.add_argument("--nn_hidden_layer_sizes", type=str, default="(100,)")
    args, _ = parser.parse_known_args()
    
    # Read the data
    train_df = pd.read_csv('s3://sagemaker-us-east-1-635439539142/akash/wine-prediction/train_data_wine_pred.csv')  # Path to your train data file
    test_df = pd.read_csv('s3://sagemaker-us-east-1-635439539142/akash/wine-prediction/test_data_wine_pred.csv',nrows=50)   # Path to your test data file
    
    # Convert target variable to numerical labels starting from 0
    label_encoder = LabelEncoder()
    train_df['target'] = label_encoder.fit_transform(train_df['target'])
    test_df['target'] = label_encoder.transform(test_df['target'])
    
    X_train = train_df.drop("target", axis=1)
    y_train = train_df["target"]
    X_test = test_df.drop("target", axis=1)
    y_test = test_df["target"] 
    
    def data_type(dataset):
        """
        Function to identify the numerical and categorical data columns
        :param dataset: Dataframe
        :return: list of numerical and categorical columns
        """
        numerical = []
        categorical = []
        for i in dataset.columns:
            if dataset[i].dtype == 'int64' or dataset[i].dtype == 'float64':
                numerical.append(i)
            else:
                categorical.append(i)
        return numerical, categorical

    numerical, categorical = data_type(X_train)
    
    # Identifying the binary columns and ignoring them from scaling
    def binary_columns(df):
        """
        Generates a list of binary columns in a dataframe.
        """
        binary_cols = []
        for col in df.select_dtypes(include=['int', 'float']).columns:
            unique_values = df[col].unique()
            if np.in1d(unique_values, [0, 1]).all():
                binary_cols.append(col)
        return binary_cols

    binary_cols = binary_columns(X_train)

    # Remove the binary columns from the numerical columns
    numerical = [i for i in numerical if i not in binary_cols]
    
    # Define your encoder
    ct = ColumnTransformer([
        ("CatBoostEncoding", CatBoostEncoder(), categorical),
        ("Scaling", StandardScaler(), numerical)
    ])
    
    # Define classifiers
    rfc = RandomForestClassifier(n_estimators=args.rfc_n_estimators, 
                                  min_samples_split=args.rfc_min_samples_split, 
                                  criterion=args.rfc_criterion)
    
    logistic = LogisticRegression(penalty='l2', max_iter=args.logistic_max_iter)
    
    svm_rbf = SVC(kernel=args.svm_kernel)
    
    xgb_classifier = xgb.XGBClassifier(max_depth=args.xgb_max_depth)

    # Convert hidden_layer_sizes from string to tuple of integers
    hidden_layer_sizes = ast.literal_eval(args.nn_hidden_layer_sizes)
    
    nn_classifier = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes)
    
    # Create pipelines for classifiers with your encoder
    rfc_pipeline = Pipeline([
        ("Data Transformations", ct),
        ("Random Forest", rfc)
    ])

    logistic_pipeline = Pipeline([
        ("Data Transformations", ct),
        ("Logistic Regression", logistic)
    ])

    svm_rbf_pipeline = Pipeline([
        ("Data Transformations", ct),
        ("SVM with RBF kernel", svm_rbf)
    ])
    
    xgb_pipeline = Pipeline([
        ("Data Transformations", ct),
        ("XGBoost Classifier", xgb_classifier)
    ])
    
    nn_pipeline = Pipeline([
        ("Data Transformations", ct),
        ("Neural Network Classifier", nn_classifier)
    ])
    
    # Fit and evaluate each pipeline
    for pipeline, name in [(rfc_pipeline, 'Random Forest'), 
                           (logistic_pipeline, 'Logistic Regression'), 
                           (svm_rbf_pipeline, 'SVM with RBF kernel'),
                           (xgb_pipeline, 'XGBoost Classifier'),
                           (nn_pipeline, 'Neural Network Classifier')]:
        pipeline.fit(X_train, y_train)
        train_accuracy = pipeline.score(X_train, y_train)
        test_accuracy = pipeline.score(X_test, y_test)
        print(f"{name} Training Accuracy: {test_accuracy:.4f}")
        
        # Save the model
        model_save_path = os.path.join(args.model_dir, f"{name.lower().replace(' ', '_')}_model.joblib")
        joblib.dump(pipeline, model_save_path)
        print(f"Model Saved At: {model_save_path}")

if __name__ == "__main__":
    main()

Overwriting train.py


In [29]:
%%writefile requirements.txt
pandas
xgboost
scikit-learn
fsspec
category_encoders
s3fs
botocore==1.27.18

Overwriting requirements.txt


In [30]:
!python train.py --model_dir ../models/ --rfc_n_estimators 100 --rfc_min_samples_split 0.05 --rfc_criterion gini --logistic_max_iter 1000 --svm_kernel rbf --xgb_max_depth 3 --nn_hidden_layer_sizes !python train.py --model_dir ../models/ --rfc_n_estimators 100 --rfc_min_samples_split 0.05 --rfc_criterion gini --logistic_max_iter 1000 --svm_kernel rbf --xgb_max_depth 3 --nn_hidden_layer_sizes "(25,11,75,5,3,100)"

Random Forest Training Accuracy: 1.0000
Model Saved At: ../models/random_forest_model.joblib
Logistic Regression Training Accuracy: 1.0000
Model Saved At: ../models/logistic_regression_model.joblib
SVM with RBF kernel Training Accuracy: 0.9750
Model Saved At: ../models/svm_with_rbf_kernel_model.joblib
XGBoost Classifier Training Accuracy: 0.9750
Model Saved At: ../models/xgboost_classifier_model.joblib
Neural Network Classifier Training Accuracy: 0.9500
Model Saved At: ../models/neural_network_classifier_model.joblib


In [31]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role

sklearn_estimator = SKLearn(
    base_job_name="pipeline-run",
    framework_version="0.23-1",
    entry_point="train.py",
    dependencies=["requirements.txt"],  # Include the requirements file if needed
    hyperparameters={
        "rfc_n_estimators": 100,
        "rfc_min_samples_split": 0.05,
        "rfc_criterion": "gini",
        "logistic_max_iter": 200,
        "svm_kernel": "rbf",
        "xgb_max_depth": 3,
        "nn_hidden_layer_sizes": "(25,11,75,5,3,100)"  # Pass as a string representing a tuple
    },
    instance_count=1,
    instance_type="ml.m5.large",
    use_spot_instances=True,
    max_wait=600,
    max_run=600,
    role=get_execution_role(),
)

sklearn_estimator.fit()


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


INFO:sagemaker:Creating training-job with name: pipeline-run-2024-03-03-14-17-27-149


Using provided s3_resource
2024-03-03 14:17:27 Starting - Starting the training job...
2024-03-03 14:17:42 Starting - Preparing the instances for training...
2024-03-03 14:18:20 Downloading - Downloading input data...
2024-03-03 14:18:41 Downloading - Downloading the training image...
2024-03-03 14:19:21 Training - Training image download completed. Training in progress..[34m2024-03-03 14:19:24,509 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-03-03 14:19:24,512 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-03-03 14:19:24,547 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-03-03 14:19:24,722 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting xgboost
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)[

In [32]:
import boto3
sm_client = boto3.client("sagemaker")
training_job_name = sklearn_estimator.latest_training_job.name
model_artifact = sm_client.describe_training_job(
    TrainingJobName = training_job_name
)["ModelArtifacts"]["S3ModelArtifacts"]

print(f"Training job name : {training_job_name}")
print(f"Model storage location : {model_artifact}")

Training job name : pipeline-run-2024-03-03-14-17-27-149
Model storage location : s3://sagemaker-us-east-1-635439539142/pipeline-run-2024-03-03-14-17-27-149/output/model.tar.gz


In [39]:
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, CategoricalParameter, ContinuousParameter
import itertools

# Define role
role = sagemaker.get_execution_role()
# Define the SKLearn estimator with the modified train.py script
sklearn_estimator = SKLearn(
    entry_point="train.py",  # Make sure this points to the modified train.py script
    source_dir=".",  # Directory containing your training script and dependencies
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
    framework_version="0.23-1"
)

# Define hyperparameter ranges
hyperparameter_ranges = {
    "rfc_n_estimators": IntegerParameter(50, 150),
    "rfc_min_samples_split": ContinuousParameter(0.01, 0.5),
    "rfc_criterion": CategoricalParameter(["gini", "entropy"]),
    "logistic_max_iter": IntegerParameter(100, 1000),
    "svm_kernel": CategoricalParameter(["linear", "poly", "rbf", "sigmoid"]),
    "xgb_max_depth": IntegerParameter(3, 10),
    "nn_hidden_layer_sizes": CategoricalParameter([(25,50,75,100),(10,20,30,40,50),(60,70,80,90,100)])
}

# Define the objective metric name and type
objective_metric_name = 'Training_Accuracy'
objective_type = 'Maximize'

# Define the metric definitions function
def generate_metric_definitions():
    return [
        {
            "Name": "Training_Accuracy",
            "Regex": "[a-zA-Z].*\\s+Training\\s+Accuracy:\\s+([0-9\\.]+)"
        },
        {
            "Name": "Test_Accuracy",
            "Regex": "[a-zA-Z].*\\s+Test\\s+Accuracy:\\s+([0-9\\.]+)"
        }
    ]

# Define the HyperparameterTuner
tuner = HyperparameterTuner(
    base_tuning_job_name='pipeline-run',
    estimator=sklearn_estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    objective_type=objective_type,
    metric_definitions=generate_metric_definitions(),
    max_jobs=10,
    max_parallel_jobs=2
)

# Launch the hyperparameter tuning job
tuner.fit()


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


INFO:sagemaker:Creating hyperparameter tuning job with name: pipeline-run-240303-1434


Using provided s3_resource
............................................................................................................!


In [69]:
tuning_job_name = tuner.latest_tuning_job.name
tuning_job_name

'pipeline-run-240303-1434'

In [42]:
best_hyperparameters = tuner.best_estimator().hyperparameters()
best_hyperparameters


2024-03-03 14:37:52 Starting - Preparing the instances for training
2024-03-03 14:37:52 Downloading - Downloading the training image
2024-03-03 14:37:52 Training - Training image download completed. Training in progress.
2024-03-03 14:37:52 Uploading - Uploading generated training model
2024-03-03 14:37:52 Completed - Resource reused by training job: pipeline-run-240303-1434-004-d9bf51bc


{'_tuning_objective_metric': '"Training_Accuracy"',
 'logistic_max_iter': '944',
 'nn_hidden_layer_sizes': '"(10, 20, 30, 40, 50)"',
 'rfc_criterion': '"entropy"',
 'rfc_min_samples_split': '0.23860113313287065',
 'rfc_n_estimators': '108',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"SKLearn"',
 'sagemaker_estimator_module': '"sagemaker.sklearn.estimator"',
 'sagemaker_job_name': '"sagemaker-scikit-learn-2024-03-03-14-34-22-586"',
 'sagemaker_program': '"train.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-635439539142/sagemaker-scikit-learn-2024-03-03-14-34-22-586/source/sourcedir.tar.gz"',
 'svm_kernel': '"poly"',
 'xgb_max_depth': '10'}

In [44]:
tuner_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)
tuning_metrics = tuner_analytics.dataframe()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [70]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set the style for the plots
sns.set(style="whitegrid")

# Plot hyperparameters versus objective metric using Seaborn
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))
axes = axes.flatten()

for i, (hyperparameter, ax) in enumerate(zip(hyperparameters.columns, axes)):
    sns.scatterplot(data=tuning_metrics, x=hyperparameter, y='FinalObjectiveValue', ax=ax, alpha=0.5)
    ax.set_xlabel(hyperparameter)
    ax.set_ylabel('Final Objective Value')
    ax.set_title(f'{hyperparameter} vs. Final Objective Value')

plt.tight_layout()
plt.show()
