In [3]:
import boto3
import sagemaker
from sagemaker import get_execution_role
import pandas as pd

# Location of the processed dataset in S3 and the local copy's filename
bucket_name = 'pawtracker-processed-data'  # Replace with your bucket name
file_key = 'processed_dog_health_vitals.csv'  # Replace with your file key
local_file_path = 'processed_dog_health_vitals.csv'

# SageMaker session plus the IAM execution role used by later cells
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Fetch the processed dataset from S3 into the working directory
s3 = boto3.client('s3')
s3.download_file(Bucket=bucket_name, Key=file_key, Filename=local_file_path)

print("Dataset downloaded successfully!")



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Dataset downloaded successfully!


In [4]:
# Load the dataset into a Pandas DataFrame
df = pd.read_csv(local_file_path)

# Display basic information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
df.info()

# Display the first few rows of the dataset
print(df.head())

# Count missing values per column
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 968 entries, 0 to 967
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   _id                 968 non-null    object 
 1   ecg_path            968 non-null    object 
 2   duration            968 non-null    float64
 3   pet_id              968 non-null    int64  
 4   breeds              968 non-null    object 
 5   weight              968 non-null    float64
 6   age                 968 non-null    float64
 7   segments_br         968 non-null    object 
 8   segments_hr         968 non-null    object 
 9   ecg_pulses          968 non-null    object 
 10  bad_ecg             968 non-null    object 
 11  first_br_value      968 non-null    float64
 12  first_hr_value      968 non-null    float64
 13  ecg_mean            968 non-null    float64
 14  ecg_max             968 non-null    float64
 15  ecg_min             968 non-null    float64
 16  total_ba

In [5]:
from sklearn.model_selection import train_test_split

# Target (y) is the 'first_hr_value' column; features (X) are every other column.
# Replace 'first_hr_value' with your target column if it differs.
y = df['first_hr_value']
X = df.drop(columns=['first_hr_value'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (774, 16)
Test set size: (194, 16)


In [6]:
# Save training and test sets locally
train_file_path = 'train.csv'
test_file_path = 'test.csv'


def _write_xgboost_csv(features, target, path):
    """Write one data split in the CSV layout SageMaker's built-in XGBoost reads.

    For CSV input the built-in algorithm takes the FIRST column as the label
    and expects no header row. Non-numeric feature columns are left out of the
    file because the values are parsed as numbers.

    Args:
        features: DataFrame of candidate feature columns for the split.
        target: Series holding the label for the same rows.
        path: Local filename to write the CSV to.
    """
    numeric_features = features.select_dtypes(include='number')
    # Label first, then features; no index column and no header line.
    pd.concat([target, numeric_features], axis=1).to_csv(path, index=False, header=False)


_write_xgboost_csv(X_train, y_train, train_file_path)
_write_xgboost_csv(X_test, y_test, test_file_path)

# Upload to S3
train_s3_path = f"s3://{bucket_name}/train/train.csv"
test_s3_path = f"s3://{bucket_name}/test/test.csv"

s3.upload_file(train_file_path, bucket_name, 'train/train.csv')
s3.upload_file(test_file_path, bucket_name, 'test/test.csv')

print("Training and test data uploaded to S3.")

Training and test data uploaded to S3.


In [7]:
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator

# Look up the XGBoost 1.7-1 container image URI for the session's region
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sagemaker_session.boto_region_name,
    version="1.7-1",
)

# Training job configuration: one ml.m5.xlarge instance, artifacts written
# under the bucket's "output" prefix
xgb_estimator = Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=sagemaker_session,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    output_path=f"s3://{bucket_name}/output",
)

# Hyperparameters for the regression run, collected in one place
xgb_hyperparameters = {
    "objective": "reg:squarederror",  # Regression task
    "num_round": 100,
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "verbosity": 1,
}
xgb_estimator.set_hyperparameters(**xgb_hyperparameters)

In [8]:
import boto3

# S3 client and the bucket whose region is being looked up
s3 = boto3.client('s3')
bucket_name = 'pawtracker-processed-data'

try:
    # Ask S3 where the bucket lives
    response = s3.get_bucket_location(Bucket=bucket_name)
    constraint = response['LocationConstraint']

    # A missing constraint (None) is reported as the default region, us-east-1
    bucket_region = 'us-east-1' if constraint is None else constraint

    print(f"The bucket '{bucket_name}' is located in the region: {bucket_region}")
except Exception as e:
    # Best-effort diagnostic: report the failure instead of stopping the notebook
    print(f"Error retrieving bucket location: {e}")

The bucket 'pawtracker-processed-data' is located in the region: us-east-1


In [9]:
# Describe the two CSV channels the training job reads from S3
train_input = TrainingInput(s3_data=train_s3_path, content_type="csv")
test_input = TrainingInput(s3_data=test_s3_path, content_type="csv")

# Launch training; the held-out split is supplied as the "validation" channel
xgb_estimator.fit(
    inputs={
        "train": train_input,
        "validation": test_input,
    }
)

2025-02-01 02:02:22 Starting - Starting the training job...
..25-02-01 02:02:37 Starting - Preparing the instances for training.
..25-02-01 02:03:24 Downloading - Downloading input data.
....[34m[2025-02-01 02:04:40.531 ip-10-0-204-218.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-02-01 02:04:40.555 ip-10-0-204-218.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-02-01:02:04:40:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-02-01:02:04:40:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2025-02-01:02:04:40:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-02-01:02:04:40:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2025-02-01:02:04:40:INFO] Determined 0 GPU(s) available on the instance.[0m
[34m[2025-02-01:02:04:40:INFO] Determined delimiter of CSV input is ','[0m


In [10]:
from sagemaker.serializers import CSVSerializer

# Deploy the model to a single-instance real-time endpoint.
# A generic Estimator returns a Predictor whose default serializer sends
# requests as application/octet-stream; the XGBoost container expects CSV
# payloads to arrive as text/csv, so the CSV serializer is set explicitly.
predictor = xgb_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    serializer=CSVSerializer()
)

print("Model deployed successfully!")

------!Model deployed successfully!


In [16]:
# NOTE(review): every statement in this cell is commented out, so running it
# does nothing; the output recorded beneath it cannot come from the cell as
# written. That output lists 'first_hr_value' among the X_test columns and the
# serialized row carries nine values — presumably the target was part of the
# payload in that run. Confirm the feature set matches what the model was
# trained on before re-enabling this smoke test.

# import numpy as np
# import pandas as pd

# # Step 1: Inspect the columns in X_test
# print("Columns in X_test:", X_test.columns)

# # Step 2: Drop irrelevant columns
# irrelevant_columns = ['_id', 'ecg_path']
# X_test = X_test.drop(columns=[col for col in irrelevant_columns if col in X_test.columns])

# # Step 3: Handle existing columns
# if 'first_br_value' in X_test.columns:
#     X_test['first_br_value'] = pd.to_numeric(X_test['first_br_value'], errors='coerce')
#     X_test['first_br_value'] = X_test['first_br_value'].fillna(X_test['first_br_value'].median())

# if 'first_hr_value' in X_test.columns:
#     X_test['first_hr_value'] = pd.to_numeric(X_test['first_hr_value'], errors='coerce')
#     X_test['first_hr_value'] = X_test['first_hr_value'].fillna(X_test['first_hr_value'].median())

# # Step 4: Ensure all data is numeric
# X_test = X_test.apply(pd.to_numeric, errors='coerce')
# X_test = X_test.fillna(X_test.median())

# # Step 5: Serialize the input data
# input_data = X_test.iloc[0].values.astype('float32')  # Extract the first row as a NumPy array
# csv_data = ','.join(map(str, input_data))  # Convert to a CSV string

# print(f"Serialized Input Data: {csv_data}")

# # Step 6: Make the prediction
# response = predictor.predict(csv_data)
# prediction = response.decode('utf-8')  # Decode the response from bytes to string
# print(f"Predicted Heart Rate: {prediction}")

Columns in X_test: Index(['duration', 'weight', 'age', 'first_br_value', 'ecg_mean', 'ecg_max',
       'ecg_min', 'total_bad_duration', 'first_hr_value'],
      dtype='object')
Serialized Input Data: 0.25600338,0.5151515,0.11764706,14.54,150.19289,299.34,0.67,26.0,65.76
