In [1]:
!pip install pyathena

Collecting pyathena
  Downloading pyathena-3.19.0-py3-none-any.whl.metadata (6.3 kB)
Downloading pyathena-3.19.0-py3-none-any.whl (113 kB)
Installing collected packages: pyathena
Successfully installed pyathena-3.19.0


In [25]:
import pandas as pd
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
import boto3
from pyathena import connect
from sklearn.model_selection import train_test_split
import numpy as np
from sagemaker.serializers import NumpySerializer
from sagemaker.deserializers import JSONDeserializer 


In [26]:
sagemaker_session = sagemaker.Session()
boto_session = boto3.Session()
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()
region = boto_session.region_name

# --- 2. Athena Configuration ---
ATHENA_DATABASE = "proyecto_tph"
ATHENA_RESULTS_S3 = f"s3://proyecto-final-cc-grupo4/athena_query_results/"

In [27]:
sql_query = """
SELECT
    m.velocidad_sag_rpm,
    m.flujo_de_agua_m3_h,
    n.ugm1,
    n.ugm2,
    n.ugm3,
    n.porc_grueso,
    n.porc_intermedio,
    n.porc_fino,
    m.rendimiento_t_h  -- Target
FROM
    "proyecto_tph"."molienda" m
JOIN
    "proyecto_tph"."minado" n
ON
    m.registro = n.registro
"""

print("Connecting to Athena...")
conn = connect(s3_staging_dir=ATHENA_RESULTS_S3, region_name=region)
df_full = pd.read_sql(sql_query, conn)
print(f"Data successfully loaded. Shape: {df_full.shape}")

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


Connecting to Athena...


  df_full = pd.read_sql(sql_query, conn)


Data successfully loaded. Shape: (112, 9)


In [28]:
target_col = "rendimiento_t_h"
X = df_full.drop(target_col, axis=1)
y = df_full[target_col]
print(X.head())

   velocidad_sag_rpm  flujo_de_agua_m3_h   ugm1   ugm2   ugm3  porc_grueso  \
0               11.3               459.2  56.76  27.74  15.50        30.93   
1               11.1               506.6  58.47  25.01  16.52        33.87   
2               11.1               481.1  69.32  15.65  15.03        35.12   
3               11.2               457.8  64.60  26.13   9.27        40.26   
4               11.5               440.9  61.94  21.67  16.39        38.14   

   porc_intermedio  porc_fino  
0            19.15      49.91  
1            14.69      51.45  
2            15.80      49.08  
3            14.63      45.11  
4            16.62      45.24  


In [29]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
train_df = pd.concat([train_y, train_X], axis=1)
test_df = pd.concat([test_y, test_X], axis=1)

In [30]:
train_df.to_csv("train.csv", header=True, index=False)
test_df.to_csv("test.csv", header=True, index=False)

s3_prefix = "byos-model-data"
train_s3_path = sagemaker_session.upload_data("train.csv", bucket=default_bucket, key_prefix=f"{s3_prefix}/train")
test_s3_path = sagemaker_session.upload_data("test.csv", bucket=default_bucket, key_prefix=f"{s3_prefix}/test")

print(f"Training data uploaded to: {train_s3_path}")
print(f"Test data uploaded to: {test_s3_path}")

Training data uploaded to: s3://sagemaker-us-east-1-269479581989/byos-model-data/train/train.csv
Test data uploaded to: s3://sagemaker-us-east-1-269479581989/byos-model-data/test/test.csv


In [31]:
rf_estimator = SKLearn(
    entry_point="train_rf.py",
    #source_dir="source_scripts",
    framework_version="1.2-1",        
    instance_type="ml.m5.large",
    role=role,
    sagemaker_session=sagemaker_session,
    hyperparameters={                 
        "n-estimators": 1000,
        "random-state": 42,
        "n_jobs": -1
    }
)


print("Starting Random Forest training job...")
rf_estimator.fit({"train": train_s3_path})
print("Training complete.")

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2025-11-04-01-23-07-611


Starting Random Forest training job...
2025-11-04 01:23:09 Starting - Starting the training job...
2025-11-04 01:23:25 Starting - Preparing the instances for training...
2025-11-04 01:23:52 Downloading - Downloading input data...
2025-11-04 01:24:22 Downloading - Downloading the training image......
2025-11-04 01:25:38 Training - Training image download completed. Training in progress.
  import pkg_resources[0m
[34m2025-11-04 01:25:32,419 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-11-04 01:25:32,424 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-11-04 01:25:32,427 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-11-04 01:25:32,445 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-11-04 01:25:32,746 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m


In [32]:
# 3. --- Deploy the Endpoint ---
print("Deploying Random Forest endpoint...")
rf_predictor = rf_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name="rf-model-endpoint"
)
print(f"Endpoint '{rf_predictor.endpoint_name}' is now live.")

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2025-11-04-01-26-53-114


Deploying Random Forest endpoint...


INFO:sagemaker:Creating endpoint-config with name rf-model-endpoint
INFO:sagemaker:Creating endpoint with name rf-model-endpoint


------!Endpoint 'rf-model-endpoint' is now live.


In [14]:
test_X_np = test_X.to_numpy()
test_y_np = test_y.to_numpy()

# Test Random Forest Predictor 
rf_predictor.serializer = NumpySerializer()      # Serializer is fine (sends npy)
rf_predictor.deserializer = JSONDeserializer() # <-- CHANGED (requests json)

rf_preds = rf_predictor.predict(test_X_np)
print(f"\n--- Random Forest Predictions (first 5) ---")
print(rf_preds[0:5])

print(f"Actual Values (first 5):")
print(test_y_np[0:5])


--- Random Forest Predictions (first 5) ---
[1208.528, 1233.63, 1235.447, 1087.182, 1197.614]

--- Actual Values (first 5) ---
[1211 1230 1257 1072 1218]


In [1]:
print(f"Deleting Random Forest endpoint: {rf_predictor.endpoint_name}")
rf_predictor.delete_model()
rf_predictor.delete_endpoint()

print("Endpoint and model deleted.")

NameError: name 'rf_predictor' is not defined