In [56]:
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import time
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import boto3
import botocore
from botocore.exceptions import ClientError
from pathlib import Path

import csv
import io
import re
import s3fs

import seaborn as sns
import pickle
import gzip
import urllib
import csv

import sagemaker
from sagemaker import PCA
from sagemaker.session import Session                              
from sagemaker import get_execution_role


In [57]:
# Session-wide handles used by every later cell: the SageMaker session, the
# AWS region resolved by boto3, the S3 bucket and the IAM role for training.
sagemaker_session = sagemaker.Session()
region = boto3.Session().region_name
bucket_name = 'sagemaker-eu-west-1-688567281415'
role = 'arn:aws:iam::688567281415:role/service-role/AmazonSageMaker-ExecutionRole-20240913T093672'

# Echo the resolved configuration so a reader can confirm the target account.
for label, value in (
    ('Sagemaker session :', sagemaker_session),
    ('S3 bucket :', bucket_name),
    ('Region selected :', region),
    ('IAM role :', role),
):
    print(label, value)

Sagemaker session : <sagemaker.session.Session object at 0x0000025C5AE7A4E0>
S3 bucket : sagemaker-eu-west-1-688567281415
Region selected : eu-west-1
IAM role : arn:aws:iam::688567281415:role/service-role/AmazonSageMaker-ExecutionRole-20240913T093672


In [58]:
# Show where the kernel is running; the import below is resolved relative to it.
print(f"Current working directory: {os.getcwd()}")

# Put the parent of the working directory on sys.path so the `utils` package
# imported below can be found (presumably it lives one level up - confirm).
parent_dir = os.path.dirname(os.getcwd())
# Guard the append so re-running this cell does not stack duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# This import must stay after the sys.path change above.
from utils.bitget_futures import BitgetFutures

Current working directory: C:\Users\Barticus\PycharmProjects\CDP_BlockChain\Machine1
Current working directory: C:\Users\Barticus\PycharmProjects\CDP_BlockChain\Machine1


In [59]:
    def fetch_data(self, lookback_days=400):
        """Fetch historical candles for ``self.symbol`` from the Bitget client.

        Args:
            lookback_days: Size of the history window in days, counted back
                from the current local time. Defaults to 400, the value that
                was previously hard-coded.

        Returns:
            Whatever ``self.bitget_client.fetch_ohlcv`` returns; only ``len()``
            is applied to it here.

        Raises:
            ValueError: If ``self.bitget_client`` is missing or falsy.
            Exception: Any error raised by the client is reported on stdout
                and then re-raised unchanged.
        """
        try:
            if not self.bitget_client:
                raise ValueError("Bitget client not provided in config")

            # The client is handed the window start as a plain YYYY-MM-DD string.
            start_date = (
                pd.Timestamp.now() - pd.Timedelta(days=lookback_days)
            ).strftime('%Y-%m-%d')

            print(f"Fetching data from {start_date} for {self.symbol}")

            data = self.bitget_client.fetch_ohlcv(
                symbol=self.symbol,
                timeframe=self.timeframe,
                start_time=start_date
            )

            # Reported with print like the other messages in this method: the
            # notebook never imports `logging`, so the former logging.info call
            # raised NameError after every successful fetch.
            print(f"Fetched {len(data)} candles for {self.symbol}")
            return data

        except Exception as e:
            print(f"Error fetching data: {str(e)}")
            raise

In [60]:
    def calculate_features(self, data):
        """Build the feature table used as input for the PCA analysis.

        Args:
            data: Input table; a copy is passed to ``calculate_all_features``
                so the caller's object is not handed to the helper directly.

        Returns:
            The result of ``calculate_all_features`` with every NaN replaced
            by 0. No column selection is applied.
        """
        from utils.feature_calculator import calculate_all_features

        # Calculate all available features on a copy of the input
        features_df = calculate_all_features(data.copy())

        # Handle any NaN values - replace with 0
        features_df = features_df.fillna(0)

        # Previously returned the undefined name `selected_df` (NameError).
        return features_df

In [62]:

from sklearn.preprocessing import StandardScaler

# Read the CSV straight from S3 and discard every row that has a missing value.
path = "s3://sagemaker-eu-west-1-688567281415/25_03_11/PCA_14_07.csv"
ohlcv = pd.read_csv(path, header=0, delimiter=",", low_memory=False).dropna()
print(ohlcv.shape)

# Cast every column to float for the scaling step later in the notebook.
ohlcv = ohlcv.astype(float)

(398, 9)


In [48]:
# Replace the CSV header with positional placeholder names:
# feature1 ... feature8 for the first eight columns, 'volume' for the ninth.
ohlcv.columns = [f"feature{position}" for position in range(1, 9)] + ['volume']



In [69]:
# Number of principal components requested from the PCA algorithm.
n_components = 5

# Base name for the training job; the epoch second keeps it unique per run.
job_name = f"pca-test-{int(time.time())}"

# Estimator settings: a single on-demand ml.m5.large instance, artifacts
# written under the pca_output prefix of the notebook's bucket.
pca_settings = {
    "role": role,
    "instance_count": 1,
    "instance_type": 'ml.m5.large',
    "num_components": n_components,
    "sagemaker_session": sagemaker_session,
    "output_path": f"s3://{bucket_name}/pca_output",
    "base_job_name": job_name,
}
pca_estimator = PCA(**pca_settings)

In [82]:
# Scale the data (crucial for PCA)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(ohlcv.values)

# Keep the scaled values in a DataFrame that carries the original column names
scaled_df = pd.DataFrame(scaled_data, columns=ohlcv.columns)

train_data = scaled_df.values.astype('float32')

# Run-specific names. These were previously only present in kernel state, so
# the cell failed on a fresh kernel; the layout mirrors the recorded output
# (pca-input/<timestamp>/pca_data.csv).
timestamp = strftime("%Y%m%d-%H%M%S", gmtime())
s3_prefix = f"pca-input/{timestamp}"
local_file = "pca_data.csv"
s3_key = f"{s3_prefix}/{local_file}"

# Write the training matrix as a header-less, index-less CSV.
pd.DataFrame(train_data).to_csv(local_file, header=False, index=False)

# Verify the CSV file contents before uploading
with open(local_file, 'r') as f:
    first_line = f.readline().strip()
actual_dims = len(first_line.split(','))
print(f"CSV file has {actual_dims} columns per row")
if actual_dims != train_data.shape[1]:
    raise ValueError(
        f"CSV has {actual_dims} columns but train_data has {train_data.shape[1]}"
    )

# Upload to S3
s3_client = boto3.client('s3')
s3_client.upload_file(local_file, bucket_name, s3_key)
print(f"Data uploaded to s3://{bucket_name}/{s3_key}")

# Create a SageMaker training job directly with boto3
sm_client = boto3.client('sagemaker')

# Get the PCA algorithm container. image_uris.retrieve replaces the deprecated
# sagemaker.amazon.amazon_estimator.get_image_uri; version "1" is that
# function's default.
container = sagemaker.image_uris.retrieve(framework="pca", region=region, version="1")
print(f"Using container: {container}")

# Create a unique job name
job_name = f"pca-direct2-{timestamp}"

# The feature dimension must match the actual number of columns in the CSV
feature_dim = actual_dims  # Use the verified dimension from the CSV
print(f"Setting feature_dim to {feature_dim} based on actual CSV format")

# Create the training job with proper configuration
response = sm_client.create_training_job(
    TrainingJobName=job_name,
    AlgorithmSpecification={
        'TrainingImage': container,
        'TrainingInputMode': 'File'
    },
    RoleArn=role,
    InputDataConfig=[
        {
            'ChannelName': 'train',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    'S3Uri': f"s3://{bucket_name}/{s3_prefix}/",
                    'S3DataDistributionType': 'FullyReplicated'
                }
            },
            # label_size=0 declares that the CSV has no label column. With a
            # bare 'text/csv' the built-in algorithms treat the first column as
            # the label, which would leave one feature fewer than feature_dim.
            'ContentType': 'text/csv;label_size=0',
            'CompressionType': 'None'
        }
    ],
    OutputDataConfig={
        'S3OutputPath': f"s3://{bucket_name}/pca-output/{timestamp}/"
    },
    ResourceConfig={
        'InstanceType': 'ml.m5.large',
        'InstanceCount': 1,
        'VolumeSizeInGB': 30
    },
    HyperParameters={
        'num_components': str(n_components),
        'feature_dim': str(feature_dim),  # Use the verified dimension
        # Never request a mini-batch larger than the number of rows available
        'mini_batch_size': str(min(100, train_data.shape[0]))
    },
    StoppingCondition={
        'MaxRuntimeInSeconds': 300
    }
)

print(f"Training job created: {job_name}")

Data uploaded to s3://sagemaker-eu-west-1-688567281415/pca-input/20250312-145348/pca_data.csv
CSV file has 9 columns per row


Using container: 438346466558.dkr.ecr.eu-west-1.amazonaws.com/pca:1
Setting feature_dim to 9 based on actual CSV format
Training job created: pca-direct2-20250312-145348


In [79]:
# Build the training channel from the scaled float32 matrix. `record_set` was
# previously never defined in this notebook, so this cell raised NameError on
# a fresh kernel.
record_set = pca_estimator.record_set(train_data)

print("Starting training job...")
# Blocks until the job finishes and streams the training logs into the cell.
pca_estimator.fit(record_set, logs=True)

Starting training job...



KeyboardInterrupt

