In [5]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_svmlight_file
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

# Define the SageMaker session and role
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Load the libsvm formatted data
X_train, y_train = load_svmlight_file('train.libsvm')
X_test, y_test = load_svmlight_file('test.libsvm')

# Upload the libsvm data to S3
prefix = 'xgboost-classification'
train_data_location = sagemaker_session.upload_data(path='train.libsvm', key_prefix=prefix + '/input/train')
test_data_location = sagemaker_session.upload_data(path='test.libsvm', key_prefix=prefix + '/input/test')

# Get the XGBoost container
container = get_image_uri(sagemaker_session.boto_region_name, 'xgboost')

# Define the estimator
estimator = sagemaker.estimator.Estimator(container,
                                          role,
                                          train_instance_count=1,
                                          train_instance_type='ml.m4.xlarge',
                                          output_path='s3://{}/{}/output'.format(sagemaker_session.default_bucket(), prefix))

# Set hyperparameters
estimator.set_hyperparameters(max_depth=5,
                              eta=0.2,
                              gamma=4,
                              min_child_weight=6,
                              subsample=0.8,
                              silent=0,
                              objective='multi:softmax',
                              num_class=4,
                              num_round=1000)

# Train the model
estimator.fit({'train': train_data_location})

# Deploy the model
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge')

# Make predictions
predictions = predictor.predict(data=X_test)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Creating training-job with name: xgboost-2024-04-21-06-17-35-612


2024-04-21 06:17:35 Starting - Starting the training job...
2024-04-21 06:18:02 Starting - Preparing the instances for training......
2024-04-21 06:19:01 Downloading - Downloading input data...
2024-04-21 06:19:25 Downloading - Downloading the training image......
2024-04-21 06:20:21 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2024-04-21:06:20:32:INFO] Running standalone xgboost training.[0m
[34m[2024-04-21:06:20:32:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2024-04-21:06:20:32:INFO] File size need to be processed in the node: 0.15mb. Available memory size in the node: 8485.48mb[0m
[34m[06:20:32] S3DistributionType set as FullyReplicated[0m
[34m[06:20:32] 1600x21 matrix with 27062 entries loaded from /opt/ml/input/data/train[0m
[34m[06:20:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 12 pruned nodes, max_depth=4[0m
[34m[06:20:32] src/tree/updater_prune.cc:74: tree pr


2024-04-21 06:20:58 Uploading - Uploading generated training model
2024-04-21 06:20:58 Completed - Training job completed


INFO:sagemaker:Creating model with name: xgboost-2024-04-21-06-21-18-042


Training seconds: 118
Billable seconds: 118


INFO:sagemaker:Creating endpoint-config with name xgboost-2024-04-21-06-21-18-042
INFO:sagemaker:Creating endpoint with name xgboost-2024-04-21-06-21-18-042


------!

ParamValidationError: Parameter validation failed:
Invalid type for parameter Body, value:   (0, 0)	1646.0
  (0, 2)	2.5
  (0, 4)	3.0
  (0, 5)	1.0
  (0, 6)	25.0
  (0, 7)	0.6
  (0, 8)	200.0
  (0, 9)	2.0
  (0, 10)	5.0
  (0, 11)	211.0
  (0, 12)	1608.0
  (0, 13)	686.0
  (0, 14)	8.0
  (0, 15)	6.0
  (0, 16)	11.0
  (0, 17)	1.0
  (0, 18)	1.0
  (1, 0)	1182.0
  (1, 2)	0.5
  (1, 4)	7.0
  (1, 5)	1.0
  (1, 6)	8.0
  (1, 7)	0.5
  (1, 8)	138.0
  (1, 9)	8.0
  :	:
  (398, 11)	526.0
  (398, 12)	1529.0
  (398, 13)	2039.0
  (398, 14)	5.0
  (398, 15)	1.0
  (398, 16)	12.0
  (398, 17)	1.0
  (398, 18)	1.0
  (398, 19)	1.0
  (399, 0)	1185.0
  (399, 2)	1.9
  (399, 6)	31.0
  (399, 7)	0.4
  (399, 8)	152.0
  (399, 9)	8.0
  (399, 10)	7.0
  (399, 11)	837.0
  (399, 12)	1642.0
  (399, 13)	2447.0
  (399, 14)	16.0
  (399, 15)	2.0
  (399, 16)	3.0
  (399, 17)	1.0
  (399, 18)	1.0
  (399, 19)	1.0, type: <class 'scipy.sparse._csr.csr_matrix'>, valid types: <class 'bytes'>, <class 'bytearray'>, file-like object

In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import dump_svmlight_file
from sklearn.model_selection import train_test_split

# Read the CSV data
df = pd.read_csv('train.csv')

# Separate features and target
X = df.drop(columns=['price_range'])
y = df['price_range']

# Split the data into train and test sets (optional)
# You can skip this step if you want to use the entire dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the DataFrame into the required libsvm format
dump_svmlight_file(X_train, y_train, 'train.libsvm', zero_based=False)
dump_svmlight_file(X_test, y_test, 'test.libsvm', zero_based=False)

# Now, you have 'train.libsvm' and 'test.libsvm' files in libsvm format.


In [1]:
import pandas as pd
# Load the training data into a DataFrame for exploration
train_df = pd.DataFrame(X_train.toarray())

# Display the first few rows of the DataFrame
print(train_df.head())

# Display summary statistics of the DataFrame
print(train_df.describe())

# Check for missing values
print(train_df.isnull().sum())

# Visualize the distribution of target classes
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
sns.countplot(y_train)
plt.title('Distribution of Target Classes')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

# Correlation heatmap
correlation_matrix = train_df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


NameError: name 'X_train' is not defined