# 04: Model Training Pipeline (XGBoost GPU)

## Step 1: Load Data
Make sure your CSV file is uploaded to the Colab environment. The next cell loads the first `.csv` file it finds in the current working directory.

In [None]:
import cudf
import os

# Auto-detect an uploaded CSV in the current working directory.
# os.listdir() returns entries in arbitrary order, so the matches are sorted to
# make the choice reproducible when more than one CSV is present. The extension
# check is case-insensitive so files named e.g. "DATA.CSV" are found as well.
csv_files = sorted(f for f in os.listdir('.') if f.lower().endswith('.csv'))
if csv_files:
    filename = csv_files[0]  # first match in sorted order
    df = cudf.read_csv(filename)
    print(f"✅ Loaded: {filename} ({len(df):,} rows)")
else:
    # Nothing to load: `df` is not assigned on this path, so a CSV has to be
    # uploaded and this cell re-run before the following cells can work.
    print("❌ No CSV found. Please upload a file first.")

## Step 2: Prepare Target Variable
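Specify which column you want to predict. By default the last column of the dataset is used; assign a different column name to `target_column` to change it.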

In [None]:
# Default: the last column of the loaded frame is the prediction target.
target_column = df.columns[-1]
print(f"Target column: '{target_column}'")

# Labels are the target column on its own.
y = df[target_column]

# Features are every other column, passed through cudf.get_dummies so that
# categorical data is encoded for GPU training.
X = cudf.get_dummies(df.drop(columns=[target_column]))
print(f"Features after encoding: {X.shape[1]}")

## Step 3: Train-Test Split (on GPU)

In [None]:
from cuml.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

## Step 4: Train XGBoost on GPU

In [None]:
import xgboost as xgb
from cuml.metrics import accuracy_score, r2_score

# Detect task type: a target with fewer than 10 distinct values is treated as
# classification, anything else as regression.
is_classification = y.nunique() < 10

# XGBoost 2.0 replaced the GPU-specific `tree_method='gpu_hist'` and
# `predictor='gpu_predictor'` settings with `device='cuda'` plus
# `tree_method='hist'`, and no longer uses `use_label_encoder`. Pick the
# spelling that matches the installed release so training runs on the GPU
# without deprecated or unused parameters.
if int(xgb.__version__.split('.')[0]) >= 2:
    gpu_params = {'tree_method': 'hist', 'device': 'cuda'}
    classifier_params = {}
else:
    gpu_params = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}
    classifier_params = {'use_label_encoder': False}

if is_classification:
    model = xgb.XGBClassifier(
        eval_metric='logloss',
        **gpu_params,
        **classifier_params,
    )
    metric_name = "Accuracy"
else:
    model = xgb.XGBRegressor(**gpu_params)
    metric_name = "R2 Score"

print(f"🚀 Training XGBoost on GPU ({'Classification' if is_classification else 'Regression'})...")
model.fit(X_train, y_train)
print("✅ Training Complete!")

## Step 5: Evaluate Performance

In [None]:
# Predict on the test split, then score with the metric that matches the task:
# accuracy for classification, R2 for regression.
preds = model.predict(X_test)

if is_classification:
    score = accuracy_score(y_test, preds)
else:
    score = r2_score(y_test, preds)

# The leading emoji was mis-encoded in the original message; it is restored here.
print(f"🏆 Model {metric_name}: {score:.4f}")