# GPU-Accelerated Data Science Agents - Environment Setup

## Step 1: Check GPU Availability

In [None]:
# Ask the NVIDIA driver which GPUs this runtime can see (model, driver/CUDA version, memory use).
!nvidia-smi

Sun Dec 21 04:00:01 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Step 2: Install RAPIDS

RAPIDS provides GPU-accelerated data science libraries (cuDF, cuML, etc.).

In [None]:
# Install condacolab to set up a conda environment
!pip install -q condacolab
import condacolab
condacolab.install()

# Install RAPIDS packages with mamba
# This will take a few minutes.
# We're installing cudf, cuml, and rmm compatible with CUDA 12 and Python 3.12 (current Colab version after condacolab).
!mamba install -q -c conda-forge -c rapidsai -c nvidia \
    cudf==24.06 cuml==24.06 rmm==24.06 python=3.12 \
    --yes

‚ú®üç∞‚ú® Everything looks OK!
Your pinning does not match what's currently installed. Please remove the pin and fix your installation
  Pin: python=3.12
  Currently installed: conda-forge/linux-64::python==3.11.11=h9e4cc4f_1_cpython


## Step 3: Install Additional Dependencies

In [None]:
# Install other required packages
# (xgboost and plotly are imported by later cells in this notebook.)
!pip install -q xgboost plotly seaborn scikit-learn imbalanced-learn py3nvml gpustat

## Step 4: Verify Installation

In [None]:
import sys
import cudf
import cuml
import xgboost as xgb
import torch

# Reaching this point means every import above succeeded; report the versions in use.
print("‚úÖ Installation Successful!\n")
print(f"Python version: {sys.version}")
print(f"cuDF version: {cudf.__version__}")
print(f"cuML version: {cuml.__version__}")
print(f"XGBoost version: {xgb.__version__}")
print(f"PyTorch version: {torch.__version__}")

# Evaluate the CUDA check once and reuse it for both the report and the branch.
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")

if cuda_ready:
    # Device 0 is the only device queried; memory is reported in decimal GB (bytes / 1e9).
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

‚úÖ Installation Successful!

Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
cuDF version: 25.10.00
cuML version: 25.10.00
XGBoost version: 3.1.2
PyTorch version: 2.9.0+cu126
CUDA available: True
GPU: Tesla T4
GPU Memory: 15.83 GB


## Step 5: Test cuDF

In [None]:
import cudf
import numpy as np

# Smoke test: build a cuDF DataFrame from random NumPy columns and summarise it.
n_rows = 1000000
df = cudf.DataFrame({
    'a': np.random.rand(n_rows),
    'b': np.random.rand(n_rows),
    'c': np.random.randint(0, 100, n_rows),  # integers in [0, 100)
})

print("Sample cuDF DataFrame:")
print(df.head())
print(f"\nShape: {df.shape}")
print(f"\nStatistics:\n{df.describe()}")

Sample cuDF DataFrame:
          a         b   c
0  0.614834  0.579210  28
1  0.161892  0.015626  56
2  0.213460  0.177200  40
3  0.741883  0.190413  15
4  0.787238  0.848502   8

Shape: (1000000, 3)

Statistics:
                    a               b               c
count  1000000.000000  1000000.000000  1000000.000000
mean         0.500646        0.499338       49.471185
std          0.288665        0.288690       28.881044
min          0.000002        0.000001        0.000000
25%          0.250617        0.249168       24.000000
50%          0.501314        0.498571       49.000000
75%          0.750870        0.749633       74.000000
max          0.999999        0.999999       99.000000


## Step 6: GPU Memory Info

In [None]:
import torch

def print_gpu_memory():
    """Print a short memory report for CUDA device 0.

    ``Allocated`` and ``Reserved`` are PyTorch's own allocator counters, so
    device memory held by other libraries in the process is not part of them.
    ``Free`` is therefore read from ``torch.cuda.mem_get_info``, which reports
    the free device memory as seen by the CUDA driver, instead of being derived
    as ``total - allocated``.

    All figures are printed in decimal gigabytes (bytes / 1e9). When CUDA is
    not available the function prints "No GPU available". Returns None.
    """
    if not torch.cuda.is_available():
        print("No GPU available")
        return

    allocated = torch.cuda.memory_allocated(0) / 1e9
    reserved = torch.cuda.memory_reserved(0) / 1e9
    total = torch.cuda.get_device_properties(0).total_memory / 1e9
    # mem_get_info returns (free_bytes, total_bytes) for the device.
    free_bytes, _total_bytes = torch.cuda.mem_get_info(0)
    free = free_bytes / 1e9

    print(f"GPU Memory:")
    print(f"  Allocated: {allocated:.2f} GB")
    print(f"  Reserved:  {reserved:.2f} GB")
    print(f"  Total:     {total:.2f} GB")
    print(f"  Free:      {free:.2f} GB")

print_gpu_memory()

GPU Memory:
  Allocated: 0.00 GB
  Reserved:  0.00 GB
  Total:     15.83 GB
  Free:      15.83 GB


In [None]:
from google.colab import files
import os

# Open Colab's upload widget and keep the name of the first file it returns.
uploaded = files.upload()
filename = list(uploaded)[0]
print(f"\n‚úÖ Uploaded: {filename}")

Saving Introvert vs Extrovert.csv to Introvert vs Extrovert.csv

‚úÖ Uploaded: Introvert vs Extrovert.csv


In [None]:
import cudf
import pandas as pd

# Read the uploaded file with cuDF. If that raises, read it with pandas and
# convert the result, so `df` ends up as a cuDF DataFrame on either path.
try:
    df = cudf.read_csv(filename)
    print(f"‚úÖ Loaded {len(df):,} rows into GPU memory")
    print(df.head())
except Exception as e:
    # Broad catch on purpose: any failure above triggers the pandas fallback.
    # An error raised by the fallback itself is not handled and will propagate.
    print(f"‚ùå Error loading with cuDF: {e}")
    print("Falling back to pandas...")
    df_pd = pd.read_csv(filename)
    df = cudf.from_pandas(df_pd)
    print("‚úÖ Converted from pandas to cuDF")

‚úÖ Loaded 18,524 rows into GPU memory
   id  Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0   0               0.0         No                      6.0            4.0   
1   1               1.0         No                      7.0            3.0   
2   2               6.0        Yes                      1.0            0.0   
3   3               3.0         No                      7.0            3.0   
4   4               1.0         No                      4.0            4.0   

  Drained_after_socializing  Friends_circle_size Post_frequency Personality  
0                        No                 15.0            5.0   Extrovert  
1                        No                 10.0            8.0   Extrovert  
2                      <NA>                  3.0            0.0   Introvert  
3                        No                 11.0            5.0   Extrovert  
4                        No                 13.0           <NA>   Extrovert  


In [None]:
import time

def run_eda_agent(df):
    """Collect basic exploratory statistics for ``df`` and time the pass.

    Args:
        df: A DataFrame whose ``isnull().sum()`` and ``describe()`` results
            expose ``to_pandas()`` (the notebook passes a cuDF DataFrame).

    Returns:
        A ``(results, duration)`` tuple. ``results`` is a dict with the keys
        ``'shape'`` (``df.shape``), ``'missing'`` (null count per column),
        ``'stats'`` (``describe()`` of the numeric columns, as nested dicts)
        and ``'outliers'`` (numeric column -> number of values outside the
        1.5 * IQR fences; columns with no such values are omitted).
        ``duration`` is the elapsed wall-clock time in seconds.
    """
    started = time.time()

    numeric_cols = df.select_dtypes(include=['number']).columns

    def _count_iqr_outliers(series):
        # Values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR count as outliers.
        q1, q3 = series.quantile(0.25), series.quantile(0.75)
        spread = q3 - q1
        too_low = series < (q1 - 1.5 * spread)
        too_high = series > (q3 + 1.5 * spread)
        return (too_low | too_high).sum()

    flagged = {}
    for name in numeric_cols:
        n_outliers = _count_iqr_outliers(df[name])
        if n_outliers > 0:
            flagged[name] = int(n_outliers)

    report = {
        'shape': df.shape,
        'missing': df.isnull().sum().to_pandas().to_dict(),
        'stats': df[numeric_cols].describe().to_pandas().to_dict(),
        'outliers': flagged,
    }
    return report, time.time() - started

# Run the EDA pass on the currently loaded frame and report how long it took.
results, duration = run_eda_agent(df)
print(f"‚úÖ EDA Completed in {duration:.2f}s")

‚úÖ EDA Completed in 0.23s


In [None]:
import plotly.express as px
import pandas as pd

# Headline dimensions recorded by the EDA pass.
n_rows, n_cols = results['shape']
print("üìä Dataset Overview:")
print(f"- Rows: {n_rows:,}")
print(f"- Columns: {n_cols}")

# Two-column table of per-column null counts; only columns with nulls are shown.
print("\n‚ö†Ô∏è Missing Values:")
missing_df = (
    pd.Series(results['missing'])
    .rename_axis('Column')
    .reset_index(name='Counts')
)
has_missing = missing_df['Counts'] > 0
print(missing_df[has_missing])

# One line per column that had at least one IQR outlier.
print("\nüö® Outliers Detected:")
for column_name, n_outliers in results['outliers'].items():
    print(f"- {column_name}: {n_outliers} outliers")

üìä Dataset Overview:
- Rows: 18,524
- Columns: 9

‚ö†Ô∏è Missing Values:
                      Column  Counts
1           Time_spent_Alone    1190
2                 Stage_fear    1893
3    Social_event_attendance    1180
4              Going_outside    1466
5  Drained_after_socializing    1149
6        Friends_circle_size    1054
7             Post_frequency    1264

üö® Outliers Detected:
- Time_spent_Alone: 1653 outliers


In [None]:
import cudf
import os

# Auto-detect uploaded CSV if not provided.
# When an earlier cell already set `filename` and that file exists, keep using
# it; otherwise scan the working directory for CSV files.
provided = globals().get('filename')
if provided and os.path.isfile(provided):
    csv_files = [provided]
else:
    # os.listdir() returns entries in arbitrary order; sort so the same file is
    # picked on every run.
    csv_files = sorted(f for f in os.listdir('.') if f.endswith('.csv'))

if csv_files:
    filename = csv_files[0]
    df = cudf.read_csv(filename)
    print(f"‚úÖ Loaded: {filename} ({len(df):,} rows)")
else:
    print("‚ùå No CSV found. Please upload a file first.")

‚úÖ Loaded: Introvert vs Extrovert.csv (18,524 rows)


In [None]:
# The label is taken to be the final column; every other column is a feature.
target_column = df.columns[-1] # Default to last column
print(f"Target column: '{target_column}'")

y = df[target_column]

# Handle categorical data for GPU
X = cudf.get_dummies(df.drop(columns=[target_column]))
print(f"Features after encoding: {len(X.columns)}")

Target column: 'Personality'
Features after encoding: 10


In [None]:
from cuml.model_selection import train_test_split

# Fill any remaining NaN values in X with 0 to resolve the 'cupy bool array with nulls' error
X = X.fillna(0)

# Convert categorical target variable 'y' to numerical labels for cuml
# (factorize returns a pair; the second element is not needed and goes to `_`).
y, _ = cudf.factorize(y)

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

Train size: 14820
Test size: 3704


In [None]:
import xgboost as xgb
from cuml.metrics import accuracy_score, r2_score
import numpy as np # Ensure numpy is imported

# Detect task type: fewer than 10 distinct target values is treated as classification.
is_classification = len(np.unique(y)) < 10

# Settings shared by both estimators.
# Use 'cuda' for GPU acceleration in XGBoost 3.x+
gpu_params = {'tree_method': 'hist', 'device': 'cuda'}

if is_classification:
    model = xgb.XGBClassifier(eval_metric='logloss', **gpu_params)
    metric_name = "Accuracy"
else:
    model = xgb.XGBRegressor(**gpu_params)
    metric_name = "R2 Score"

task_label = 'Classification' if is_classification else 'Regression'
print(f"üöÄ Training XGBoost on GPU ({task_label})...")
model.fit(X_train, y_train)
print("‚úÖ Training Complete!")

üöÄ Training XGBoost on GPU (Classification)...
‚úÖ Training Complete!


In [None]:
# Predict on the held-out rows and score with the metric matching the detected task.
preds = model.predict(X_test)

scorer = accuracy_score if is_classification else r2_score
score = scorer(y_test, preds)

print(f"üèÜ Model {metric_name}: {score:.4f}")

üèÜ Model Accuracy: 0.9673


In [None]:
from google.colab import files
import cudf
import os

# Upload a file through Colab's picker and read the first one returned as CSV with cuDF.
uploaded = files.upload()
filename = list(uploaded)[0]
df = cudf.read_csv(filename)
print(f"\n‚úÖ Loaded {len(df):,} rows")

Saving Introvert vs Extrovert.csv to Introvert vs Extrovert (1).csv

‚úÖ Loaded 18,524 rows


In [None]:
print("üîç Analyzing data...")
# Statistics
# describe() runs on the cuDF frame; the result is converted to pandas for display.
summary = df.describe().to_pandas()
print("\nüìä Numerical Summary:")
display(summary)

# Missing Values
# `missing` holds one null count per column; the printed figure is their sum.
missing = df.isnull().sum()
print(f"\n‚ö†Ô∏è Total Missing Values: {missing.sum()}")

üîç Analyzing data...

üìä Numerical Summary:


Unnamed: 0,id,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency
count,18524.0,17334.0,17344.0,17058.0,17470.0,17260.0
mean,9261.5,3.137764,5.265106,4.044319,7.996737,4.982097
std,5347.562529,3.003786,2.753359,2.06258,4.223484,2.879139
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,4630.75,1.0,3.0,3.0,5.0,3.0
50%,9261.5,2.0,5.0,4.0,8.0,5.0
75%,13892.25,4.0,8.0,6.0,12.0,7.0
max,18523.0,11.0,10.0,7.0,15.0,10.0



‚ö†Ô∏è Total Missing Values: 9196


In [None]:
import xgboost as xgb
from cuml.model_selection import train_test_split
from cuml.metrics import accuracy_score, r2_score
import numpy as np

# Use the last column as the target and encode the remaining columns as features.
target = df.columns[-1]
X = cudf.get_dummies(df.drop(columns=[target]))
y = df[target]

# Nulls in the feature matrix are replaced with 0 before splitting.
X = X.fillna(0)

# Replace the target values with integer codes (the second return value is unused).
y, _ = cudf.factorize(y)

# Fixed random_state so the split, and therefore the reported score, is the
# same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fewer than 10 distinct target values is treated as a classification task.
is_clf = len(np.unique(y)) < 10
print(f"ü§ñ Training {'Classification' if is_clf else 'Regression'} model on GPU...")

if is_clf:
    model = xgb.XGBClassifier(
        tree_method='hist',
        device='cuda',
        eval_metric='logloss'
    )
else:
    model = xgb.XGBRegressor(
        tree_method='hist',
        device='cuda'
    )

model.fit(X_train, y_train)
preds = model.predict(X_test)

# Accuracy for classification, R2 for regression.
score = accuracy_score(y_test, preds) if is_clf else r2_score(y_test, preds)
print(f"\n‚ÄÅ Model Score: {score:.4f}")

ü§ñ Training Classification model on GPU...

‚ÄÅ Model Score: 0.9676


In [None]:
# Save the model in XGBoost's native format (JSON)
model_filename_json = "xgboost_model.json"
model.save_model(model_filename_json)
print(f"‚úÖ Model saved to: {model_filename_json}")

# You can load it back later with:
# loaded_model_json = xgb.XGBClassifier() # Or XGBRegressor, depending on your task
# loaded_model_json.load_model(model_filename_json)

import pickle

# Save the model using pickle
# This writes a second copy of the same `model` object to a separate file.
model_filename_pkl = "xgboost_model.pkl"
with open(model_filename_pkl, 'wb') as f:
    pickle.dump(model, f)
print(f"‚úÖ Model saved to: {model_filename_pkl}")

# You can load it back later with:
# (only unpickle files you trust: pickle.load can execute arbitrary code)
# with open(model_filename_pkl, 'rb') as f:
#     loaded_model_pkl = pickle.load(f)


‚úÖ Model saved to: xgboost_model.json
‚úÖ Model saved to: xgboost_model.pkl
