In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Verify files exist
import os
print("Checking files...")
project_dir = '/content/drive/MyDrive/AI_project'
files = os.listdir(project_dir)
print(f"Files found: {files}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Checking files...
Files found: ['configs', 'small_preprocessed_data.tar.gz', 'optuna_results_small_dataset', 'src_code.tar.gz', 'optuna_results_small_dataset_2', 'preprocessed_data.tar.gz', 'optuna_results_large_dataset', 'data', 'src', 'checkpoints', 'results']


In [2]:
# Extracting preprocessed data to short term memory
# !tar -xzf /content/drive/MyDrive/AI_project/preprocessed_data.tar.gz -C /content/
# !mkdir -p data/processed
# !mv *.npz dataset_metadata.json data/processed/
# !mv *.npz label_info.json data/processed/

# Extracting preprocessed data to drive memory
!tar -xzf /content/drive/MyDrive/AI_project/preprocessed_data.tar.gz -C /content/drive/MyDrive/AI_project/
!mkdir -p /content/drive/MyDrive/AI_project/data/processed
!mv /content/drive/MyDrive/AI_project/*.npz /content/drive/MyDrive/AI_project/dataset_metadata.json /content/drive/MyDrive/AI_project/data/processed/

# Extract source code to Drive src folder
!tar -xzf /content/drive/MyDrive/AI_project/src_code.tar.gz -C /content/drive/MyDrive/AI_project/

# Verify extractiondrwxr-xr-x 1 roodrwxr-xr-x 1 roodrwxr-xr-x 1 roodrwxr-xr-x 1 root root 4096 Nov 12 01:50 ..t root 4096 Nov 12 01:50 ..t root 4096 Nov 12 01:50 ..t root 4096 Nov 12 01:50 ..
!ls -la /content/drive/MyDrive/AI_project/data/processed/
!ls -la /content/drive/MyDrive/AI_project/src/
!ls -la /content/drive/MyDrive/AI_project/configs/


total 101453
-rw------- 1 root root      363 Nov  9 23:11 dataset_metadata.json
-rw------- 1 root root 15583614 Nov  9 23:11 test_data.npz
-rw------- 1 root root 72719814 Nov  9 23:11 train_data.npz
-rw------- 1 root root 15583614 Nov  9 23:11 val_data.npz
total 148
-rw------- 1 root root 15528 Nov  4 23:05 evaluate.py
-rw------- 1 root root 16373 Nov  9 23:11 load_manifest_data.py
-rw------- 1 root root 10117 Nov  4 23:05 main.py
-rw------- 1 root root 10883 Nov  4 23:05 model.py
-rw------- 1 root root 13607 Nov 10 01:05 optuna_tuning_CPU.py
-rw------- 1 root root 15414 Nov 10 03:28 optuna_tuning_large_GPU.py
-rw------- 1 root root 15878 Nov 10 01:52 optuna_tuning_small_GPU.py
-rw------- 1 root root 14032 Nov 11 04:35 preprocessing.py
drwx------ 2 root root  4096 Nov 11 04:37 __pycache__
-rw------- 1 root root   734 Nov 11 04:18 test_preprocessing.py
-rw------- 1 root root 15921 Nov  4 23:05 train.py
-rw------- 1 root root 14960 Nov  4 23:05 utils.py
total 3
-rw------- 1 root root 245

In [3]:
# Install required packages
!pip install optuna pyyaml tqdm scikit-learn matplotlib astropy

# Check PyTorch and GPU
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0
PyTorch version: 2.8.0+cu126
CUDA available: True
GPU: Tesla T4
GPU Memory: 15.83 GB


In [5]:
# Load and verify data
import numpy as np

train_data = np.load('/content/drive/MyDrive/AI_project/data/processed/train_data.npz')
val_data = np.load('/content/drive/MyDrive/AI_project/data/processed/val_data.npz')
test_data = np.load('/content/drive/MyDrive/AI_project/data/processed/test_data.npz')

print("Dataset loaded successfully!")
print(f"Train: {train_data['flux'].shape}")
print(f"Val: {val_data['flux'].shape}")
print(f"Test: {test_data['flux'].shape}")

# Check class distribution
unique, counts = np.unique(train_data['labels'], return_counts=True)
print(f"\nTrain class distribution:")
for cls, count in zip(unique, counts):
    print(f"  Class {cls}: {count} ({100*count/len(train_data['labels']):.1f}%)")


Dataset loaded successfully!
Train: (2100, 4320)
Val: (450, 4320)
Test: (450, 4320)

Train class distribution:
  Class 0: 1045 (49.8%)
  Class 1: 1055 (50.2%)


In [7]:
# Run Optuna tuning
# import sys
# sys.path.append('/content/src')

# Import and run for small data set on GPU
# !python src/optuna_tuning_small_GPU.py

# Import and run large data set on GPU
# !python src/optuna_tuning_large_GPU.py

In [8]:
# After training completes, download results
# from google.colab import files

# Download optimized config
# files.download('optuna_results/optimized_config.yaml')

# Download best model
# files.download('optuna_results/optimization_results.json')



# Download trained model (if using main.py)
# files.download('checkpoints/best_model.pth')


In [9]:
# import json
# import os

# results_dir = '/content/drive/MyDrive/AI_project/optuna_results'
# results_file = os.path.join(results_dir, 'optimization_results.json')

# try:
#     with open(results_file, 'r') as f:
#         previous_optimization_results = json.load(f)

#     print(f"Optuna Optimization Results from {results_dir}:")
#     print(json.dumps(previous_optimization_results, indent=4))

# except FileNotFoundError:
#     print(f"Error: {results_file} not found. Please ensure the directory and file exist in your Google Drive.")
# except json.JSONDecodeError:
#     print(f"Error: Could not decode JSON from {results_file}")

In [10]:
# After training completes, download results
# from google.colab import files

# # Download optimized config
# files.download('optuna_results/optimized_config.yaml')

# # Download best model
# files.download('optuna_results/optimization_results.json')




# Download trained model (if using main.py)
# files.download('checkpoints/best_model.pth')


In [11]:
# Copy results back to Drive for permanent storage
# !cp -r optuna_results /content/drive/MyDrive/AI_project/
# !cp -r checkpoints /content/drive/MyDrive/AI_project/ 2>/dev/null || true

# print("✓ Results saved to Google Drive!")


In [13]:
# ============================================================================
# Google Colab Training Setup
# ============================================================================

import os
import sys
import torch

# Set paths - adjust these to match your Drive structure
PROJECT_ROOT = "/content/drive/MyDrive/AI_project"
CODE_DIR = os.path.join(PROJECT_ROOT, "src")
CONFIG_PATH = os.path.join(PROJECT_ROOT, "optuna_results_large_dataset/optimized_config.yaml")

# Add code directory to Python path
sys.path.insert(0, CODE_DIR)

# Change to project directory
os.chdir(PROJECT_ROOT)
print(f"Working directory: {os.getcwd()}")

# ============================================================================
# Colab-Optimized Configuration Overrides
# ============================================================================

from utils import load_config, save_config
import yaml

# Load your config
config = load_config(CONFIG_PATH)

# Apply Colab-specific optimizations
print("\nApplying Colab optimizations...")

# DataLoader settings for Colab
config['training']['num_workers'] = 2  # Colab works best with 1-2 workers
config['training']['use_amp'] = True
config['paths']['checkpoint_dir'] = os.path.join(PROJECT_ROOT, "checkpoints")
config['augmentation']['augmentation_factor'] = 2


# Adjust batch size if needed (based on GPU memory)
# config['training']['batch_size'] = 32  # Uncomment to override

# Set device explicitly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Monitor memory usage during augmentation
import psutil
print(f"\nRAM Usage before training: {psutil.virtual_memory().percent}%")
print(f"Available RAM: {psutil.virtual_memory().available / 1e9:.2f} GB")

# ============================================================================
# Run Training
# ============================================================================

# Import your training script components
from preprocessing import LightCurvePreprocessor, DataAugmenter
from model import create_model
from train import Trainer, LightCurveDataset, collate_fn
from utils import set_seed, count_parameters

# Import the main training functions
# Since we can't use argparse in notebook, we'll call functions directly
import importlib.util

# Load the main training module
spec = importlib.util.spec_from_file_location("main_train",
                                               os.path.join(CODE_DIR, "main.py"))
main_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(main_module)

# Call prepare_data and main training loop
print("\n" + "=" * 60)
print("Starting Training Pipeline")
print("=" * 60)

# Prepare data
train_loader, val_loader, test_loader = main_module.prepare_data(config)

# Create model and move to GPU
print("\nCreating model...")
model = create_model(config['model'])
model = model.to(device)  # Move model to GPU

# Print model info
n_params = count_parameters(model)
print(f"Trainable parameters: {n_params['trainable']:,}")
print(f"Total parameters: {n_params['total']:,}")
print(f"Non-trainable parameters: {n_params['non_trainable']:,}")

print(f"Model size (trainable, fp32): ~{n_params['trainable'] * 4 / 1e6:.2f} MB")

# Check GPU memory after model loading
if torch.cuda.is_available():
    print(f"GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
    print(f"GPU Memory cached: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")

# Create trainer
print("\nInitializing trainer...")
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    config=config['training']
)

# Optional: Resume from checkpoint
# CHECKPOINT_PATH = "/content/drive/MyDrive/YourProjectFolder/checkpoints/checkpoint_epoch_10.pth"
# trainer.load_checkpoint(CHECKPOINT_PATH)

# Train the model
print("\n" + "=" * 60)
print("Starting training...")
print("=" * 60)

if torch.cuda.is_available():
    torch.cuda.empty_cache()

try:
    trainer.train(num_epochs=config['training']['num_epochs'])
    print("\nTraining completed successfully!")

except RuntimeError as e:
    if "out of memory" in str(e):
        print("\n" + "!" * 60)
        print("GPU OUT OF MEMORY ERROR")
        print("!" * 60)
        print("Try reducing batch_size or model size")
        print(f"Current batch_size: {config['training']['batch_size']}")
        print(f"Current d_model: {config['model']['d_model']}")

        # Clear GPU cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    raise e

# Check final memory usage
print(f"\nFinal RAM Usage: {psutil.virtual_memory().percent}%")
if torch.cuda.is_available():
    print(f"Final GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

# ============================================================================
# Evaluate on Test Set
# ============================================================================

print("\n" + "=" * 60)
print("Evaluating on test set...")
print("=" * 60)

from evaluate import ModelEvaluator
from pathlib import Path
import matplotlib.pyplot as plt

# Load best model
best_model_path = Path(config['paths']['checkpoint_dir']) / 'best_model.pth'
checkpoint = torch.load(best_model_path, weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)

evaluator = ModelEvaluator(model)
metrics = evaluator.evaluate(
    test_loader,
    class_names=config.get('classes', None),
    save_dir=config['paths']['results_dir']
)

print("\n" + "=" * 60)
print("FINAL RESULTS")
print("=" * 60)
print(f"Best validation loss: {checkpoint['best_val_loss']:.4f}")
print(f"Test accuracy: {metrics['accuracy']:.4f}")
print(f"Test precision: {metrics['precision']:.4f}")
print(f"Test recall: {metrics['recall']:.4f}")
print(f"Test F1: {metrics['f1_score']:.4f}")
auc_score = metrics.get('auc', metrics.get('roc_auc', 0.0)) * 100
print(f"Test AUC: {auc_score:.4f}")
print(f"\nResults saved to: {config['paths']['results_dir']}")

# ============================================================================
# Visualize Performance Metrics
# ============================================================================

# Convert metrics to percentages
metric_names = ['AUC', 'F1 Score', 'Recall', 'Precision', 'Accuracy']
metric_values = [
    auc_score,
    metrics['f1_score'],
    metrics['recall'] * 100,
    metrics['precision'] * 100,
    metrics['accuracy'] * 100
]

# Sort metrics for better visualization
# sorted_pairs = sorted(zip(metric_values, metric_names), reverse=True)
# sorted_values, sorted_names = zip(*sorted_pairs)

# Create horizontal bar chart
plt.figure(figsize=(10, 5))
bars = plt.barh(metric_names, metric_values, color='blue', edgecolor='navy', alpha=0.5)
plt.xlabel('Score (%)', fontsize=12)
plt.title('Model Performance Metrics', fontsize=14, fontweight='bold')
plt.xlim(0, 100)

# Annotate bars with score values
for bar, score in zip(bars, metric_values):
    plt.text(score + 1, bar.get_y() + bar.get_height()/2,
             f'{score:.1f}%', va='center', fontsize=10)

# plt.gca().invert_yaxis()  # Highest score at the top
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()

# Save the figure
metrics_plot_path = Path(config['paths']['results_dir']) / 'performance_metrics.png'
plt.savefig(metrics_plot_path, dpi=300, bbox_inches='tight')
print(f"Performance metrics plot saved to: {metrics_plot_path}")

# Display in notebook
plt.show()

# Optional: Download results to local machine
from google.colab import files
files.download(os.path.join(config['paths']['results_dir'], 'confusion_matrix.png'))
#files.download(os.path.join(config['paths']['checkpoint_dir'], 'best_model.pth'))


Working directory: /content/drive/MyDrive/AI_project

Applying Colab optimizations...
Using device: cuda

RAM Usage before training: 13.8%
Available RAM: 11.73 GB

Starting Training Pipeline
Preparing datasets...
Loading preprocessed data...
  Train: 2100 samples
  Val:   450 samples
  Test:  450 samples

Original class distribution:
  Train: class_0=1045 (49.8%) class_1=1055 (50.2%)
  Val: class_0=241 (53.6%) class_1=209 (46.4%)
  Test: class_0=214 (47.6%) class_1=236 (52.4%)
Applying data augmentation...

Applying additional augmentation (multiplier: 2)...
  Augmentation complete!                              
  Original size: 2100
  Augmented size: 4200
  Increase: 2.0x

Augmented class distribution:
  Train: class_0=2090 (49.8%) class_1=2110 (50.2%)
Training samples: 4200
Validation samples: 450
Test samples: 450

Creating model...
Trainable parameters: 108,706
Total parameters: 108,706
Non-trainable parameters: 0
Model size (trainable, fp32): ~0.43 MB
GPU Memory allocated: 0.00 GB




Train Loss: 0.6703 | Train Acc: 58.17%
Val Loss:   0.6338 | Val Acc:   60.22%
LR:         9.53e-05
✓ Validation improved by inf
✓ Saved best model (val_loss: 0.6338)

Epoch 2/100
----------------------------------------------------------------------





Train Loss: 0.6320 | Train Acc: 63.50%
Val Loss:   0.6339 | Val Acc:   60.44%
LR:         1.91e-04
✗ No improvement for 1 epochs

Epoch 3/100
----------------------------------------------------------------------





Train Loss: 0.6247 | Train Acc: 63.67%
Val Loss:   0.6237 | Val Acc:   61.33%
LR:         2.86e-04
✓ Validation improved by 0.0101
✓ Saved best model (val_loss: 0.6237)

Epoch 4/100
----------------------------------------------------------------------





Train Loss: 0.6167 | Train Acc: 64.95%
Val Loss:   0.6334 | Val Acc:   59.78%
LR:         3.81e-04
✗ No improvement for 1 epochs

Epoch 5/100
----------------------------------------------------------------------





Train Loss: 0.6217 | Train Acc: 64.67%
Val Loss:   0.6382 | Val Acc:   60.44%
LR:         4.77e-04
✗ No improvement for 2 epochs

Epoch 6/100
----------------------------------------------------------------------





Train Loss: 0.6198 | Train Acc: 63.64%
Val Loss:   0.6259 | Val Acc:   59.56%
LR:         4.76e-04
✗ No improvement for 3 epochs

Epoch 7/100
----------------------------------------------------------------------





Train Loss: 0.6143 | Train Acc: 64.48%
Val Loss:   0.6208 | Val Acc:   61.33%
LR:         4.76e-04
✓ Validation improved by 0.0029
✓ Saved best model (val_loss: 0.6208)

Epoch 8/100
----------------------------------------------------------------------





Train Loss: 0.6086 | Train Acc: 64.71%
Val Loss:   0.6325 | Val Acc:   57.78%
LR:         4.75e-04
✗ No improvement for 1 epochs

Epoch 9/100
----------------------------------------------------------------------





Train Loss: 0.6078 | Train Acc: 65.29%
Val Loss:   0.6465 | Val Acc:   60.00%
LR:         4.75e-04
✗ No improvement for 2 epochs

Epoch 10/100
----------------------------------------------------------------------





Train Loss: 0.6108 | Train Acc: 64.12%
Val Loss:   0.6315 | Val Acc:   61.33%
LR:         4.73e-04
✗ No improvement for 3 epochs

Epoch 11/100
----------------------------------------------------------------------





Train Loss: 0.6004 | Train Acc: 65.07%
Val Loss:   0.6447 | Val Acc:   60.22%
LR:         4.72e-04
✗ No improvement for 4 epochs

Epoch 12/100
----------------------------------------------------------------------





Train Loss: 0.5994 | Train Acc: 65.60%
Val Loss:   0.6444 | Val Acc:   60.44%
LR:         4.70e-04
✗ No improvement for 5 epochs

Early stopping triggered after 12 epochs

Training completed in 7.18 minutes
Best validation loss: 0.6208


Training completed successfully!

Final RAM Usage: 20.2%
Final GPU Memory allocated: 0.02 GB

Evaluating on test set...

STARTING EVALUATION


Predicting: 100%|██████████| 29/29 [00:01<00:00, 21.00it/s]



Evaluation Metrics:
accuracy            : 0.6333
precision           : 0.6437
recall              : 0.6737
f1_score            : 0.6584
roc_auc             : 0.6766

Classification Report:
              precision    recall  f1-score   support

 Non-Transit     0.6207    0.5888    0.6043       214
     Transit     0.6437    0.6737    0.6584       236

    accuracy                         0.6333       450
   macro avg     0.6322    0.6313    0.6314       450
weighted avg     0.6328    0.6333    0.6327       450


Saving visualizations to results
Saved confusion matrix to results/confusion_matrix.png
Saved ROC curve to results/roc_curve.png

EVALUATION COMPLETE

FINAL RESULTS
Best validation loss: 0.6208
Test accuracy: 0.6333
Test precision: 0.6437
Test recall: 0.6737
Test F1: 0.6584
Test AUC: 67.6560

Results saved to: results


NameError: name 'f1_score' is not defined

In [None]:
# Optional: Download results to local machine
# from google.colab import files
# files.download(os.path.join(config['paths']['results_dir'], 'confusion_matrix.png'))