# VajraCode: Getting Started

This notebook demonstrates the basic workflow for longitudinal mental health modeling using VajraCode.

In [None]:
import sys
# Put the parent directory first on the module search path so the `src.*`
# imports below can be resolved.
# NOTE(review): '..' is relative to the kernel's working directory — this
# presumably expects the notebook to be launched from a subdirectory of the
# repository root; confirm.
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.data import DataLoader, split_temporal
from src.models import LinearModel, LogisticModel
from src.evaluation import ModelEvaluator, cross_validate_temporal
from src.visualization import (
    plot_longitudinal,
    plot_feature_distributions,
    plot_correlation_matrix,
    plot_predictions_vs_actual
)

%matplotlib inline

## 1. Load Data

First, let's create some synthetic longitudinal data for demonstration.

In [None]:
# Build a synthetic longitudinal dataset with a fixed seed so that every
# run of the notebook produces the same data.
synthetic_params = {
    'n_subjects': 100,
    'n_timepoints': 5,
    'n_features': 10,
    'random_seed': 42,
}
loader = DataLoader()
dataset = loader.create_synthetic_data(**synthetic_params)

print("Dataset created successfully!")
print(f"Shape: {dataset.data.shape}")

# Last expression of the cell: rendered as a table preview of the first rows.
dataset.data.head()

## 2. Explore the Data

In [None]:
# Print every entry of the dataset summary on its own "name: value" line.
summary = dataset.summary()
for entry in summary.items():
    print("{}: {}".format(*entry))

In [None]:
# Plot the continuous outcome over time for 10 subjects.
fig = plot_longitudinal(dataset, n_subjects=10, feature_col='outcome_continuous')
plt.show()

In [None]:
# Select the feature columns, leaving out both outcome columns, and plot
# the distributions of the first six of them.
outcome_cols = ['outcome_binary', 'outcome_continuous']
feature_cols = dataset.get_feature_columns(exclude=outcome_cols)

first_six_features = feature_cols[:6]
fig = plot_feature_distributions(dataset, feature_cols=first_six_features)
plt.show()

In [None]:
# Correlation matrix over all columns currently held in `feature_cols`
# (the outcome columns were excluded when that list was built).
fig = plot_correlation_matrix(
    dataset,
    feature_cols=feature_cols,
)
plt.show()

## 3. Train a Regression Model

Let's train a linear regression model to predict continuous outcomes.

In [None]:
# Regression setup: the outcome to predict and the predictor columns
# (both outcome columns are excluded from the predictors).
target_col = 'outcome_continuous'
feature_cols = dataset.get_feature_columns(
    exclude=['outcome_binary', 'outcome_continuous']
)

# Temporal split: train on the earliest timepoints, evaluate on the rest.
# The cut-off is a named constant so the split can be changed in one place.
N_TRAIN_TIMEPOINTS = 3
all_timepoints = sorted(dataset.data[dataset.time_col].unique())
train_timepoints = all_timepoints[:N_TRAIN_TIMEPOINTS]
test_timepoints = all_timepoints[N_TRAIN_TIMEPOINTS:]

# Fail early with a clear message when the data has too few distinct
# timepoints to leave both a training set and a held-out test set.
assert train_timepoints and test_timepoints, (
    f"Need more than {N_TRAIN_TIMEPOINTS} distinct timepoints for a "
    f"train/test split, found {len(all_timepoints)}"
)

print(f"Train timepoints: {train_timepoints}")
print(f"Test timepoints: {test_timepoints}")

In [None]:
# Fit and evaluate a linear regression on the temporal split: the train
# timepoints are used for fitting, the test timepoints for evaluation.
model = LinearModel(name="LinearRegression")
regression_cv_args = {
    'model': model,
    'dataset': dataset,
    'feature_cols': feature_cols,
    'target_col': target_col,
    'train_timepoints': train_timepoints,
    'test_timepoints': test_timepoints,
    'task': 'regression',
}
results = cross_validate_temporal(**regression_cv_args)

# Report each metric with four decimal places.
print("\nResults:")
regression_metrics = results['metrics']
for metric_name, metric_value in regression_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Rebuild the held-out rows (those whose timepoint is in the test split)
# and compare the regression model's predictions against the true values.
is_test_row = dataset.data[dataset.time_col].isin(test_timepoints)
test_data = dataset.data[is_test_row]

X_test = test_data[feature_cols].values
y_test = test_data[target_col].values

# NOTE(review): this presumes `cross_validate_temporal` fitted `model` in
# place in the previous cell — confirm against its implementation.
y_pred = model.predict(X_test)

fig = plot_predictions_vs_actual(y_test, y_pred, task='regression')
plt.show()

## 4. Train a Classification Model

Now let's train a logistic regression model for binary classification.

In [None]:
# Train classification model
# The binary target gets its own name instead of rebinding `target_col`:
# the regression cells read `target_col`, and re-running them after this
# cell would otherwise silently pick up the binary outcome.
target_col_cls = 'outcome_binary'
model_cls = LogisticModel(name="LogisticRegression")

# Same temporal split and feature columns as the regression run.
results_cls = cross_validate_temporal(
    model=model_cls,
    dataset=dataset,
    feature_cols=feature_cols,
    target_col=target_col_cls,
    train_timepoints=train_timepoints,
    test_timepoints=test_timepoints,
    task='classification'
)

print("\nClassification Results:")
for metric, value in results_cls['metrics'].items():
    # The confusion matrix entry is skipped here — presumably it is not a
    # scalar and so cannot take the `.4f` format; it is plotted separately.
    if metric != 'confusion_matrix':
        print(f"  {metric}: {value:.4f}")

In [None]:
# Compare predicted and true binary labels on the held-out rows.
# `test_data` and `X_test` are the test-split objects built in the
# regression prediction cell, so that cell must have been run first.
y_test_cls = test_data['outcome_binary'].values
y_pred_cls = model_cls.predict(X_test)

fig = plot_predictions_vs_actual(
    y_test_cls,
    y_pred_cls,
    task='classification',
)
plt.show()

## 5. Model Interpretation

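Examine the most important features by ranking the coefficients of the logistic regression model from the previous section by absolute value.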

In [None]:
# Collect the classification model's coefficients into a two-column table
# (feature name, coefficient value).
coefs = model_cls.get_coefficients()
coef_df = pd.DataFrame(list(coefs.items()), columns=['Feature', 'Coefficient'])

# Rank by magnitude so strong negative effects sort alongside strong
# positive ones; the sign is kept in the 'Coefficient' column.
coef_df = coef_df.sort_values('Coefficient', key=abs, ascending=False)

print("Top 10 Most Important Features:")
# End the cell with the frame itself (as the data-loading cell does with
# `dataset.data.head()`) so it is rendered as a table, not printed as text.
coef_df.head(10)

In [None]:
# Plot feature importance: horizontal bars for the ten coefficients with
# the largest absolute value (`coef_df` is already sorted that way).
fig, ax = plt.subplots(figsize=(10, 6))
coef_df_top = coef_df.head(10)
ax.barh(coef_df_top['Feature'], coef_df_top['Coefficient'])

# barh places the first row at the bottom of the axis; flip the y-axis so
# the top-ranked feature is drawn at the top of the chart.
ax.invert_yaxis()

ax.set_xlabel('Coefficient Value')
ax.set_title('Top 10 Feature Coefficients')
plt.tight_layout()
plt.show()

## Next Steps

- Load your own data using `DataLoader.load_csv()` or `DataLoader.load_excel()`
- Experiment with different feature sets and model parameters
- Try different temporal splits for cross-validation
- Extend the codebase with your own models and evaluation metrics