# Hotspot Prediction Model Training

This notebook demonstrates the training process for our code hotspot prediction models. We'll use historical repository data to train models that can identify high-risk areas in the codebase.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import joblib

# Custom imports
from app.services.git_analysis import GitAnalysisService
from app.services.embedding import EmbeddingService

## Data Collection

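First, we'll walk the Git repository's commit history and record one row for every file touched by each commit, capturing its line-change statistics and a complexity score. These per-commit rows are the raw training data.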

In [None]:
def collect_training_data(repo_path):
    """Build one training row per (commit, file) pair from a repository's history.

    Args:
        repo_path: Location of the Git repository; passed unchanged to
            ``GitAnalysisService.initialize_repo``.

    Returns:
        pandas.DataFrame with the columns ``file_path``, ``lines_changed``,
        ``insertions``, ``deletions``, ``commit_frequency``, ``complexity``
        and ``is_hotspot``. The columns are present even when the history
        yields no rows, so a later group-by on ``file_path`` does not fail
        with a KeyError on an empty repository.
    """
    git_service = GitAnalysisService()
    git_service.initialize_repo(repo_path)

    columns = [
        'file_path',
        'lines_changed',
        'insertions',
        'deletions',
        'commit_frequency',
        'complexity',
        'is_hotspot',
    ]

    data = []
    for commit in git_service.repo.iter_commits():
        # Read the per-file stats once per commit instead of once per file.
        # NOTE(review): `repo` looks like a GitPython Repo, where `commit.stats`
        # is recomputed on every access -- confirm against GitAnalysisService.
        file_stats = commit.stats.files
        for file, stats in file_stats.items():
            data.append({
                'file_path': file,
                'lines_changed': stats['lines'],
                'insertions': stats['insertions'],
                'deletions': stats['deletions'],
                'commit_frequency': 1,  # Will be aggregated later
                'complexity': git_service._calculate_complexity(file),
                'is_hotspot': False  # Will be determined based on metrics
            })

    return pd.DataFrame(data, columns=columns)

## Feature Engineering

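We'll aggregate the per-commit rows by file and derive per-file features: the average lines changed per commit (`change_rate`) and the ratio of insertions to deletions. A file is labelled a hotspot when both its commit frequency and its mean complexity are above the 75th percentile.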

In [None]:
def engineer_features(df, *, hotspot_quantile=0.75):
    """Aggregate per-commit rows into per-file features and a hotspot label.

    Args:
        df: DataFrame with one row per (commit, file) and the columns
            ``file_path``, ``lines_changed``, ``insertions``, ``deletions``,
            ``commit_frequency`` and ``complexity``.
        hotspot_quantile: Quantile (between 0 and 1) used as the threshold for
            both commit frequency and complexity when labelling hotspots.
            Defaults to 0.75.

    Returns:
        DataFrame with one row per ``file_path`` holding the summed change
        counts, the mean complexity, the derived ``change_rate`` and
        ``insertion_deletion_ratio`` columns, and the boolean ``is_hotspot``
        label.
    """
    # Aggregate by file path
    features = df.groupby('file_path').agg({
        'lines_changed': 'sum',
        'insertions': 'sum',
        'deletions': 'sum',
        'commit_frequency': 'sum',
        'complexity': 'mean'
    }).reset_index()

    # Average number of lines touched per commit for each file.
    features['change_rate'] = features['lines_changed'] / features['commit_frequency']
    # The +1 keeps the ratio finite for files that never had a deletion.
    features['insertion_deletion_ratio'] = features['insertions'] / (features['deletions'] + 1)

    # A file is a hotspot only when it is strictly above the threshold on
    # BOTH commit frequency and complexity.
    frequency_cutoff = features['commit_frequency'].quantile(hotspot_quantile)
    complexity_cutoff = features['complexity'].quantile(hotspot_quantile)
    features['is_hotspot'] = (
        (features['commit_frequency'] > frequency_cutoff) &
        (features['complexity'] > complexity_cutoff)
    )

    return features

## Model Training

Now we'll train our Random Forest model to predict hotspots.

In [None]:
def train_model(features):
    """Fit a random-forest hotspot classifier on per-file features.

    Args:
        features: DataFrame containing the columns ``lines_changed``,
            ``commit_frequency``, ``complexity``, ``change_rate``,
            ``insertion_deletion_ratio`` and the target ``is_hotspot``.

    Returns:
        Tuple ``(model, scaler)``: the fitted ``RandomForestClassifier`` and
        the ``StandardScaler`` fitted on the training split. The same scaler
        must be applied to inputs before calling the model.

    Side effects:
        Prints the train and test accuracy.
    """
    feature_columns = [
        'lines_changed',
        'commit_frequency',
        'complexity',
        'change_rate',
        'insertion_deletion_ratio'
    ]
    X = features[feature_columns]
    y = features['is_hotspot']

    # Stratify so both splits keep the same hotspot ratio; a plain random
    # split can leave the test set without any positive examples when
    # hotspots are rare.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    except ValueError:
        # Stratification is not possible for this data (for example a class
        # with a single member), so fall back to the plain random split. Any
        # error unrelated to stratification is raised again by this call.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

    # Fit the scaler on the training split only, then reuse it for the test
    # split, so test-set statistics do not influence training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Evaluate model
    # NOTE(review): accuracy can look high on an imbalanced label even when
    # few hotspots are found -- consider reporting precision/recall as well.
    train_score = model.score(X_train_scaled, y_train)
    test_score = model.score(X_test_scaled, y_test)

    print(f"Train accuracy: {train_score:.3f}")
    print(f"Test accuracy: {test_score:.3f}")

    return model, scaler

## Save Model

Finally, we'll save our trained model for use in the application.

In [None]:
def save_model(model, scaler, model_path, scaler_path):
    """Persist the fitted model and scaler with joblib and report where they went.

    Args:
        model: Fitted estimator to serialize.
        scaler: Fitted scaler to serialize.
        model_path: Destination for the model artifact.
        scaler_path: Destination for the scaler artifact.
    """
    artifacts = ((model, model_path), (scaler, scaler_path))
    for artifact, destination in artifacts:
        joblib.dump(artifact, destination)

    # Confirmations are printed only after both dumps have completed.
    print(f"Model saved to {model_path}")
    print(f"Scaler saved to {scaler_path}")

## Run Training Pipeline

Let's run our complete training pipeline.

In [None]:
# Set paths
# REPO_PATH is a placeholder: point it at the Git repository to mine for history.
REPO_PATH = "path/to/your/repo"
# Output locations for the fitted model and scaler (relative paths).
MODEL_PATH = "../ai/models/hotspot_prediction_model.joblib"
SCALER_PATH = "../ai/models/hotspot_prediction_scaler.joblib"

# Collect and prepare data: per-commit rows first, then per-file features and labels
raw_data = collect_training_data(REPO_PATH)
features = engineer_features(raw_data)

# Train model (also prints train/test accuracy)
model, scaler = train_model(features)

# Save model and scaler
save_model(model, scaler, MODEL_PATH, SCALER_PATH)