# Animal Adoption Prediction Model Training
This notebook trains a machine learning model to predict animal adoption outcomes using the insights from our data exploration.

## Cell 1: Setup and Import Libraries

In [1]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Set up plotting style for better visualizations.
# NOTE(review): the order presumably matters here - the matplotlib style is
# applied first and the seaborn palette second, so the palette is not reset by
# the style call; confirm before reordering these two lines.
plt.style.use('default')
sns.set_palette("husl")
# Confirmation message so the notebook user can see that the setup cell ran.
print("✅ Dependencies imported successfully")

## Cell 2: Configuration and Initialization

In [2]:
# Configuration - Get these values from terraform output
import subprocess

def get_terraform_output(output_name):
    """Return the value of a Terraform output as a stripped string.

    The Terraform working directory is searched for relative to the current
    working directory; the first candidate that contains a ``main.tf`` file
    is used.

    Args:
        output_name: Name of the output to read (passed to
            ``terraform output -raw``).

    Returns:
        The command's standard output with surrounding whitespace removed.

    Raises:
        FileNotFoundError: If no candidate directory contains ``main.tf``
            (``subprocess.run`` raises the same type when the ``terraform``
            executable itself is missing).
        RuntimeError: If ``terraform output`` exits with a non-zero status,
            e.g. because the output does not exist or the state is not
            initialised.
    """
    # Candidate locations, in order of preference.
    candidate_dirs = (
        '../terraform',           # If running from notebooks/ subdirectory
        './terraform',            # If running from project root
        '../../terraform',        # If running from notebooks/subfolder
    )

    terraform_dir = next(
        (
            path for path in candidate_dirs
            if os.path.isfile(os.path.join(path, 'main.tf'))
        ),
        None,
    )

    if terraform_dir is None:
        raise FileNotFoundError(
            "Could not find terraform directory. Make sure you're running this notebook "
            "from the project root or notebooks/ directory, and that terraform/ exists."
        )

    result = subprocess.run(
        ['terraform', 'output', '-raw', output_name],
        cwd=terraform_dir,
        capture_output=True,
        text=True,
    )

    # Without this check a failed command would be reported as an empty
    # string and silently used as a bucket name / role ARN by the caller.
    if result.returncode != 0:
        raise RuntimeError(
            f"terraform output '{output_name}' failed "
            f"(exit code {result.returncode}): {result.stderr.strip()}"
        )

    return result.stdout.strip()

# Resolve the deployment configuration from Terraform. When that is not
# possible the notebook falls back to a local-only setup: no bucket, no role
# and no SageMaker session.
REGION = 'us-east-1'

try:
    # Get actual values from your deployed infrastructure
    BUCKET_NAME = get_terraform_output('s3_bucket_name')
    SAGEMAKER_ROLE = get_terraform_output('sagemaker_role_arn')

    # An empty value is as useless as a missing one; treat it as a failure so
    # the local fallback below is used instead of an empty bucket name.
    if not BUCKET_NAME or not SAGEMAKER_ROLE:
        raise ValueError("Terraform returned an empty bucket name or role ARN")

    print(f"Using bucket: {BUCKET_NAME}")
    print(f"Using role: {SAGEMAKER_ROLE}")

    # Initialize SageMaker session
    sagemaker_session = sagemaker.Session()

except Exception as e:
    # Broad catch is deliberate: any failure here (terraform missing, no
    # credentials, ...) should degrade to local testing, not stop the notebook.
    print(f"❌ Could not get Terraform outputs: {e}")
    print("Using local configuration for testing")
    BUCKET_NAME = None
    SAGEMAKER_ROLE = None
    # Define the name in the fallback path too, so later cells can test
    # `sagemaker_session is None` instead of hitting a NameError.
    sagemaker_session = None

## Cell 3: Data Loading Function

In [3]:
def _add_derived_features(df):
    """Add ``age_in_days`` (when absent) and outcome month/hour columns in place."""
    # Add age in days if not present
    if 'age_in_days' not in df.columns:
        print("Converting age to days...")
        df['age_in_days'] = df['Age upon Outcome'].apply(parse_age_in_days)

    # Add time features
    print("Adding time features...")
    df['DateTime'] = pd.to_datetime(df['DateTime'])
    df['outcome_month'] = df['DateTime'].dt.month
    df['outcome_hour'] = df['DateTime'].dt.hour
    return df


def _load_raw_csv_from_s3(bucket_name):
    """Load the first CSV found under ``raw/`` in the bucket, or return None."""
    s3 = boto3.client('s3')

    # Paginate so buckets with more objects than fit in one listing response
    # are still fully inspected.
    keys = []
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix='raw/'):
        keys.extend(obj['Key'] for obj in page.get('Contents', []))

    if not keys:
        print("No files found in S3 bucket")
        return None

    print("Available files in S3:")
    for key in keys:
        print(f"  - {key}")

    csv_keys = [key for key in keys if key.lower().endswith('.csv')]
    if not csv_keys:
        print("No CSV files found under raw/ in S3 bucket")
        return None

    key = csv_keys[0]  # Use the first CSV file found
    print(f"Loading from S3: {key}")
    obj = s3.get_object(Bucket=bucket_name, Key=key)
    df = pd.read_csv(obj['Body'])
    print(f"✅ Loaded data from S3: {len(df):,} records")
    return _add_derived_features(df)


def prepare_data():
    """Load the animal outcome data from the first available source.

    Sources are tried in this order:

    1. ``../data/processed_animal_data.csv`` (returned as-is),
    2. ``../data/austin_animal_outcomes.csv`` (raw data; ``age_in_days`` and
       the outcome month/hour columns are derived),
    3. the first CSV under ``raw/`` in the S3 bucket named by the module-level
       ``BUCKET_NAME`` (same derived columns), if a bucket is configured.

    Returns:
        A pandas DataFrame with the loaded data.

    Raises:
        FileNotFoundError: If none of the sources yields data. Errors while
            talking to S3 are printed and treated as "source unavailable".
    """
    # 1. Try processed data from data exploration notebook
    try:
        df = pd.read_csv('../data/processed_animal_data.csv')
    except FileNotFoundError:
        print("No processed data file found, trying other sources...")
    else:
        print(f"✅ Loaded processed data from file: {len(df):,} records")
        return df

    # 2. Try raw data from local file
    print("Loading raw data from local file...")
    try:
        df = pd.read_csv('../data/austin_animal_outcomes.csv')
    except FileNotFoundError:
        print("No local data file found, trying S3...")
    else:
        print(f"✅ Loaded raw data from file: {len(df):,} records")
        return _add_derived_features(df)

    # 3. Try S3 as last resort (if bucket is configured)
    if BUCKET_NAME:
        print(f"Attempting to load from S3 bucket: {BUCKET_NAME}")
        try:
            df = _load_raw_csv_from_s3(BUCKET_NAME)
        except Exception as e:
            # Best effort: an unreachable bucket must not mask the more
            # helpful instructions raised below.
            print(f"Failed to load from S3: {e}")
        else:
            if df is not None:
                return df
    else:
        print("No S3 bucket configured, skipping S3 check")

    # 4. If all else fails, provide instructions
    raise FileNotFoundError(
        "❌ Could not find data in any location!\n"
        "Please ensure you have one of:\n"
        "1. Run the data exploration notebook first to create processed_animal_data.csv\n"
        "2. Place austin_animal_outcomes.csv in the ../data/ directory\n"
        "3. Upload data to your S3 bucket in the raw/ folder"
    )

def parse_age_in_days(age_str):
    """Convert an age description such as ``"2 years"`` to a number of days.

    The unit is detected by substring (``year``, ``month``, ``week``, ``day``,
    checked in that order, case-insensitively) and the first whitespace
    separated token is taken as the quantity. Years count as 365 days and
    months as 30 days.

    Args:
        age_str: The age text; missing values (``None``/NaN) are accepted.

    Returns:
        The age in days as a float, or ``np.nan`` when the value is missing,
        has no recognised unit, has a non-numeric quantity, or is negative
        (a negative age is a data-entry error and is treated as missing so it
        does not distort the downstream median imputation).
    """
    if pd.isna(age_str):
        return np.nan

    text = str(age_str).lower().strip()

    # (unit substring, days per unit) - order matches the original lookup.
    days_per_unit = (
        ('year', 365),
        ('month', 30),
        ('week', 7),
        ('day', 1),
    )

    for unit, days in days_per_unit:
        if unit not in text:
            continue
        try:
            age = float(text.split()[0]) * days
        except (ValueError, IndexError):
            return np.nan
        # `age >= 0` is also False for NaN, so NaN quantities stay NaN.
        return age if age >= 0 else np.nan

    return np.nan

# Locate a data source and load it into the notebook-level DataFrame `df`
print("🔍 Looking for data sources...")
df = prepare_data()

# Print a short overview: shape first, then either the outcome distribution
# or, when the target column is absent, the list of columns that do exist
print(f"\n📊 Dataset Overview:")
print(f"Dataset shape: {df.shape}")
if 'Outcome Type' not in df.columns:
    print("Available columns:", list(df.columns))
else:
    print(f"Outcome types: {df['Outcome Type'].value_counts().to_dict()}")

## Cell 4: Data Preprocessing Function

In [4]:
def preprocess_features(df):
    """Build the model-ready feature columns and the binary target.

    Works on a copy of ``df``. For each of the categorical source columns that
    is present, missing values become ``'unknown'`` and a simplified feature
    column is added (``animal_type``, ``sex_outcome``, ``primary_breed``,
    ``color``); all but the animal type keep only their 20 most frequent
    values, everything else becoming ``'other'``. Missing ``age_in_days``
    values are filled with the median, month/hour features are derived from
    ``DateTime`` when not already present, and ``adopted_label`` is 1 where
    ``Outcome Type`` equals ``'Adoption'``.

    Returns:
        The processed copy of the DataFrame.
    """
    data = df.copy()

    def collapse_rare(values):
        # Keep the 20 most frequent values, relabel the rest as 'other'.
        common = values.value_counts().head(20).index
        return values.where(values.isin(common), 'other')

    for source_col in ('Animal Type', 'Sex upon Outcome', 'Breed', 'Color'):
        if source_col not in data.columns:
            continue

        data[source_col] = data[source_col].fillna('unknown')

        if source_col == 'Breed':
            # The primary breed is the first word of the breed description.
            first_word = data[source_col].str.split().str[0]
            data['primary_breed'] = collapse_rare(first_word)
            continue

        # e.g. 'Sex upon Outcome' -> 'sex_outcome'
        feature_name = source_col.lower().replace(' ', '_').replace('upon_', '')
        if source_col == 'Animal Type':
            # Every animal type is kept as-is.
            data[feature_name] = data[source_col]
        else:
            data[feature_name] = collapse_rare(data[source_col])

    # Impute missing ages with the median age.
    if 'age_in_days' in data.columns:
        median_age = data['age_in_days'].median()
        data['age_in_days'] = data['age_in_days'].fillna(median_age)

    # Derive time features unless an earlier step already provided them.
    if 'DateTime' in data.columns:
        data['DateTime'] = pd.to_datetime(data['DateTime'])
        for feature_name, component in (('outcome_month', 'month'), ('outcome_hour', 'hour')):
            if feature_name not in data.columns:
                data[feature_name] = getattr(data['DateTime'].dt, component)

    # Binary target: 1 for Adoption, 0 for every other outcome.
    is_adopted = data['Outcome Type'] == 'Adoption'
    data['adopted_label'] = is_adopted.astype(int)

    print(f"✅ Preprocessing complete. Binary adoption rate: {data['adopted_label'].mean():.3f}")

    return data

# Preprocess the data. The result is stored in the notebook-level variable
# `df_processed`; the original `df` is left untouched because
# preprocess_features() works on a copy.
df_processed = preprocess_features(df)
print(f"Processed dataset shape: {df_processed.shape}")

## Cell 5: Train/Test Split Function

In [5]:
def train_model(data=None, feature_columns=None, test_size=0.2, random_state=42):
    """Split the preprocessed data into stratified training and test sets.

    Despite its name (kept so existing callers continue to work), this
    function does not fit a model; it only selects the feature columns and
    performs the train/test split.

    Args:
        data: Preprocessed DataFrame containing the feature columns and
            ``adopted_label``. Defaults to the notebook-level ``df_processed``.
        feature_columns: Candidate feature names; those missing from ``data``
            are skipped. Defaults to the notebook's standard feature list.
        test_size: Fraction of rows held out for testing.
        random_state: Seed passed to ``train_test_split`` for reproducibility.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If none of the candidate feature columns exist in ``data``.
    """
    if data is None:
        # Backward-compatible default: use the notebook-level DataFrame.
        data = df_processed

    if feature_columns is None:
        feature_columns = [
            'animal_type', 'sex_outcome', 'age_in_days',
            'primary_breed', 'color', 'outcome_month',
        ]

    # Check which features exist
    available_features = [col for col in feature_columns if col in data.columns]
    print(f"Available features: {available_features}")

    if not available_features:
        # Fail here with a clear message instead of producing a zero-column
        # feature matrix that only breaks later, during model fitting.
        raise ValueError(
            f"None of the expected feature columns {list(feature_columns)} "
            "are present in the data"
        )

    # Use available features
    X = data[available_features]
    y = data['adopted_label']  # Target: 1 = adopted, 0 = not adopted

    # Stratify so both splits keep the same adoption rate.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    print(f"Training adoption rate: {y_train.mean():.3f}")
    print(f"Test adoption rate: {y_test.mean():.3f}")

    return X_train, X_test, y_train, y_test

# Split the data. Called without arguments, train_model() reads the
# notebook-level `df_processed`, so the preprocessing cell must have run first.
X_train, X_test, y_train, y_test = train_model()

## Cell 6: Feature Encoding

In [6]:
def encode_features(X_train, X_test):
    """Label-encode the text columns of the training and test features.

    Each text column gets its own ``LabelEncoder`` fitted on the training
    values only. Test values that were not seen during fitting are replaced
    by ``'unknown'`` when that is a known class, otherwise by the most
    frequent training value, before being transformed.

    Args:
        X_train: Training feature DataFrame.
        X_test: Test feature DataFrame with the same columns.

    Returns:
        ``(X_train_encoded, X_test_encoded, label_encoders)`` where the first
        two are encoded copies of the inputs and ``label_encoders`` maps each
        encoded column name to its fitted encoder.
    """
    # Create copies to avoid modifying originals
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()

    # Store encoders for later use
    label_encoders = {}

    for column in X_train_encoded.columns:
        # Detect text columns via the pandas type helpers so that both the
        # classic `object` dtype and dedicated string dtypes are encoded.
        dtype = X_train_encoded[column].dtype
        is_text = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if not is_text:
            continue

        print(f"Encoding {column}...")

        # Create and fit encoder on training data
        train_values = X_train_encoded[column].astype(str)
        encoder = LabelEncoder()
        X_train_encoded[column] = encoder.fit_transform(train_values)

        # Apply to test data, handling unseen categories
        test_values = X_test_encoded[column].astype(str)
        unseen_mask = ~test_values.isin(encoder.classes_)
        if unseen_mask.any():
            print(f"  Found {unseen_mask.sum()} rows with unseen categories in {column}")
            if 'unknown' in encoder.classes_:
                fallback = 'unknown'
            else:
                # Take the mode of the stringified training values so the
                # fallback is guaranteed to be one of the encoder's classes.
                fallback = train_values.mode().iloc[0]
            test_values = test_values.mask(unseen_mask, fallback)

        X_test_encoded[column] = encoder.transform(test_values)
        label_encoders[column] = encoder

        print(f"  {column}: {len(encoder.classes_)} unique values")

    return X_train_encoded, X_test_encoded, label_encoders

# Encode features. The encoders are fitted on the training split only and are
# kept in `label_encoders` (column name -> fitted encoder) so they can be
# saved alongside the model.
X_train_encoded, X_test_encoded, label_encoders = encode_features(X_train, X_test)
print(f"\n✅ Feature encoding complete!")
print(f"Training features shape: {X_train_encoded.shape}")
print(f"Test features shape: {X_test_encoded.shape}")

## Cell 7: Model Training and Evaluation

In [7]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit a Random Forest classifier and report its test-set performance.

    Prints the accuracy, a per-class classification report and the ten most
    important features, and shows a bar chart of those importances.

    Args:
        X_train: Encoded training features (a DataFrame; its column names are
            used to label the feature importances).
        X_test: Encoded test features.
        y_train: Training labels (1 = adopted, 0 = not adopted).
        y_test: Test labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("🚀 Training Random Forest model...")

    # Create and train the model
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1,  # Use all available cores
    )
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n📊 Model Performance:")
    print(f"Accuracy: {accuracy:.3f}")

    print(f"\n📋 Detailed Classification Report:")
    # Passing `labels` keeps the two target names aligned with the classes
    # even when one class happens to be absent from the test split.
    print(classification_report(
        y_test, y_pred, labels=[0, 1], target_names=['Not Adopted', 'Adopted']
    ))

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    top_features = feature_importance.head(10)

    print(f"\n🔍 Top 10 Most Important Features:")
    for row in top_features.itertuples(index=False):
        print(f"  {row.feature}: {row.importance:.3f}")

    # Plot feature importance. The axes are created explicitly and handed to
    # pandas; otherwise DataFrame.plot opens a figure of its own, ignoring the
    # requested size and leaving an empty figure behind.
    fig, ax = plt.subplots(figsize=(10, 6))
    top_features.plot(x='feature', y='importance', kind='bar', ax=ax, rot=45)
    ax.set_title('Top 10 Feature Importance')
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance Score')
    fig.tight_layout()
    plt.show()

    return model

# Train and evaluate the model on the label-encoded features. The fitted
# estimator is kept in the notebook-level `model`, which the save step reads.
model = train_and_evaluate(X_train_encoded, X_test_encoded, y_train, y_test)

## Cell 8: Save Model and Encoders

In [8]:
def save_model_artifacts():
    """Write the model, the label encoders and a feature summary to ../models.

    Reads the notebook-level ``model``, ``label_encoders`` and
    ``X_train_encoded``. The model and encoders are stored with joblib; the
    feature names, their count and the target name go to ``model_info.json``.

    Returns:
        ``(model_path, encoders_path)`` - the paths of the two joblib files.
    """
    import json

    model_path = '../models/animal_adoption_model.pkl'
    encoders_path = '../models/label_encoders.pkl'

    # Make sure the output directory is there before anything is written.
    os.makedirs('../models', exist_ok=True)

    # Trained estimator
    joblib.dump(model, model_path)
    print(f"✅ Model saved to {model_path}")

    # Fitted label encoders, needed to encode inputs at prediction time
    joblib.dump(label_encoders, encoders_path)
    print(f"✅ Encoders saved to {encoders_path}")

    # Human-readable description of the expected model inputs
    feature_names = list(X_train_encoded.columns)
    feature_info = {
        'feature_names': feature_names,
        'n_features': len(feature_names),
        'target_name': 'adopted_label'
    }
    with open('../models/model_info.json', 'w') as f:
        json.dump(feature_info, f, indent=2)
    print(f"✅ Model info saved to ../models/model_info.json")

    print(f"\n🎯 Model training completed and artifacts saved!")

    return model_path, encoders_path

# Persist the artifacts and keep the returned paths for the summary below and
# for the verification cell
model_path, encoders_path = save_model_artifacts()

# One line per fact; `sep` puts each argument on its own output line
print(
    "\n📦 Training Summary:",
    f"Model type: Random Forest Classifier",
    f"Features used: {len(X_train_encoded.columns)}",
    f"Training samples: {len(X_train_encoded)}",
    f"Test samples: {len(X_test_encoded)}",
    f"Model saved to: {model_path}",
    f"Encoders saved to: {encoders_path}",
    sep="\n",
)

## Cell 9: Test Model Loading (Verification)

In [9]:
# Verification: reload the persisted artifacts and check that they are usable
print("🔍 Testing saved model loading...")

# Deserialize the model and the encoders from the paths returned by the save step
loaded_model = joblib.load(model_path)
loaded_encoders = joblib.load(encoders_path)

# Score the first five encoded test rows with the reloaded model
test_predictions = loaded_model.predict(X_test_encoded.iloc[:5])
test_probabilities = loaded_model.predict_proba(X_test_encoded.iloc[:5])[:, 1]

# Show predictions next to the true labels for a quick visual comparison
print(
    f"✅ Model loaded successfully!",
    f"Test predictions: {test_predictions}",
    f"Test probabilities: {test_probabilities.round(3)}",
    f"Actual values: {y_test.iloc[:5].values}",
    sep="\n",
)

print(
    "\n🎉 ML Training Pipeline Complete!",
    "Ready for model deployment to SageMaker.",
    sep="\n",
)