In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, LeakyReLU, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

In [2]:
def load_data(file_path, target_col='stress_score_weighted'):
    """Load and prepare data from a CSV file with improved error handling"""
    # Load data
    try:
        # Try to read CSV with pandas, which will handle the header row correctly
        df = pd.read_csv(file_path)
        print(f"Successfully loaded data with shape: {df.shape}")
        
        # Print first few rows to help diagnose issues
        print("\nFirst 5 rows of data:")
        print(df.head())
        
        # Print column names to help identify issues
        print("\nColumn names:")
        print(df.columns.tolist())
        
        # Check for the target column
        if target_col not in df.columns:
            # If target column not found, check if there's a similar column
            possible_targets = [col for col in df.columns if 'stress' in col.lower()]
            if possible_targets:
                print(f"\nTarget column '{target_col}' not found, but found these possible stress-related columns:")
                print(possible_targets)
                target_col = possible_targets[0]  # Use the first match
                print(f"Using '{target_col}' as the target column")
            else:
                raise ValueError(f"Target column '{target_col}' not found in dataset and no stress-related columns found")
        
        # Check for non-numeric columns
        non_numeric_cols = []
        for col in df.columns:
            try:
                pd.to_numeric(df[col])
            except:
                non_numeric_cols.append(col)
        
        if non_numeric_cols:
            print(f"\nWarning: Found non-numeric columns: {non_numeric_cols}")
            print("These columns will be dropped for modeling")
            
            # Drop non-numeric columns except the target (we'll handle that separately)
            for col in non_numeric_cols:
                if col != target_col:
                    df = df.drop(columns=[col])
        
        # Handle the target column
        try:
            # Try to convert target to numeric
            df[target_col] = pd.to_numeric(df[target_col])
        except:
            print(f"\nError: Target column '{target_col}' contains non-numeric values")
            print("Sample values from target column:")
            print(df[target_col].head(10).tolist())
            raise ValueError(f"Target column '{target_col}' must be numeric")
        
        # Extract target variable and features
        X = df.drop(columns=[target_col])
        y = df[target_col]
        
        # Convert all remaining columns to numeric (just to be safe)
        for col in X.columns:
            X[col] = pd.to_numeric(X[col], errors='coerce')
        
        # Handle any NaN values created by coercion
        X = X.fillna(X.mean())
        
        print(f"\nFeatures shape after preprocessing: {X.shape}")
        print(f"Target shape: {y.shape}")
        
        return X, y
        
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

In [3]:
def preprocess_data(X, y, test_size=0.2, scaler_type='standard', select_k_features=None):
    """Preprocess the data: split, scale, and optionally select features"""
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Scale features
    if scaler_type == 'robust':
        scaler = RobustScaler()  # Less influenced by outliers
    else:
        scaler = StandardScaler()  # Standard z-score normalization
        
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Optional feature selection
    feature_selector = None
    if select_k_features is not None and select_k_features < X.shape[1]:
        feature_selector = SelectKBest(f_regression, k=select_k_features)
        X_train_scaled = feature_selector.fit_transform(X_train_scaled, y_train)
        X_test_scaled = feature_selector.transform(X_test_scaled)
        
        # Get selected feature names
        selected_indices = feature_selector.get_support(indices=True)
        selected_features = X.columns[selected_indices]
        print(f"Selected top {select_k_features} features: {selected_features.tolist()}")
    
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler, feature_selector

In [4]:
def build_model(input_dim, architecture=[128, 64, 32], dropout_rate=0.3,
               learning_rate=0.001, activation='leaky_relu'):
    """Build a neural network model for stress prediction"""
    model = Sequential()
    
    # Input layer
    if activation == 'leaky_relu':
        model.add(Dense(architecture[0], input_dim=input_dim))
        model.add(LeakyReLU(alpha=0.1))
    else:
        model.add(Dense(architecture[0], input_dim=input_dim, activation=activation))
        
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    
    # Hidden layers
    for units in architecture[1:]:
        if activation == 'leaky_relu':
            model.add(Dense(units))
            model.add(LeakyReLU(alpha=0.1))
        else:
            model.add(Dense(units, activation=activation))
            
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))
    
    # Output layer for regression
    model.add(Dense(1, activation='linear'))
    
    # Compile the model
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    
    # Display model summary
    model.summary()
    
    return model

In [5]:
def train_model(model, X_train, y_train, epochs=200, batch_size=32, validation_split=0.2,
               patience=25, verbose=1, model_path=None):
    """Train the model with early stopping and learning rate reduction"""
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=1e-6)
    ]
    
    # Add model checkpoint if path provided
    if model_path:
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        callbacks.append(ModelCheckpoint(
            f'{model_path}/best_model.h5', save_best_only=True, monitor='val_loss'
        ))
    
    # Train the model
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        callbacks=callbacks,
        verbose=verbose
    )
    
    # Plot training history
    plot_training_history(history)
    
    return model, history

In [74]:
def evaluate_model(model, X_test, y_test):
    """Evaluate the model on test data"""
    # Make predictions
    y_pred = model.predict(X_test).flatten()
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    metrics = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R²': r2
    }
    
    # Print metrics
    print("\nModel Evaluation:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
        
    # Plot predictions
    plot_predictions(y_test, y_pred)
        
    return metrics, y_pred

In [76]:
def feature_importance(X, y):
    """Calculate feature importance using correlation with target"""
    # Create a dataframe with features and target
    df = pd.concat([X, pd.Series(y, name='target')], axis=1)
    
    # Calculate correlation with target
    correlations = df.corr()['target'].drop('target').abs().sort_values(ascending=False)
    
    # Display top correlations
    print("\nTop feature correlations with stress score:")
    print(correlations.head(10))
    
    # Plot top 15 correlations
    plt.figure(figsize=(12, 8))
    top_15 = correlations.head(15)
    sns.barplot(x=top_15.values, y=top_15.index)
    plt.title('Top 15 Features by Correlation with Stress Score')
    plt.xlabel('Absolute Correlation')
    plt.tight_layout()
    plt.show()
    
    return correlations

In [77]:
def cross_validate(X, y, n_splits=5, epochs=100, batch_size=32):
    """Perform k-fold cross-validation"""
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_metrics = []
    
    print(f"\nPerforming {n_splits}-fold cross-validation:")
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"\nFold {fold+1}/{n_splits}")
        
        # Split data
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
        
        # Preprocess
        scaler = RobustScaler()  # Use RobustScaler for cross-validation
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_val_scaled = scaler.transform(X_val_fold)
        
        # Build model for this fold
        input_dim = X_train_scaled.shape[1]
        model = build_model(input_dim)
        
        # Training with early stopping
        callbacks = [EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)]
        model.fit(
            X_train_scaled, y_train_fold,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val_scaled, y_val_fold),
            callbacks=callbacks,
            verbose=0
        )
        
        # Evaluate
        y_pred = model.predict(X_val_scaled).flatten()
        metrics = {
            'MSE': mean_squared_error(y_val_fold, y_pred),
            'RMSE': np.sqrt(mean_squared_error(y_val_fold, y_pred)),
            'MAE': mean_absolute_error(y_val_fold, y_pred),
            'R²': r2_score(y_val_fold, y_pred)
        }
        fold_metrics.append(metrics)
        print(f"Fold {fold+1} metrics: MSE={metrics['MSE']:.4f}, R²={metrics['R²']:.4f}")
    
    # Calculate average metrics
    avg_metrics = {
        metric: np.mean([fold[metric] for fold in fold_metrics])
        for metric in fold_metrics[0].keys()
    }
    
    std_metrics = {
        metric: np.std([fold[metric] for fold in fold_metrics])
        for metric in fold_metrics[0].keys()
    }
    
    print("\nCross-validation results:")
    for metric in avg_metrics.keys():
        print(f"{metric}: {avg_metrics[metric]:.4f} ± {std_metrics[metric]:.4f}")
        
    # Plot cross-validation results
    plt.figure(figsize=(10, 6))
    metrics_to_plot = ['MSE', 'MAE', 'R²']
    for i, metric in enumerate(metrics_to_plot):
        plt.subplot(1, 3, i+1)
        values = [fold[metric] for fold in fold_metrics]
        plt.bar(range(1, n_splits+1), values)
        plt.title(f'{metric} by Fold')
        plt.xlabel('Fold')
        plt.ylabel(metric)
    plt.tight_layout()
    plt.show()
        
    return {
        'fold_metrics': fold_metrics,
        'avg_metrics': avg_metrics,
        'std_metrics': std_metrics
    }

In [80]:
def save_model(model, scaler, feature_selector, feature_names, model_path='stress_model'):
    """Save the model, scaler, and feature selector to disk"""
    # Create directory if it doesn't exist
    if not os.path.exists(model_path):
        os.makedirs(model_path)
        
    # Save model
    model.save(f'{model_path}/model.h5')
    
    # Save scaler
    if scaler is not None:
        joblib.dump(scaler, f'{model_path}/scaler.pkl')
        
    # Save feature selector
    if feature_selector is not None:
        joblib.dump(feature_selector, f'{model_path}/feature_selector.pkl')
        
    # Save feature names
    if feature_names is not None:
        pd.Series(feature_names).to_csv(f'{model_path}/feature_names.csv', index=False)
        
    print(f"Model and preprocessors saved to {model_path}/")
    
    return f"Model saved to {model_path}/"

In [81]:
def load_saved_model(model_path='stress_model'):
    """Load the model, scaler, and feature selector from disk"""
    try:
        # Load model
        model = load_model(f'{model_path}/model.h5')
        print("Model loaded successfully")
        
        # Load scaler
        scaler = None
        scaler_path = f'{model_path}/scaler.pkl'
        if os.path.exists(scaler_path):
            scaler = joblib.load(scaler_path)
            print("Scaler loaded successfully")
            
        # Load feature selector
        feature_selector = None
        selector_path = f'{model_path}/feature_selector.pkl'
        if os.path.exists(selector_path):
            feature_selector = joblib.load(selector_path)
            print("Feature selector loaded successfully")
            
        # Load feature names
        feature_names = None
        feature_names_path = f'{model_path}/feature_names.csv'
        if os.path.exists(feature_names_path):
            feature_names = pd.read_csv(feature_names_path).iloc[:, 0].values
            print(f"Loaded {len(feature_names)} feature names")
            
        return model, scaler, feature_selector, feature_names
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None, None, None

In [82]:
def predict_stress(model, new_data, scaler=None, feature_selector=None, feature_names=None):
    """Make stress predictions on new data"""
    # Convert to DataFrame if array
    if not isinstance(new_data, pd.DataFrame) and feature_names is not None:
        new_data = pd.DataFrame(new_data, columns=feature_names)
        
    # Ensure we have the right features
    if isinstance(new_data, pd.DataFrame) and feature_names is not None:
        missing_features = set(feature_names) - set(new_data.columns)
        if missing_features:
            print(f"Warning: Missing features in input data: {missing_features}")
            return None
            
        # Reorder columns to match training data
        new_data = new_data[feature_names]
        
    # Preprocess data
    if scaler is not None:
        new_data = scaler.transform(new_data)
        
    if feature_selector is not None:
        new_data = feature_selector.transform(new_data)
        
    # Make predictions
    predictions = model.predict(new_data).flatten()
    
    # Display summary statistics of predictions
    print("\nPrediction Results:")
    print(f"Mean predicted stress: {predictions.mean():.2f}")
    print(f"Min predicted stress: {predictions.min():.2f}")
    print(f"Max predicted stress: {predictions.max():.2f}")
    
    # Plot histogram of predictions
    plt.figure(figsize=(10, 6))
    plt.hist(predictions, bins=20, alpha=0.7, color='blue')
    plt.title('Distribution of Predicted Stress Scores')
    plt.xlabel('Predicted Stress Score')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)
    plt.show()
    
    return predictions

In [83]:
def plot_training_history(history):
    """Plot the training history"""
    plt.figure(figsize=(12, 5))
    
    # Plot loss
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.ylabel('Mean Squared Error')
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Plot MAE
    plt.subplot(1, 2, 2)
    plt.plot(history.history['mae'], label='Training MAE')
    plt.plot(history.history['val_mae'], label='Validation MAE')
    plt.title('Mean Absolute Error')
    plt.ylabel('MAE')
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

In [84]:
def tune_hyperparameters(X, y, param_grid):
    """Simple hyperparameter tuning for the neural network model"""
    # Split data once for consistent evaluation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Scale data
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    # Initialize results
    results = []
    best_val_loss = float('inf')
    best_params = None
    best_model = None
    
    # Create parameter combinations
    def create_param_combinations(param_grid):
        keys = param_grid.keys()
        combinations = []
        
        # Recursive function to generate combinations
        def recurse(combination, keys_list):
            if not keys_list:
                combinations.append(combination.copy())
                return
            
            current_key = keys_list[0]
            for value in param_grid[current_key]:
                combination[current_key] = value
                recurse(combination, keys_list[1:])
        
        recurse({}, list(keys))
        return combinations
    
    # Get all parameter combinations
    param_combinations = create_param_combinations(param_grid)
    print(f"Testing {len(param_combinations)} hyperparameter combinations...")
    
    for i, params in enumerate(param_combinations):
        print(f"\nTesting combination {i+1}/{len(param_combinations)}: {params}")
        
        # Build model with current parameters
        input_dim = X_train_scaled.shape[1]
        model = Sequential()
        
        # Input layer
        if params.get('activation', 'relu') == 'leaky_relu':
            model.add(Dense(params['first_layer'], input_dim=input_dim))
            model.add(LeakyReLU(alpha=0.1))
        else:
            model.add(Dense(params['first_layer'], input_dim=input_dim, 
                           activation=params.get('activation', 'relu')))
            
        model.add(BatchNormalization())
        model.add(Dropout(params.get('dropout_rate', 0.3)))
        
        # Hidden layers (if specified)
        if 'second_layer' in params and params['second_layer'] > 0:
            if params.get('activation', 'relu') == 'leaky_relu':
                model.add(Dense(params['second_layer']))
                model.add(LeakyReLU(alpha=0.1))
            else:
                model.add(Dense(params['second_layer'], 
                               activation=params.get('activation', 'relu')))
                
            model.add(BatchNormalization())
            model.add(Dropout(params.get('dropout_rate', 0.3)))
        
        # Output layer
        model.add(Dense(1, activation='linear'))
        
        # Compile model
        optimizer = Adam(learning_rate=params.get('learning_rate', 0.001))
        model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
        
        # Train with early stopping
        callbacks = [EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)]
        history = model.fit(
            X_train_scaled, y_train,
            epochs=100,  # Max epochs
            batch_size=params.get('batch_size', 32),
            validation_data=(X_val_scaled, y_val),
            callbacks=callbacks,
            verbose=0
        )
        
        # Evaluate on validation set
        val_loss = model.evaluate(X_val_scaled, y_val, verbose=0)[0]
        y_pred = model.predict(X_val_scaled).flatten()
        val_r2 = r2_score(y_val, y_pred)
        
        print(f"Val Loss: {val_loss:.4f}, Val R²: {val_r2:.4f}")
        
        # Save results
        results.append({
            'params': params,
            'val_loss': val_loss,
            'val_r2': val_r2,
            'epochs_trained': len(history.history['loss'])
        })
        
        # Check if this is best so far
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_params = params
            best_model = model
            print(f"New best model found!")
    
    # Sort results by validation loss
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('val_loss')
    
    print("\nTop 5 hyperparameter combinations:")
    print(results_df.head(5)[['params', 'val_loss', 'val_r2']])
    
    print(f"\nBest hyperparameters: {best_params}")
    print(f"Best validation loss: {best_val_loss:.4f}")
    
    # Plot top combinations
    plt.figure(figsize=(12, 6))
    top_5 = results_df.head(5)
    plt.subplot(1, 2, 1)
    plt.bar(range(5), top_5['val_loss'])
    plt.title('Validation Loss - Top 5')
    plt.xlabel('Combination Rank')
    plt.ylabel('MSE')
    
    plt.subplot(1, 2, 2)
    plt.bar(range(5), top_5['val_r2'])
    plt.title('Validation R² - Top 5')
    plt.xlabel('Combination Rank')
    plt.ylabel('R²')
    
    plt.tight_layout()
    plt.show()
    
    return best_model, best_params, results_df

In [6]:
def example_workflow(file_path, target_col='stress_score_weighted'):
    """Example of how to use the functions in sequence"""
    # 1. Load data
    X, y = load_data(file_path, target_col=target_col)
    
    # 2. Preprocess data
    X_train, X_test, y_train, y_test, scaler, feature_selector = preprocess_data(
        X, y, scaler_type='robust'
    )
    
    # 3. Build model
    model = build_model(input_dim=X_train.shape[1])
    
    # 4. Train model
    trained_model, history = train_model(model, X_train, y_train)
    
    # 5. Evaluate model
    metrics, predictions = evaluate_model(trained_model, X_test, y_test)
    
    # 6. Save model if needed
    save_model(trained_model, scaler, feature_selector, X.columns.tolist())
    
    return trained_model, metrics, predictions

In [11]:
if __name__ == "__main__":
    # Specify the path to your dataset file
    data_file = "Final_merged_dataset_with_weighted_stress.csv"  # Default to sample data
    
    # If you have a specific column for stress scores, specify it here
    target_column = "stress_score_weighted"
    
    # Check if the user provided a CSV file as argument
    import sys
    if len(sys.argv) > 1:
        data_file = sys.argv[1]
        print(f"Using data file from command line argument: {data_file}")
    
    try:
        # Run the full workflow
        print("Starting stress prediction analysis...")
        model, metrics, predictions = example_workflow(data_file, target_column)
        print("\nAnalysis completed successfully!")
        
        # Print final performance metrics
        print("\nFinal model performance:")
        for metric, value in metrics.items():
            print(f"{metric}: {value:.4f}")
            
    except FileNotFoundError:
        print("\nERROR: Data file not found. Please update the 'data_file' variable with the correct path.")
        print("You can also try creating a sample dataset for testing.")
        
    except Exception as e:
        print(f"\nAn error occurred during analysis: {e}")
        
        # Provide options for debugging
        print("\nDebugging suggestions:")
        print("1. Check if your data file exists and is accessible")
        print("2. Verify that your data contains the expected target column")
        print("3. Inspect the first few rows of your data to ensure it's formatted correctly")
        print("4. Check for missing values or invalid data types in your dataset")

Using data file from command line argument: --f=/Users/anchitmehra/Library/Jupyter/runtime/kernel-v3804f025fc011677f1d0d131d8b0027e821468dc9.json
Starting stress prediction analysis...
Error loading data: [Errno 2] No such file or directory: '--f=/Users/anchitmehra/Library/Jupyter/runtime/kernel-v3804f025fc011677f1d0d131d8b0027e821468dc9.json'

ERROR: Data file not found. Please update the 'data_file' variable with the correct path.
You can also try creating a sample dataset for testing.
