# Feature Engineering

This section explains the feature engineering process and selection methods used.
Navigate: [Previous (Model Development)](02_model_development.ipynb) | [Next (API Deployment)](04_deploy.ipynb)

In [None]:
#Experiment 3
import os
import pandas as pd
import numpy as np
import sqlite3
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import mlflow
import mlflow.sklearn
import logging
from datetime import datetime

# Set up logging (done first so the credential check below can report problems)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up MLflow
# SECURITY: the tracking-server access token must not be committed with the
# notebook. Export MLFLOW_TRACKING_PASSWORD in the environment before running
# this cell, and revoke any token that was previously checked in.
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'ashiashish100')
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    logger.warning(
        "MLFLOW_TRACKING_PASSWORD is not set; authenticated requests to the "
        "MLflow tracking server will be rejected"
    )
mlflow.set_tracking_uri("https://dagshub.com/ashiashish100/my-first-repo.mlflow")
mlflow.set_experiment("mental_health_feature_engineering")

def load_base_data():
    """Load the joined survey table from ``mental_health_final.db``.

    Every row of ``employees`` is kept; the employment, benefits, history and
    workplace-communication tables are LEFT JOINed on ``employee_id``, so
    columns coming from those tables are NULL/NaN when no matching row exists.

    Returns:
        pandas.DataFrame with the 13 columns named in the SELECT list.
    """
    conn = sqlite3.connect('mental_health_final.db')
    query = """
    SELECT 
        e.age,
        e.gender,
        e.country,
        emp.company_size,
        emp.is_tech_company,
        emp.work_remotely,
        mhb.has_mental_health_benefits,
        mhh.current_disorder,
        mhh.sought_treatment,
        wc.discuss_with_supervisor,
        wc.discuss_with_coworkers,
        wc.observed_negative_consequences,
        wc.interferes_with_work
    FROM employees e
    LEFT JOIN employment emp ON e.employee_id = emp.employee_id
    LEFT JOIN mental_health_benefits mhb ON e.employee_id = mhb.employee_id
    LEFT JOIN mental_health_history mhh ON e.employee_id = mhh.employee_id
    LEFT JOIN workplace_communication wc ON e.employee_id = wc.employee_id
    """
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Release the database handle even when the query fails.
        conn.close()

def engineer_features(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    Added columns: ``age_group``, ``company_size_normalized``,
    ``communication_score``, ``remote_with_benefits``,
    ``tech_environment_score``, ``workplace_support_index`` and
    ``work_impact_severity``. Missing values are imputed at the end by
    ``handle_missing_values``.

    Engineered columns are stored as ``object`` / ``int64`` / ``float64`` on
    purpose: the imputation step and the modelling code in this file select
    columns by exactly those dtypes, so a ``category`` or ``int8`` column
    would be skipped without any error.
    """
    df = df.copy()

    # 1. Age-based features: quintile buckets. qcut yields a categorical;
    #    cast to object so it is imputed and one-hot encoded like other text
    #    columns.
    df['age_group'] = pd.qcut(
        df['age'], q=5,
        labels=['very_young', 'young', 'middle', 'senior', 'very_senior']
    ).astype(object)

    # 2. Company size normalization (labels not in the map become NaN)
    size_map = {
        '1-25': 1,
        '26-100': 2,
        '100-500': 3,
        '500-1000': 4,
        '1000+': 5
    }
    df['company_size_normalized'] = df['company_size'].map(size_map)

    # 3. Communication comfort score: one point per channel answered 'yes'
    df['communication_score'] = 0
    df.loc[df['discuss_with_supervisor'] == 'yes', 'communication_score'] += 1
    df.loc[df['discuss_with_coworkers'] == 'yes', 'communication_score'] += 1

    # 4. Remote work interaction (explicit int64: plain `int` is
    #    platform-dependent)
    df['remote_with_benefits'] = (
        (df['work_remotely'] != 'never') &
        (df['has_mental_health_benefits'] == 'yes')
    ).astype('int64')

    # 5. Tech environment score
    df['tech_environment_score'] = df['is_tech_company'] * df['company_size_normalized']

    # 6. Workplace support index: count of supportive signals (0-3)
    df['workplace_support_index'] = (
        (df['has_mental_health_benefits'] == 'yes').astype('int64') +
        (df['discuss_with_supervisor'] == 'yes').astype('int64') +
        (df['observed_negative_consequences'] == 'no').astype('int64')
    )

    # 7. Work impact indicator: ordinal code 0..4; answers outside the listed
    #    categories (including missing ones) get code -1. Categorical codes are
    #    int8, so widen to int64.
    df['work_impact_severity'] = pd.Categorical(
        df['interferes_with_work'],
        categories=['never', 'rarely', 'sometimes', 'often', 'always'],
        ordered=True
    ).codes.astype('int64')

    # Handle missing values
    df = handle_missing_values(df)

    return df

def handle_missing_values(df):
    """Impute missing values in ``df`` and return it.

    ``int64``/``float64`` columns are filled with their median, ``object``
    columns with the literal ``'unknown'``. The frame passed in is modified
    in place; pass a copy if the original must stay untouched.
    """
    # Numeric columns
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        df[col] = df[col].fillna(df[col].median())

    # Categorical columns
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].fillna('unknown')

    return df

def create_feature_sets():
    """Return the named column subsets compared in this experiment.

    The result maps a feature-set name to the list of DataFrame column names
    that make it up, in the order the experiments are run.
    """
    # Raw survey columns only
    base = [
        'age', 'gender', 'country', 'company_size', 'is_tech_company',
        'work_remotely', 'has_mental_health_benefits', 'current_disorder',
    ]
    # Columns produced by the feature-engineering step only
    engineered = [
        'age_group', 'company_size_normalized', 'communication_score',
        'remote_with_benefits', 'tech_environment_score',
        'workplace_support_index', 'work_impact_severity',
    ]
    # Mix of raw and engineered columns
    combined = [
        'age', 'gender', 'country', 'is_tech_company', 'work_remotely',
        'company_size_normalized', 'communication_score',
        'workplace_support_index', 'work_impact_severity',
        'tech_environment_score',
    ]
    # Workplace-communication columns and the scores derived from them
    communication = [
        'discuss_with_supervisor', 'discuss_with_coworkers',
        'observed_negative_consequences', 'interferes_with_work',
        'communication_score', 'workplace_support_index',
    ]

    return dict(
        base_features=base,
        engineered_only=engineered,
        combined_features=combined,
        communication_focused=communication,
    )

def run_experiment(X, y, feature_set_name, features):
    """Train and evaluate a logistic-regression pipeline on one feature set.

    Opens an MLflow run named ``feature_eng_<feature_set_name>``, logs the
    feature list, makes a stratified 80/20 split of ``X[features]``, scores
    the pipeline with 5-fold cross-validated F1 on the training part, refits
    on the full training part and logs test accuracy/F1/precision/recall and
    the fitted model.

    Returns:
        (model, metrics): the fitted Pipeline and a dict of test metrics.
    """
    with mlflow.start_run(run_name=f"feature_eng_{feature_set_name}"):
        # Record which columns this run uses
        mlflow.log_param("feature_set", feature_set_name)
        mlflow.log_param("n_features", len(features))
        mlflow.log_param("features", ", ".join(features))

        # Hold out 20% for the final test, keeping class balance
        subset = X[features]
        X_train, X_test, y_train, y_test = train_test_split(
            subset, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scale int64/float64 columns, one-hot encode object columns
        numeric_features = subset.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = subset.select_dtypes(include=['object']).columns
        scale_step = ('num', StandardScaler(), numeric_features)
        encode_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

        model = Pipeline([
            ('preprocessor', ColumnTransformer(transformers=[scale_step, encode_step])),
            ('classifier', LogisticRegression(random_state=42)),
        ])

        # Cross-validate on the training part only
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()

        # Final fit and held-out predictions
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        scorers = (
            ('accuracy', accuracy_score),
            ('f1_score', f1_score),
            ('precision', precision_score),
            ('recall', recall_score),
        )
        metrics = {label: scorer(y_test, y_pred) for label, scorer in scorers}

        # Push everything to MLflow
        mlflow.log_metric("cv_f1_mean", cv_mean)
        mlflow.log_metric("cv_f1_std", cv_std)
        for label, value in metrics.items():
            mlflow.log_metric(f"test_{label}", value)
        mlflow.sklearn.log_model(model, "model")

        # Human-readable summary
        logger.info(f"\nResults for {feature_set_name}:")
        logger.info(f"CV F1-Score: {cv_mean:.4f} (+/- {cv_std*2:.4f})")
        logger.info("Test Metrics:")
        for label, value in metrics.items():
            logger.info(f"{label}: {value:.4f}")

        return model, metrics

def main():
    """Run the feature-engineering experiment once per feature set."""
    logger.info("Loading data...")
    raw = load_base_data()

    logger.info("Engineering features...")
    df_engineered = engineer_features(raw)

    # The target stays inside df_engineered; each run selects its own columns
    target = df_engineered['sought_treatment']

    for set_name, features in create_feature_sets().items():
        logger.info(f"\nRunning experiment with {set_name}")
        run_experiment(df_engineered, target, set_name, features)

if __name__ == "__main__":
    main()

In [None]:
#Experiment 4
import os
import pandas as pd
import numpy as np
import sqlite3
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import mlflow
import mlflow.sklearn
import logging
import seaborn as sns
import matplotlib.pyplot as plt

# Set up logging (done first so the credential check below can report problems)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up MLflow
# SECURITY: the tracking-server access token must not be committed with the
# notebook. Export MLFLOW_TRACKING_PASSWORD in the environment before running
# this cell, and revoke any token that was previously checked in.
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'ashiashish100')
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    logger.warning(
        "MLFLOW_TRACKING_PASSWORD is not set; authenticated requests to the "
        "MLflow tracking server will be rejected"
    )
mlflow.set_tracking_uri("https://dagshub.com/ashiashish100/my-first-repo.mlflow")
mlflow.set_experiment("mental_health_feature_selection")

def load_data():
    """Load the joined survey table from ``mental_health_final.db``.

    Every row of ``employees`` is kept; the other four tables are LEFT JOINed
    on ``employee_id``, so their columns are NULL/NaN where no match exists.

    Returns:
        pandas.DataFrame with the 13 columns named in the SELECT list.
    """
    conn = sqlite3.connect('mental_health_final.db')
    query = """
    SELECT 
        e.age,
        e.gender,
        e.country,
        emp.company_size,
        emp.is_tech_company,
        emp.work_remotely,
        mhb.has_mental_health_benefits,
        mhh.current_disorder,
        mhh.sought_treatment,
        wc.discuss_with_supervisor,
        wc.discuss_with_coworkers,
        wc.observed_negative_consequences,
        wc.interferes_with_work
    FROM employees e
    LEFT JOIN employment emp ON e.employee_id = emp.employee_id
    LEFT JOIN mental_health_benefits mhb ON e.employee_id = mhb.employee_id
    LEFT JOIN mental_health_history mhh ON e.employee_id = mhh.employee_id
    LEFT JOIN workplace_communication wc ON e.employee_id = wc.employee_id
    """
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Release the database handle even when the query fails.
        conn.close()

def preprocess_data(df):
    """Return a copy of ``df`` with missing values imputed.

    ``int64``/``float64`` columns are filled with their median and ``object``
    columns with the literal ``'unknown'``; the input frame is not modified.
    """
    cleaned = df.copy()

    # Decide the column groups up front, before any filling happens
    number_names = list(cleaned.select_dtypes(include=['int64', 'float64']).columns)
    text_names = list(cleaned.select_dtypes(include=['object']).columns)

    for name in number_names:
        column = cleaned[name]
        cleaned[name] = column.fillna(column.median())

    for name in text_names:
        cleaned[name] = cleaned[name].fillna('unknown')

    return cleaned

def correlation_based_selection(X, y, threshold=0.1):
    """Select features by absolute Pearson correlation with the target.

    Numeric (``int64``/``float64``) columns are kept when their absolute
    correlation with ``y`` exceeds ``threshold``. All ``object`` columns are
    always kept, because a correlation is not computed for them. If nothing
    at all was selected, the five most correlated numeric columns are used.

    Args:
        X: feature DataFrame.
        y: target values; must be numeric for ``np.corrcoef`` to work.
        threshold: minimum absolute correlation for a numeric column.

    Returns:
        (selected_features, correlations): the selected column names without
        duplicates, numeric columns first in descending correlation order and
        then the text columns; and the ``(column, abs_corr)`` pairs sorted by
        descending correlation.
    """
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X.select_dtypes(include=['object']).columns

    correlations = []
    for col in numeric_cols:
        corr = abs(np.corrcoef(X[col], y)[0, 1])
        # A constant column has undefined (NaN) correlation; treat it as
        # uncorrelated so the sort below is well defined.
        if np.isnan(corr):
            corr = 0.0
        correlations.append((col, corr))

    # Sort by correlation, strongest first
    correlations.sort(key=lambda pair: pair[1], reverse=True)

    # Numeric columns above the threshold, then every text column
    selected_features = [col for col, corr in correlations if corr > threshold]
    selected_features.extend(categorical_cols)

    # Ensure at least some features are selected
    if not selected_features:
        selected_features = [col for col, _ in correlations[:5]]

    # De-duplicate while keeping a deterministic order. A plain set() would
    # reorder the columns differently from run to run (string hash
    # randomisation), changing the logged feature list.
    selected_features = list(dict.fromkeys(selected_features))

    logger.info(f"Number of features selected: {len(selected_features)}")
    logger.info(f"Selected features: {selected_features}")

    return selected_features, correlations

def variance_based_selection(X, threshold=0.01):
    """Keep the numeric columns whose variance passes ``threshold``.

    Only ``int64``/``float64`` columns are considered; other columns are
    never returned.

    Returns:
        (selected_features, selector): the surviving column names and the
        fitted ``VarianceThreshold`` instance.
    """
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

    # fit() returns the selector itself, so construct and fit in one step
    selector = VarianceThreshold(threshold=threshold).fit(X[numeric_cols])

    keep_mask = selector.get_support()
    selected_features = list(numeric_cols[keep_mask])

    return selected_features, selector

def importance_based_selection(X, y, n_features=10):
    """Rank features by random-forest importance and keep the top ones.

    ``object`` columns are replaced by their integer category codes on a copy
    of ``X`` before a 100-tree random forest (``random_state=42``) is fitted.

    Returns:
        (selected_features, importances): the names of the ``n_features``
        most important columns, and all ``(column, importance)`` pairs sorted
        by descending importance.
    """
    encoded = X.copy()

    # Integer-encode text columns so the forest can consume them
    for name in encoded.select_dtypes(include=['object']).columns:
        encoded[name] = pd.Categorical(encoded[name]).codes

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(encoded, y)

    # Pair each column with its score and rank, strongest first
    importances = sorted(
        zip(X.columns, forest.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )

    top_ranked = importances[:n_features]
    selected_features = [name for name, _ in top_ranked]

    logger.info(f"Top {n_features} important features:")
    for name, score in top_ranked:
        logger.info(f"{name}: {score:.4f}")

    return selected_features, importances

def run_experiment(X, y, selected_features, selection_method):
    """Train and evaluate logistic regression on a selected feature subset.

    Returns ``(None, None)`` without starting an MLflow run when
    ``selected_features`` is empty. Otherwise an MLflow run named
    ``feature_selection_<selection_method>`` is opened and the parameters,
    5-fold CV F1, held-out test metrics and fitted model are logged. Any
    exception raised inside the run is logged and turned into
    ``(None, None)``; a failing cross-validation alone is logged and replaced
    by a single score of 0.

    Returns:
        (model, metrics) on success, ``(None, None)`` otherwise.
    """
    # Nothing to train on
    if not selected_features:
        logger.error(f"No features selected for {selection_method}")
        return None, None

    with mlflow.start_run(run_name=f"feature_selection_{selection_method}"):
        try:
            # Record what was selected
            mlflow.log_param("selection_method", selection_method)
            mlflow.log_param("n_selected_features", len(selected_features))
            mlflow.log_param("selected_features", ", ".join(selected_features))

            # Stratified 80/20 hold-out split
            X_selected = X[selected_features]
            X_train, X_test, y_train, y_test = train_test_split(
                X_selected, y, test_size=0.2, random_state=42, stratify=y
            )

            # Only register a transformer when it has columns to work on
            numeric_features = X_selected.select_dtypes(include=['int64', 'float64']).columns
            categorical_features = X_selected.select_dtypes(include=['object']).columns
            candidate_steps = [
                ('num', StandardScaler(), numeric_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ]
            transformers = [step for step in candidate_steps if len(step[2]) > 0]

            model = Pipeline([
                ('preprocessor', ColumnTransformer(transformers=transformers)),
                ('classifier', LogisticRegression(random_state=42)),
            ])

            # Cross-validation is best-effort: a failure is logged, not fatal
            try:
                cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
            except Exception as cv_error:
                logger.error(f"Cross-validation failed: {str(cv_error)}")
                cv_scores = np.array([0])
            cv_mean = cv_scores.mean()
            cv_std = cv_scores.std()

            # Final fit and held-out predictions
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            scorers = (
                ('accuracy', accuracy_score),
                ('f1_score', f1_score),
                ('precision', precision_score),
                ('recall', recall_score),
            )
            metrics = {label: scorer(y_test, y_pred) for label, scorer in scorers}

            # Push metrics and model to MLflow
            mlflow.log_metric("cv_f1_mean", cv_mean)
            mlflow.log_metric("cv_f1_std", cv_std)
            for label, value in metrics.items():
                mlflow.log_metric(f"test_{label}", value)
            mlflow.sklearn.log_model(model, "model")

            # Human-readable summary
            logger.info(f"\nResults for {selection_method}:")
            logger.info(f"Number of selected features: {len(selected_features)}")
            logger.info(f"Selected features: {selected_features}")
            logger.info(f"CV F1-Score: {cv_mean:.4f} (+/- {cv_std*2:.4f})")
            logger.info("Test Metrics:")
            for label, value in metrics.items():
                logger.info(f"{label}: {value:.4f}")

            return model, metrics

        except Exception as run_error:
            logger.error(f"Error in experiment {selection_method}: {str(run_error)}")
            return None, None

def plot_feature_importance(importances, title="Feature Importance"):
    """Save a horizontal bar chart of feature importances.

    Args:
        importances: iterable of ``(feature, importance)`` pairs.
        title: chart title.

    Returns:
        The path of the written image, ``'feature_importance.png'``.
    """
    fig = plt.figure(figsize=(10, 6))
    try:
        importance_df = pd.DataFrame(importances, columns=['feature', 'importance'])
        sns.barplot(x='importance', y='feature', data=importance_df)
        plt.title(title)
        plt.tight_layout()
        plt.savefig('feature_importance.png')
    finally:
        # Release the figure; otherwise every call leaves one open in pyplot
        plt.close(fig)
    return 'feature_importance.png'

def main():
    """Compare correlation-, variance- and importance-based feature selection."""
    logger.info("Loading data...")
    df = preprocess_data(load_data())

    # Split the frame into features and target
    y = df['sought_treatment']
    X = df.drop('sought_treatment', axis=1)

    # 1. Correlation-based selection
    logger.info("\nPerforming correlation-based selection...")
    corr_features, _ = correlation_based_selection(X, y, threshold=0.3)
    run_experiment(X, y, corr_features, "correlation_based")

    # 2. Variance-based selection
    logger.info("\nPerforming variance-based selection...")
    var_features, _ = variance_based_selection(X, threshold=0.01)
    run_experiment(X, y, var_features, "variance_based")

    # 3. Importance-based selection
    logger.info("\nPerforming importance-based selection...")
    imp_features, importances = importance_based_selection(X, y, n_features=10)

    # The importance chart gets its own MLflow run
    with mlflow.start_run(run_name="feature_importance_analysis"):
        mlflow.log_artifact(plot_feature_importance(importances))

    run_experiment(X, y, imp_features, "importance_based")

if __name__ == "__main__":
    main()

In [None]:
#Experiment 5
import os
import pandas as pd
import numpy as np
import sqlite3
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import mlflow
import mlflow.sklearn
import logging
import matplotlib.pyplot as plt
import seaborn as sns

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up MLflow tracking
# SECURITY: the tracking-server access token must not be committed with the
# notebook. Export MLFLOW_TRACKING_PASSWORD in the environment before running
# this cell, and revoke any token that was previously checked in.
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'ashiashish100')
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    logger.warning(
        "MLFLOW_TRACKING_PASSWORD is not set; authenticated requests to the "
        "MLflow tracking server will be rejected"
    )
mlflow.set_tracking_uri("https://dagshub.com/ashiashish100/my-first-repo.mlflow")
mlflow.set_experiment("mental_health_pca")

def load_data():
    """Read the joined survey table from ``mental_health_final.db``.

    ``employees`` is LEFT JOINed with the employment, benefits, history and
    workplace-communication tables on ``employee_id``. Errors are logged and
    re-raised; the connection is closed in every case.

    Returns:
        pandas.DataFrame with the 13 selected columns.
    """
    connection = sqlite3.connect('mental_health_final.db')
    try:
        query = """
        SELECT 
            e.age,
            e.gender,
            e.country,
            emp.company_size,
            emp.is_tech_company,
            emp.work_remotely,
            mhb.has_mental_health_benefits,
            mhh.current_disorder,
            mhh.sought_treatment,
            wc.discuss_with_supervisor,
            wc.discuss_with_coworkers,
            wc.observed_negative_consequences,
            wc.interferes_with_work
        FROM employees e
        LEFT JOIN employment emp ON e.employee_id = emp.employee_id
        LEFT JOIN mental_health_benefits mhb ON e.employee_id = mhb.employee_id
        LEFT JOIN mental_health_history mhh ON e.employee_id = mhh.employee_id
        LEFT JOIN workplace_communication wc ON e.employee_id = wc.employee_id
        """
        return pd.read_sql_query(query, connection)
    except Exception as load_error:
        # Log for the notebook output, then let the caller see the failure
        logger.error(f"Error loading data from database: {load_error}")
        raise
    finally:
        connection.close()

def standardize_gender(df):
    """Return a copy of ``df`` whose ``gender`` column is male/female/other.

    Each value is converted to text, stripped of surrounding whitespace and
    lower-cased before it is looked up in the mapping below. Anything that is
    not in the mapping, including missing values, becomes ``'other'``.
    """
    df = df.copy()

    # Spellings recognised after stripping and lower-casing
    gender_map = {
        'male': 'male',
        'm': 'male',
        'man': 'male',
        'cis male': 'male',
        'male ': 'male',
        'cisdude': 'male',
        'm|': 'male',
        'female': 'female',
        'f': 'female',
        'woman': 'female',
        'cis female': 'female',
        'female ': 'female'
    }

    def _standardize(value):
        # str() keeps this working for None/NaN and non-string values;
        # strip() lets ' Male' or 'F ' match their canonical spelling.
        return gender_map.get(str(value).strip().lower(), 'other')

    # Apply mapping and group all other values as 'other'
    df['gender'] = df['gender'].apply(_standardize)

    return df

def preprocess_data(df):
    """Return a cleaned copy of ``df``.

    The ``gender`` column is standardised first; afterwards missing values in
    ``int64``/``float64`` columns are replaced by the column median and
    missing values in ``object`` columns by ``'unknown'``.
    """
    # Standardize gender on a private copy so the caller's frame is untouched
    cleaned = standardize_gender(df.copy())

    # Decide the column groups up front, before any filling happens
    number_names = list(cleaned.select_dtypes(include=['int64', 'float64']).columns)
    text_names = list(cleaned.select_dtypes(include=['object']).columns)

    for name in number_names:
        column = cleaned[name]
        cleaned[name] = column.fillna(column.median())

    for name in text_names:
        cleaned[name] = cleaned[name].fillna('unknown')

    return cleaned

def create_scree_plot(explained_variance_ratio):
    """Write a scree plot to ``scree_plot.png`` and return that path.

    The blue curve shows the variance ratio of each principal component, the
    red curve the running total.
    """
    component_ids = range(1, len(explained_variance_ratio) + 1)
    running_total = np.cumsum(explained_variance_ratio)

    plt.figure(figsize=(10, 6))
    plt.plot(component_ids, explained_variance_ratio, 'bo-')
    plt.plot(component_ids, running_total, 'ro-')

    # Axis decoration
    plt.xlabel('Principal Component')
    plt.ylabel('Explained Variance Ratio')
    plt.title('Scree Plot')
    plt.xticks(component_ids)
    plt.legend(['Individual', 'Cumulative'])
    plt.grid(True)
    plt.tight_layout()

    # Save, then release the figure
    output_path = 'scree_plot.png'
    plt.savefig(output_path)
    plt.close()

    return output_path

def select_n_components(explained_variance_ratio, threshold=0.95):
    """Return the smallest number of components reaching ``threshold``.

    Args:
        explained_variance_ratio: per-component explained variance ratios, in
            component order.
        threshold: cumulative explained variance to reach.

    Returns:
        The 1-based count (a plain ``int``) of leading components whose
        cumulative ratio is at least ``threshold``. If the threshold is never
        reached, all components are returned.

    Raises:
        ValueError: if ``explained_variance_ratio`` is empty.
    """
    cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
    if cumulative_variance_ratio.size == 0:
        raise ValueError("explained_variance_ratio must contain at least one component")

    reached = np.flatnonzero(cumulative_variance_ratio >= threshold)
    if reached.size == 0:
        # argmax over an all-False mask would report index 0, i.e. a single
        # component; keep every component instead.
        return int(cumulative_variance_ratio.size)
    return int(reached[0]) + 1

def run_pca_experiment(X, y):
    """Run the PCA experiment with automatic component selection.

    Steps, all inside one MLflow run named ``pca_analysis``:
    preprocess ``X``; make a stratified 80/20 split; fit an exploratory PCA
    on the training part to draw the scree plot and pick the number of
    components covering 95% of the variance; then cross-validate, fit and
    evaluate a preprocessor -> PCA -> logistic-regression pipeline.

    The exploratory PCA is fitted on the training part only, so the held-out
    rows do not influence the chosen number of components.

    Returns:
        (pca_pipeline, metrics): the fitted Pipeline and a dict of test
        metrics.

    Raises:
        Any exception from the steps above, after it has been logged.
    """
    with mlflow.start_run(run_name="pca_analysis"):
        try:
            # Preprocess data first
            X = preprocess_data(X)

            # Create preprocessing pipeline for all features
            numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
            categorical_features = X.select_dtypes(include=['object']).columns

            # PCA needs dense input. The dense-output switch is called
            # `sparse_output` since scikit-learn 1.2 and the old `sparse`
            # keyword was removed in 1.4; support both spellings.
            try:
                encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
            except TypeError:
                encoder = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')

            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', StandardScaler(), numeric_features),
                    ('cat', encoder, categorical_features)
                ])

            # Split data before anything is fitted
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y
            )

            # Exploratory PCA over all components, training rows only
            X_train_preprocessed = preprocessor.fit_transform(X_train)
            pca = PCA()
            pca.fit(X_train_preprocessed)

            # Create and save scree plot
            scree_plot_path = create_scree_plot(pca.explained_variance_ratio_)
            mlflow.log_artifact(scree_plot_path)

            # Select number of components
            n_components = select_n_components(pca.explained_variance_ratio_)
            logger.info(f"Selected {n_components} components explaining 95% of variance")

            # Log PCA information
            explained_total = np.sum(pca.explained_variance_ratio_[:n_components])
            mlflow.log_param("n_components", n_components)
            mlflow.log_metric("explained_variance_ratio", explained_total)

            # Create final pipeline with selected components
            pca_pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('pca', PCA(n_components=n_components)),
                ('classifier', LogisticRegression(random_state=42))
            ])

            # Perform cross-validation
            cv_scores = cross_val_score(pca_pipeline, X_train, y_train, cv=5, scoring='f1')

            # Train final model
            pca_pipeline.fit(X_train, y_train)
            y_pred = pca_pipeline.predict(X_test)

            # Calculate metrics
            metrics = {
                'accuracy': accuracy_score(y_test, y_pred),
                'f1_score': f1_score(y_test, y_pred),
                'precision': precision_score(y_test, y_pred),
                'recall': recall_score(y_test, y_pred)
            }

            # Log metrics
            mlflow.log_metric("cv_f1_mean", cv_scores.mean())
            mlflow.log_metric("cv_f1_std", cv_scores.std())
            for metric_name, metric_value in metrics.items():
                mlflow.log_metric(f"test_{metric_name}", metric_value)

            # Log model
            mlflow.sklearn.log_model(pca_pipeline, "model")

            # Create component analysis plot (first ten components)
            plt.figure(figsize=(12, 6))
            component_variance = pd.DataFrame({
                'Component': range(1, len(pca.explained_variance_ratio_) + 1),
                'Explained Variance': pca.explained_variance_ratio_
            })
            sns.barplot(data=component_variance.head(10),
                       x='Component', y='Explained Variance')
            plt.title('Top 10 Principal Components - Explained Variance')
            plt.tight_layout()
            plt.savefig('component_analysis.png')
            mlflow.log_artifact('component_analysis.png')
            plt.close()

            # Print results
            logger.info("\nPCA Results:")
            logger.info(f"Number of components selected: {n_components}")
            logger.info(f"Total explained variance: {explained_total:.4f}")
            logger.info(f"CV F1-Score: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")
            logger.info("Test Metrics:")
            for metric_name, metric_value in metrics.items():
                logger.info(f"{metric_name}: {metric_value:.4f}")

            return pca_pipeline, metrics

        except Exception as e:
            logger.error(f"An error occurred during PCA experiment: {e}")
            raise

def main():
    """Load the survey data and run the PCA experiment on it."""
    logger.info("Loading data...")
    df = load_data()

    # Split the frame into features and target
    y = df['sought_treatment']
    X = df.drop('sought_treatment', axis=1)

    logger.info("\nRunning PCA experiment...")
    run_pca_experiment(X, y)

if __name__ == "__main__":
    main()

In [None]:
#Experiment 6 & 7
import os
import pandas as pd
import numpy as np
import sqlite3
import logging
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, 
    precision_recall_fscore_support, 
    confusion_matrix, 
    classification_report
)

import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up MLflow tracking
# SECURITY: the tracking-server access token must not be committed with the
# notebook. Export MLFLOW_TRACKING_PASSWORD in the environment before running
# this cell, and revoke any token that was previously checked in.
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'ashiashish100')
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    logger.warning(
        "MLFLOW_TRACKING_PASSWORD is not set; authenticated requests to the "
        "MLflow tracking server will be rejected"
    )
mlflow.set_tracking_uri("https://dagshub.com/ashiashish100/my-first-repo.mlflow")
mlflow.set_experiment("mental_health_communication_factors")

def load_and_prepare_data():
    """Load the joined survey table and split it into features and target.

    ``employees`` is LEFT JOINed with the employment, benefits, history and
    workplace-communication tables on ``employee_id``. Errors are logged and
    re-raised; the connection is closed in every case.

    Returns:
        (X, y): the frame without ``sought_treatment``, and the
        ``sought_treatment`` column.
    """
    connection = sqlite3.connect('mental_health_final.db')

    try:
        # Comprehensive query to extract relevant features
        query = """
        SELECT 
            e.age,
            e.gender,
            e.country,
            emp.company_size,
            emp.is_tech_company,
            emp.work_remotely,
            mhb.has_mental_health_benefits,
            mhh.current_disorder,
            mhh.sought_treatment,
            wc.discuss_with_supervisor,
            wc.discuss_with_coworkers,
            wc.observed_negative_consequences,
            wc.interferes_with_work
        FROM employees e
        LEFT JOIN employment emp ON e.employee_id = emp.employee_id
        LEFT JOIN mental_health_benefits mhb ON e.employee_id = mhb.employee_id
        LEFT JOIN mental_health_history mhh ON e.employee_id = mhh.employee_id
        LEFT JOIN workplace_communication wc ON e.employee_id = wc.employee_id
        """

        frame = pd.read_sql_query(query, connection)

        # Target first, then everything else as features
        target = frame['sought_treatment']
        features = frame.drop('sought_treatment', axis=1)
        return features, target

    except Exception as load_error:
        logger.error(f"Data loading error: {load_error}")
        raise

    finally:
        connection.close()

def preprocess_data(X):
    """Return a cleaned copy of ``X``.

    The ``gender`` column is lower-cased and collapsed to male/female/other
    (unrecognised and missing values become ``'other'``). Afterwards missing
    values in ``int64``/``float64`` columns are replaced by the column median
    and missing values in ``object`` columns by ``'unknown'``.
    """
    # Spellings recognised after lower-casing
    gender_map = {
        'male': 'male', 'm': 'male', 'man': 'male',
        'cis male': 'male', 'male ': 'male',
        'female': 'female', 'f': 'female', 'woman': 'female',
        'cis female': 'female', 'female ': 'female'
    }

    def to_canonical(value):
        return gender_map.get(value, 'other')

    cleaned = X.copy()

    # Standardize gender column
    lowered = cleaned['gender'].str.lower()
    cleaned['gender'] = lowered.map(to_canonical)

    # Decide the column groups up front, before any filling happens
    number_names = list(cleaned.select_dtypes(include=['int64', 'float64']).columns)
    text_names = list(cleaned.select_dtypes(include=['object']).columns)

    # Fill numeric columns with median
    for name in number_names:
        column = cleaned[name]
        cleaned[name] = column.fillna(column.median())

    # Fill categorical columns with 'unknown'
    for name in text_names:
        cleaned[name] = cleaned[name].fillna('unknown')

    return cleaned

def create_communication_interaction_features(X):
    """Return a copy of ``X`` with two interaction columns added.

    ``communication_openness`` is the product of ``discuss_with_supervisor``
    and ``discuss_with_coworkers``; ``workplace_mental_health_impact`` is the
    product of ``interferes_with_work`` and
    ``observed_negative_consequences``.

    Inputs that are already numeric are used as they are. Text answers are
    encoded first: 'no'/'yes' as 0/1 and 'never'/'rarely'/'sometimes'/
    'often'/'always' as 0..4. Any other or missing answer counts as 0.
    Without this encoding every text answer would be coerced to 0 and both
    interaction columns would be constant.
    """
    df = X.copy()

    # Codes for the text answers used elsewhere in this notebook
    # NOTE(review): confirm these labels against the database contents
    label_codes = {
        'no': 0, 'yes': 1,
        'never': 0, 'rarely': 1, 'sometimes': 2, 'often': 3, 'always': 4,
    }

    # Ensure numeric conversion with error handling
    def safe_numeric_convert(series):
        try:
            numeric = pd.to_numeric(series, errors='coerce')
        except Exception as e:
            print(f"Conversion error: {e}")
            return np.zeros(len(series))
        # Where the value was not a number, fall back to the label code
        labels = series.astype(str).str.strip().str.lower().map(label_codes)
        numeric = numeric.where(numeric.notna(), labels)
        return numeric.fillna(0)

    # Interaction between discussing with supervisor and coworkers
    discuss_supervisor = safe_numeric_convert(df['discuss_with_supervisor'])
    discuss_coworkers = safe_numeric_convert(df['discuss_with_coworkers'])
    df['communication_openness'] = discuss_supervisor * discuss_coworkers

    # Workplace impact interaction
    interferes_work = safe_numeric_convert(df['interferes_with_work'])
    negative_consequences = safe_numeric_convert(df['observed_negative_consequences'])
    df['workplace_mental_health_impact'] = interferes_work * negative_consequences

    return df

def run_communication_experiment(X, y):
    """Train and evaluate a Random Forest on communication-related features.

    Inside a single MLflow run named ``communication_factors_analysis`` this:

    1. Cleans ``X`` with ``preprocess_data`` and adds interaction columns with
       ``create_communication_interaction_features``.
    2. Builds a pipeline (standard-scaled numeric columns, one-hot encoded
       object columns, balanced ``RandomForestClassifier``).
    3. Scores it with 5-fold stratified cross-validation (``scoring='f1'``).
    4. Fits on a stratified 80% split and evaluates on the remaining 20%
       using weighted precision/recall/F1.
    5. Logs metrics, a feature-importance plot, a confusion-matrix plot and
       the fitted model to MLflow.

    Side effects: writes ``feature_importances.png`` and
    ``confusion_matrix.png`` to the current working directory.

    Args:
        X: Feature DataFrame accepted by ``preprocess_data``.
        y: Target labels aligned with ``X``.
            NOTE(review): ``scoring='f1'`` presumes a binary target - confirm.

    Returns:
        tuple: ``(pipeline, metrics)`` where ``pipeline`` is the fitted
        scikit-learn ``Pipeline`` and ``metrics`` is a dict with keys
        ``'cv_f1_mean'``, ``'precision'``, ``'recall'`` and ``'f1_score'``.

    Raises:
        Exception: Any error raised during the experiment is logged and
            re-raised unchanged.
    """
    # Start MLflow run
    with mlflow.start_run(run_name="communication_factors_analysis"):
        try:
            # Preprocess data
            X_processed = preprocess_data(X)

            # Create interaction features
            X_with_interactions = create_communication_interaction_features(X_processed)

            # Identify feature types
            numeric_features = X_with_interactions.select_dtypes(include=['int64', 'float64']).columns
            categorical_features = X_with_interactions.select_dtypes(include=['object']).columns

            # Create preprocessing pipeline.
            # The encoder's `sparse=False` keyword was renamed to
            # `sparse_output` in scikit-learn 1.2 and removed in 1.4, so it is
            # no longer passed. `sparse_threshold=0` makes the
            # ColumnTransformer always return a dense array, which gives the
            # same dense output on both old and new releases.
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', StandardScaler(), numeric_features),
                    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
                ],
                sparse_threshold=0,
            )

            # Create full pipeline with Random Forest
            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('classifier', RandomForestClassifier(
                    n_estimators=100,
                    random_state=42,
                    class_weight='balanced'
                ))
            ])

            # Stratified K-Fold Cross-Validation
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

            # Perform cross-validation.
            # NOTE(review): this runs on the full dataset, including the rows
            # that are held out as the test set below.
            cv_scores = cross_val_score(pipeline, X_with_interactions, y, cv=cv, scoring='f1')

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X_with_interactions, y, test_size=0.2, stratify=y, random_state=42
            )

            # Fit the pipeline
            pipeline.fit(X_train, y_train)

            # Predictions
            y_pred = pipeline.predict(X_test)

            # Detailed metrics (weighted average across classes)
            precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

            # Confusion Matrix
            cm = confusion_matrix(y_test, y_pred)

            # Log metrics
            mlflow.log_metrics({
                'cv_f1_mean': cv_scores.mean(),
                'cv_f1_std': cv_scores.std(),
                'test_precision': precision,
                'test_recall': recall,
                'test_f1_score': f1
            })

            # Feature names in the order the ColumnTransformer emits them:
            # numeric columns first, then the expanded one-hot columns.
            feature_names = (
                list(numeric_features) +
                list(pipeline.named_steps['preprocessor']
                     .named_transformers_['cat']
                     .get_feature_names_out(categorical_features))
            )

            # Extract feature importances, most important first
            importances = pipeline.named_steps['classifier'].feature_importances_
            indices = np.argsort(importances)[::-1]

            # Plot feature importances
            plt.figure(figsize=(10, 6))
            plt.title("Feature Importances in Mental Health Treatment Seeking")
            plt.bar(range(len(importances)), importances[indices])
            plt.xticks(range(len(importances)),
                       [feature_names[i] for i in indices],
                       rotation=90)
            plt.tight_layout()
            plt.savefig('feature_importances.png')
            mlflow.log_artifact('feature_importances.png')

            # Confusion Matrix Visualization
            plt.figure(figsize=(8, 6))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
            plt.title('Confusion Matrix')
            plt.ylabel('True Label')
            plt.xlabel('Predicted Label')
            plt.tight_layout()
            plt.savefig('confusion_matrix.png')
            mlflow.log_artifact('confusion_matrix.png')

            # Log the model
            mlflow.sklearn.log_model(pipeline, "communication_factors_model")

            # Print and log detailed results
            logger.info("\nExperiment Results:")
            logger.info(f"Cross-Validation F1 Score: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")
            logger.info(f"Test Precision: {precision:.4f}")
            logger.info(f"Test Recall: {recall:.4f}")
            logger.info(f"Test F1 Score: {f1:.4f}")

            # Classification Report
            class_report = classification_report(y_test, y_pred)
            logger.info("\nClassification Report:\n" + class_report)

            return pipeline, {
                'cv_f1_mean': cv_scores.mean(),
                'precision': precision,
                'recall': recall,
                'f1_score': f1
            }

        except Exception as e:
            # Log the failure, then let it propagate to the caller.
            logger.error(f"Experiment failed: {e}")
            raise

def main():
    """Load the dataset and run the communication-factors experiment.

    Returns:
        tuple: The ``(model, metrics)`` pair produced by
        ``run_communication_experiment``.
    """
    logger.info("Loading and preparing data...")
    features, target = load_and_prepare_data()

    logger.info("\nRunning communication factors experiment...")
    fitted_model, experiment_metrics = run_communication_experiment(features, target)

    return fitted_model, experiment_metrics

# Entry point: run main() only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
# F1 score plots to compare and determine the best model
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

def plot_f1_scores(results):
    """Plot and summarise cross-validation and test F1 scores per scaler.

    Draws a 2x2 figure and saves it as ``f1_score_comparison.png``:

    1. Box plot of the per-fold CV F1 scores.
    2. Bar chart of mean CV F1 with standard-deviation error bars.
    3. Line plot of the F1 score in each CV fold.
    4. Grouped bars comparing mean CV F1 against the test F1.

    It then prints a per-scaler summary and the entry with the highest mean
    CV F1 score.

    Args:
        results: Mapping of scaler name (str) to a dict with the keys
            ``'cv_scores'`` (sequence of per-fold F1 scores) and
            ``'test_score'`` (float). Any number of entries is supported.

    Raises:
        ValueError: If ``results`` is empty.
    """
    if not results:
        # Fail before any file is written instead of deep inside max() below.
        raise ValueError("results must contain at least one scaler entry")

    scaler_names = list(results)
    # np.asarray lets callers pass plain lists as well as NumPy arrays.
    fold_scores = {name: np.asarray(results[name]['cv_scores']) for name in scaler_names}
    cv_means = [fold_scores[name].mean() for name in scaler_names]
    cv_stds = [fold_scores[name].std() for name in scaler_names]
    test_scores = [results[name]['test_score'] for name in scaler_names]

    def label_bars(bars):
        # Write each bar's height just above it.
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     f'{height:.3f}',
                     ha='center', va='bottom')

    # Create figure with subplots
    plt.figure(figsize=(20, 15))

    # 1. Box plot of CV F1 scores
    plt.subplot(2, 2, 1)
    cv_data = []
    labels = []
    for name in scaler_names:
        cv_data.extend(fold_scores[name])
        labels.extend([name] * len(fold_scores[name]))

    cv_df = pd.DataFrame({
        'Scaler': labels,
        'F1 Score': cv_data
    })

    sns.boxplot(x='Scaler', y='F1 Score', data=cv_df, palette='Set2')
    plt.title('Distribution of CV F1 Scores by Scaler', fontsize=14)
    plt.xticks(rotation=45)

    # 2. Bar plot with error bars
    plt.subplot(2, 2, 2)
    bars = plt.bar(scaler_names, cv_means, yerr=cv_stds, capsize=5,
                   color=['lightblue', 'lightgreen', 'salmon'])
    plt.title('Mean CV F1 Scores with Standard Deviation', fontsize=14)
    plt.xlabel('Scaler Type', fontsize=12)
    plt.ylabel('F1 Score', fontsize=12)
    plt.xticks(rotation=45)
    label_bars(bars)

    # 3. Line plot showing CV scores across folds
    plt.subplot(2, 2, 3)
    markers = ['o', 's', '^']
    colors = ['blue', 'green', 'red']
    for position, name in enumerate(scaler_names):
        scores = fold_scores[name]
        # Cycle through the styles so entries beyond the third are still
        # drawn rather than silently dropped.
        plt.plot(range(1, len(scores) + 1),
                 scores,
                 marker=markers[position % len(markers)],
                 color=colors[position % len(colors)],
                 label=name,
                 linewidth=2)

    plt.title('F1 Scores Across CV Folds', fontsize=14)
    plt.xlabel('Fold Number', fontsize=12)
    plt.ylabel('F1 Score', fontsize=12)
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)

    # 4. Combined performance plot
    plt.subplot(2, 2, 4)
    x = np.arange(len(scaler_names))
    width = 0.35

    bars1 = plt.bar(x - width/2, cv_means, width, label='CV Score', color='lightblue')
    bars2 = plt.bar(x + width/2, test_scores, width, label='Test Score', color='lightgreen')

    plt.errorbar(x - width/2, cv_means, yerr=cv_stds, fmt='none', color='black', capsize=5)

    # Add value labels
    label_bars(bars1)
    label_bars(bars2)

    plt.xticks(x, scaler_names, rotation=45)
    plt.title('CV vs Test Performance Comparison', fontsize=14)
    plt.xlabel('Scaler Type', fontsize=12)
    plt.ylabel('F1 Score', fontsize=12)
    plt.legend()

    plt.tight_layout()
    plt.savefig('f1_score_comparison.png', dpi=300, bbox_inches='tight')

    # Print summary statistics
    print("\nPerformance Summary:")
    print("-" * 50)
    for name, mean, std, test_score in zip(scaler_names, cv_means, cv_stds, test_scores):
        print(f"\n{name.upper()} Scaler:")
        print(f"CV F1 Score: {mean:.4f} ± {std:.4f}")
        print(f"Test F1 Score: {test_score:.4f}")

    # Determine best model (highest mean CV F1; first entry wins ties)
    best_name = max(scaler_names, key=lambda name: fold_scores[name].mean())
    print("\nBest Model:")
    print("-" * 50)
    print(f"Scaler: {best_name}")
    print(f"CV F1 Score: {fold_scores[best_name].mean():.4f} ± {fold_scores[best_name].std():.4f}")
    print(f"Test F1 Score: {results[best_name]['test_score']:.4f}")

def main():
    """Compare the 'standard', 'minmax' and 'log' pipelines by F1 score.

    Loads the data via ``load_and_preprocess_data``, holds out a stratified
    20% test split, and for each scaler name builds a pipeline with
    ``create_pipeline``, scores it with 5-fold shuffled cross-validation on
    the training split, refits on the whole training split and scores the
    test split. The collected results are handed to ``plot_f1_scores``.
    """
    data = load_and_preprocess_data()

    # Separate the predictors from the target column
    features = data.drop('sought_treatment', axis=1)
    target = data['sought_treatment']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    def evaluate(scaler_name):
        # Build, cross-validate, refit and test one scaler configuration.
        model = create_pipeline(scaler_name)

        folds = KFold(n_splits=5, shuffle=True, random_state=42)
        fold_scores = cross_val_score(model, X_train, y_train, cv=folds, scoring='f1')

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        return {
            'cv_scores': fold_scores,
            'test_score': f1_score(y_test, predictions),
            'pipeline': model
        }

    results = {name: evaluate(name) for name in ['standard', 'minmax', 'log']}

    # Create plots and print summary
    plot_f1_scores(results)

# Entry point: run main() only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()