In [None]:
def get_model_comparison_table(experiment_name="Telecom_customer_churn", tag_filters=None):
    """
    Build a comparison table of model performance metrics for the runs of an
    MLflow experiment, optionally restricted to runs carrying given tags.

    Parameters:
    - experiment_name (str): The name of the MLflow experiment to retrieve the model runs from.
      Default is "Telecom_customer_churn".
    - tag_filters (dict): A dictionary of tags to filter runs by, e.g., {"feature_selection": "SelectKBest"}.
      A run must match every tag to be kept. None or an empty dict disables filtering.

    Returns:
    - pd.DataFrame: One row per matching run, with "model_name" first followed by the
      selected metric columns. Metrics a run did not log are left empty. When no run
      matches, an empty DataFrame with the same columns is returned.

    Raises:
    - ValueError: If no experiment named `experiment_name` exists.
    """
    # get_experiment_by_name yields None for an unknown experiment; fail with a
    # clear message instead of an AttributeError on `.experiment_id`.
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        raise ValueError(f"MLflow experiment '{experiment_name}' was not found.")

    # Retrieve all runs in the experiment
    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

    # Filter runs based on tag filters, if specified
    if tag_filters:
        for tag_key, tag_value in tag_filters.items():
            tag_column = f"tags.{tag_key}"
            if tag_column not in runs.columns:
                # No run carries this tag at all, so nothing can match.
                runs = runs.iloc[0:0]
                break
            runs = runs[runs[tag_column] == tag_value]

    # Define the metrics you want to include in the comparison
    metrics_to_include = [
        "accuracy", "accuracy_cv", "precision_yes", "recall_yes", "precision_no", "recall_no",
        "roc_auc_val", "roc_auc_cv", "f1_score_no", "f1_score_yes",
        "weighted_avg_precision", "weighted_avg_recall", "weighted_avg_f1_score"
    ]

    # Collect one record per run: the run name plus each requested metric
    model_data = []
    for _, run in runs.iterrows():
        record = {"model_name": run.get("tags.mlflow.runName", "Unknown Model")}
        for metric in metrics_to_include:
            record[metric] = run.get(f"metrics.{metric}", None)
        model_data.append(record)

    # Passing `columns` fixes the column order (model name first) and keeps the
    # expected columns even when there are no rows.
    return pd.DataFrame(model_data, columns=["model_name"] + metrics_to_include)


In [None]:
# Log the current pipeline as the Gradient Boosting run, tagged as using SelectKBest.
log_pipeline_results(
    pipeline,
    X_train,
    y_train,
    X_val,
    y_val,
    run_name="Gradient_Boosting_Model_K_Best_features",
    tags={"feature_selection": "SelectKBest"},
)

In [None]:
def log_pipeline_results(pipeline, X_train, y_train, X_val, y_val, run_name, cv_folds=5, tags=None):
    """
    Evaluate a pipeline and record its configuration and scores in a single MLflow run.

    The pipeline is scored with stratified cross-validation on the training data,
    then fitted on the full training set and scored on the validation set. Tags,
    metrics, classifier parameters and the fitted pipeline are all logged to MLflow.

    Parameters:
    - pipeline: The sklearn pipeline, already configured with preprocessing and a
      step named 'classifier'.
    - X_train, y_train: Training data and labels.
    - X_val, y_val: Validation data and labels.
    - run_name: Name of the MLflow run for easy identification.
    - cv_folds: Number of folds for cross-validation.
    - tags (dict): Optional dictionary of tags to attach to the run, e.g., {"feature_selection": "SelectKBest"}.
      When None, the run is tagged {"feature_selection": "no_changes"}.

    Returns:
    - None. A one-line summary is printed once logging has finished.
    """
    # Runs without explicit tags are marked as baseline runs
    run_tags = {"feature_selection": "no_changes"} if tags is None else tags

    with mlflow.start_run(run_name=run_name):
        for key, value in run_tags.items():
            mlflow.set_tag(key, value)

        # Shared stratified splitter so both cross-validation passes use the same folds
        splitter = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        fold_accuracies = cross_val_score(pipeline, X_train, y_train, cv=splitter, scoring='accuracy')
        accuracy_cv = fold_accuracies.mean()

        # Out-of-fold probabilities; column 1 is treated as the positive ("Yes") class
        cv_proba = cross_val_predict(pipeline, X_train, y_train, cv=splitter, method="predict_proba")
        cv_positive_proba = cv_proba[:, 1]

        # Cross-validated ROC-AUC
        roc_auc_cv = roc_auc_score(y_train, cv_positive_proba)
        mlflow.log_metric("roc_auc_cv", roc_auc_cv)

        # Cross-validated area under the precision-recall curve
        pr_precision, pr_recall, _ = precision_recall_curve(y_train, cv_positive_proba, pos_label="Yes")
        pr_auc_cv = auc(pr_recall, pr_precision)
        mlflow.log_metric("pr_auc_cv", pr_auc_cv)

        # Train on the full training set, then score the validation set
        pipeline.fit(X_train, y_train)
        val_labels = pipeline.predict(X_val)
        val_positive_proba = pipeline.predict_proba(X_val)[:, 1]

        accuracy = accuracy_score(y_val, val_labels)
        roc_auc_val = roc_auc_score(y_val, val_positive_proba)
        report = classification_report(y_val, val_labels, output_dict=True)

        # Headline metrics
        headline_metrics = (
            ("accuracy", accuracy),
            ("roc_auc_val", roc_auc_val),
            ("accuracy_cv", accuracy_cv),
        )
        for metric_name, metric_value in headline_metrics:
            mlflow.log_metric(metric_name, metric_value)

        # (MLflow metric stem, classification-report field) pairs
        report_fields = (
            ("precision", "precision"),
            ("recall", "recall"),
            ("f1_score", "f1-score"),
        )

        # Per-class metrics, logged as e.g. "precision_no" / "recall_yes"
        for label in ("No", "Yes"):
            suffix = label.lower()
            for metric_stem, field in report_fields:
                mlflow.log_metric(f"{metric_stem}_{suffix}", report[label][field])

        # Averaged metrics, logged as e.g. "macro_avg_precision" / "weighted_avg_f1_score"
        for avg_type in ("macro avg", "weighted avg"):
            prefix = avg_type.replace(' ', '_')
            for metric_stem, field in report_fields:
                mlflow.log_metric(f"{prefix}_{metric_stem}", report[avg_type][field])

        # Classifier class name and its hyperparameters
        classifier = pipeline.named_steps['classifier']
        mlflow.log_param("model", type(classifier).__name__)
        for param_name, param_value in classifier.get_params().items():
            mlflow.log_param(param_name, param_value)

        # NOTE: feature-importance logging is currently disabled. Kept for reference:
        # if hasattr(classifier, 'feature_importances_'):
        #     feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
        #     feature_importances = classifier.feature_importances_
        #     importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
        #     mlflow.log_text(importance_df.to_string(), "feature_importances.txt")

        # Store the fitted pipeline itself as a model artifact
        mlflow.sklearn.log_model(pipeline, "pipeline_model")

        print(f"Logged {run_name} with validation accuracy: {accuracy * 100:.2f}%, ROC-AUC (CV): {roc_auc_cv:.4f}, and PR-AUC (CV): {pr_auc_cv:.4f}")

1. As can be seen, all the models demonstrate relatively high accuracy, indicating that they perform well overall. However, they struggle to accurately classify the "Yes" class, as shown by the lower precision, recall, and F1 scores for "Yes" across the board. This suggests that, while the models are good at identifying "No" cases, they have difficulty correctly identifying "Yes" cases, which is not ideal in our case, as the aim is to predict customer churn.
2. Summary of models:
The Logistic Regression model is the best overall, with high accuracy and balanced metrics, but the SVC is a strong alternative for slightly better churn detection, though with more false positives. Random Forest and Gradient Boosting show decent performance but miss more churn cases, and the Decision Tree is the least effective due to lower accuracy and poor recall for churn cases.
3. Feature importances: Random Forest, Gradient Boosting and Decision Tree have the attribute `feature_importances_`, which helps us understand which input features (variables) are most influential in a model's predictions. A text file with the feature importances can be found on the page of each of these three models in MLflow. Example: click on Gradient_Boosting_Model and then navigate to the Artifacts section (file name: feature_importances.txt). Here I will summarize the findings and compare them with my previous Chi-squared tests and Cramér's V to see whether they agree.
4. The most important features for predicting churn across models include tenure, monthly charges, fiber optic internet service, contract duration, and electronic check payment method, while moderately important features are online security, tech support, streaming services, paperless billing, and having dependents or a partner.
5. There is substantial overlap between the features identified as important by Chi-Square, Cramér's V, and model feature importance scores. Key features like contract, internet_service_Fiber_optic, and num_tenure consistently appear as strong predictors of churn, making them robust variables to focus on in your analysis and model training.

### Models with only the best features (SelectKBest)
1. Now I will run the same models using only the best features. Now we saw which features are important, but instead of explicitly selecting those features I will just add another step in the pipeline, which I will call feature_select, and I will do that with the function (Select K Best).
2. SelectKBest is a feature selection method in machine learning that helps reduce the number of input features by selecting only the most important ones based on a scoring function. It’s particularly useful for simplifying models, improving performance, and reducing overfitting, especially when dealing with high-dimensional data.
3. In the pipeline I will simply add another line of code under the preprocessor tuple: `('feature_selection', SelectKBest(k=10))`. This will select only the 10 most significant features to use when running the model.
4. When I log the run, I will add one more argument: `tags={"feature_selection": "SelectKBest"}`.
5. I will also comment out the section of my function that gets the feature importances, as we already know them and using SelectKBest makes it redundant.
6. I have decided to proceed with testing only Logistic Regression, SVC and Gradient Boosting, as they showed the best performance, and I want to see how these alterations influence their performance before I choose the best one.
7. Again, I will provide a table of the models' metrics. I will first try the models with only the most important features and then add the moderately important ones: SelectKBest with k = 6 and then k = 10. This is more than half the features, so let's see what happens.

In [None]:
# Log the current pipeline as the SVC balanced class-weight run.
# NOTE(review): the tag value equals its key ("class_weight"); presumably it is
# meant to record the class_weight setting that was used — confirm.
log_pipeline_results(
    pipeline,
    X_train,
    y_train,
    X_val,
    y_val,
    run_name="SVC_Model_Balanced_class_weight",
    tags={"class_weight": "class_weight"},
)

In [None]:
# Log the current pipeline as an SVC run tagged as using SelectKBest.
# (The original call had a stray closing brace after the tags dict, which made
# the cell a SyntaxError.)
# NOTE(review): run_name is the same as the class-weight run logged in another
# cell — confirm the two runs are meant to share a name.
log_pipeline_results(
    pipeline,
    X_train,
    y_train,
    X_val,
    y_val,
    run_name="SVC_Model_Balanced_class_weight",
    tags={"feature_selection": "SelectKBest"},
)