In [2]:
import os
import json
import gzip
import numpy as np
from parameters import result_folder, all_assignees, num_assignees, projects, num_topics
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

In [5]:
# Maps "<project>_<n>_assignees" -> {"optimal_topics": ..., "accuracy": ...}
# for every configuration whose results file yielded at least one accuracy.
optimal_num_topics = {}

print("--- Starting Analysis ---")

# Without the results folder there is nothing to read and nowhere to write the
# summary, so report the problem and do nothing else.
if not os.path.isdir(result_folder):
    print(f"[ERROR] The result folder was not found at: '{result_folder}'")
    print("Please verify the path and try again.")
else:
    for project in projects:
        # FLINK is analysed for every assignee count in `all_assignees`;
        # every other project only for the single default `num_assignees`.
        assignee_counts = all_assignees if project == "FLINK" else [num_assignees]

        for n_assignees in assignee_counts:
            # Identifier used both as the summary key and inside the file name.
            config_key = f"{project}_{n_assignees}_assignees"
            file_name = f"5_{config_key}_results.json.gz"
            file_path = os.path.join(result_folder, file_name)

            print("\n=================================================")
            print(f"Processing Configuration: {config_key}")
            print("=================================================")

            if not os.path.exists(file_path):
                print(f"[Warning] Results file not found, skipping: {file_path}")
                continue

            # Best effort per configuration: any failure while reading or
            # scoring one file is reported and the loop moves on to the next.
            try:
                with gzip.open(file_path, 'rt', encoding='utf-8') as infile:
                    results = json.load(infile)

                y_classes = results.get("classes")
                y_test = results.get("y_test")

                if y_classes is None or y_test is None:
                    print(f"  [Error] 'classes' or 'y_test' data missing in {file_name}. Skipping.")
                    continue

                # Fit the binarizer once on the stored class list and reuse it
                # for the true labels and for every set of predictions.
                label_binarizer = LabelBinarizer().fit(y_classes)
                y_true_binarized = label_binarizer.transform(y_test)

                # Accuracy per topic count, only for topic counts that actually
                # have SVM predictions in the results file.
                topic_accuracies = {}

                for n_topics in num_topics:
                    # Only the lookup is guarded: a KeyError raised later by the
                    # scoring code must not be mistaken for missing data.
                    try:
                        y_pred = results[str(n_topics)]["SVM"]["topics"]["y_pred"]
                    except KeyError:
                        # Missing predictions are reported but NOT recorded as a
                        # 0.0 accuracy; otherwise a file without any predictions
                        # would still produce a bogus "optimal" entry.
                        print(f"  - Topics: {n_topics:<3} | [Data not found]")
                        continue

                    y_pred_binarized = label_binarizer.transform(y_pred)

                    # Coerce to a built-in float so the value is always
                    # serialisable by json.dump below.
                    accuracy = float(accuracy_score(y_true_binarized, y_pred_binarized))
                    topic_accuracies[n_topics] = accuracy

                    print(f"  - Topics: {n_topics:<3} | Accuracy: {accuracy:.4f}")

                if not topic_accuracies:
                    print("[Warning] No accuracies were calculated. Cannot determine optimum.")
                    continue

                # max() returns the first key with the highest accuracy, so ties
                # go to the topic count that appears earliest in `num_topics`.
                optimal_topic = max(topic_accuracies, key=topic_accuracies.get)
                optimal_accuracy = topic_accuracies[optimal_topic]

                # Store both the topic count and the accuracy it achieved.
                optimal_num_topics[config_key] = {
                    "optimal_topics": optimal_topic,
                    "accuracy": optimal_accuracy
                }

                print("-------------------------------------------------")
                print(f"Optimal for {config_key}: {optimal_topic} topics (Accuracy: {optimal_accuracy:.4f})")
                print("-------------------------------------------------")

            except Exception as e:
                print(f"[ERROR] An unexpected error occurred while processing {file_name}: {e}")

    # --- Save the Final Summary -----------------------------------------------
    # Written once after all configurations, even if the summary is empty.
    output_filename = os.path.join(result_folder, "6_optimal_num_topics.json")
    try:
        with open(output_filename, 'w', encoding='utf-8') as outfile:
            json.dump(optimal_num_topics, outfile, indent=4)
        print(f"\nAnalysis complete. Summary of optimal parameters saved to:\n{output_filename}")
    except Exception as e:
        print(f"\n[ERROR] Could not write the summary output file: {e}")

--- Starting Analysis ---

Processing Configuration: AMBARI_5_assignees
  - Topics: 4   | Accuracy: 0.3250
  - Topics: 6   | Accuracy: 0.3750
  - Topics: 8   | Accuracy: 0.3375
  - Topics: 10  | Accuracy: 0.3875
  - Topics: 12  | Accuracy: 0.3375
  - Topics: 14  | Accuracy: 0.3375
  - Topics: 16  | Accuracy: 0.3125
  - Topics: 18  | Accuracy: 0.3375
  - Topics: 20  | Accuracy: 0.3750
  - Topics: 25  | Accuracy: 0.3250
  - Topics: 30  | Accuracy: 0.3375
  - Topics: 40  | Accuracy: 0.4375
  - Topics: 50  | Accuracy: 0.4875
  - Topics: 60  | Accuracy: 0.4125
  - Topics: 70  | Accuracy: 0.4250
  - Topics: 80  | Accuracy: 0.4250
  - Topics: 90  | Accuracy: 0.4500
  - Topics: 100 | Accuracy: 0.3750
  - Topics: 110 | Accuracy: 0.4375
  - Topics: 120 | Accuracy: 0.4125
  - Topics: 130 | Accuracy: 0.3750
  - Topics: 140 | Accuracy: 0.4625
  - Topics: 150 | Accuracy: 0.3875
  - Topics: 160 | Accuracy: 0.3875
  - Topics: 170 | Accuracy: 0.4750
-------------------------------------------------
Opt