In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for file_path in (os.path.join(dirname, filename) for filename in filenames):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_6_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_test_10_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_130_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_distribution_12_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_test_25_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_distribution_20_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_distribution_60_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_test_distribution_8_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_60_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_distribution_25_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_test_50_topics.csv
/kaggle/input/task-assignin

In [2]:
# CELL 1: PARAMETERS

# Locations: the uploaded input dataset and the directory results are written to.
datafolder = "/kaggle/input/task-assigning-data/"
resultsdatafolder = "/kaggle/working/results/"

# Experiment grid.
num_assignees = 5
all_assignees = list(range(5, 25, 5))  # 5, 10, 15, 20
# Topic counts: 4..20 in steps of 2, then 25 and 30, then 40..170 in steps of 10.
num_topics = [*range(4, 21, 2), 25, 30, *range(40, 171, 10)]
min_assignees = 5
min_issues_per_assignee = 80

In [3]:
import os
# Show the top-level entries of the Kaggle input directory.
input_entries = os.listdir('/kaggle/input')
print(input_entries)

['task-assigning-data']


In [4]:
# CELL 2: OPTIONAL EXTRAS
# Kaggle usually has pandas, numpy, sklearn, etc. pre-installed.
!pip install seaborn  # Only if you get import errors for seaborn



In [5]:
# Step 1: Import required libraries
import os
import gzip
import json
import numpy as np
import cudf  # Use cuDF for GPU DataFrames
import cupy as cp  # Import cupy to handle GPU arrays
from cuml.feature_extraction.text import CountVectorizer, TfidfTransformer # Use cuML for TF-IDF
from cuml.model_selection import train_test_split
from cuml.naive_bayes import MultinomialNB # Use cuML Naive Bayes
from cuml.svm import LinearSVC # Use LinearSVC as recommended
from sklearn.calibration import CalibratedClassifierCV
import re
import gc

# GPU memory management function
def clear_gpu_memory():
    """Run Python garbage collection, then free the blocks held by CuPy's default memory pool."""
    gc.collect()
    default_pool = cp.get_default_memory_pool()
    default_pool.free_all_blocks()

def print_gpu_memory():
    """Print the used and total byte counts of CuPy's default memory pool, in MB."""
    bytes_per_mb = 1024**2
    pool = cp.get_default_memory_pool()
    used_mb = pool.used_bytes() / bytes_per_mb
    total_mb = pool.total_bytes() / bytes_per_mb
    print(f"GPU Memory used: {used_mb:.2f} MB")
    print(f"GPU Memory total: {total_mb:.2f} MB")

# Step 2: TF-IDF transformation function (updated with reduced dimensions)
def transform_with_tfidf(train_texts, test_texts):
    """
    Build dense float32 TF-IDF matrices for a train/test pair of text columns.

    A cuML CountVectorizer (max_features=500, min_df=10) and a
    TfidfTransformer are both fitted on ``train_texts`` only and then applied
    to ``test_texts``. The results are densified and cast to float32 so they
    can be fed to the downstream classifiers.

    Returns:
        tuple: ``(train_dense, test_dense)``.
    """
    counter = CountVectorizer(max_features=500, min_df=10)
    train_counts = counter.fit_transform(train_texts)
    test_counts = counter.transform(test_texts)

    weighter = TfidfTransformer()
    train_weighted = weighter.fit_transform(train_counts)
    test_weighted = weighter.transform(test_counts)

    # Densify and downcast both matrices in the same way.
    dense_pair = tuple(
        matrix.toarray().astype('float32')
        for matrix in (train_weighted, test_weighted)
    )

    clear_gpu_memory()

    return dense_pair


# Step 3: SVM classifier function (updated to use LinearSVC and process in batches)
def classify_with_svm(X_train, y_train, X_test, batch_size=1000):
    """
    Train a probability-calibrated LinearSVC and predict the test set in batches.

    The model is ``CalibratedClassifierCV(LinearSVC(max_iter=1000), cv=3)``.
    It is fitted on host (NumPy) copies of the training data; inputs that are
    not already ``np.ndarray`` are copied to the host with ``cp.asnumpy``.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        batch_size: Number of test rows scored per predict call. Must be > 0.

    Returns:
        tuple: ``(predictions, probabilities, classes)`` where ``predictions``
        and ``probabilities`` are Python lists with one entry per test row and
        ``classes`` is the fitted model's ``classes_`` as a list (the column
        order of ``probabilities``).

    Raises:
        ValueError: If ``batch_size`` is not positive.
        Exception: Any error raised during training or prediction is printed
            and re-raised. GPU memory is cleared on every exit path.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    try:
        # Calibrate on CPU using NumPy arrays
        X_train_cpu = X_train if isinstance(X_train, np.ndarray) else cp.asnumpy(X_train)
        y_train_cpu = y_train if isinstance(y_train, np.ndarray) else cp.asnumpy(y_train)

        # Only the calibrated model is fitted: it is the one used for every
        # prediction, so a separate un-calibrated fit would be wasted work.
        calibrated_clf = CalibratedClassifierCV(LinearSVC(max_iter=1000), cv=3)
        calibrated_clf.fit(X_train_cpu, y_train_cpu)

        X_test_cpu = X_test if isinstance(X_test, np.ndarray) else cp.asnumpy(X_test)
        n_samples = X_test_cpu.shape[0]

        predictions = []
        probabilities = []

        for i in range(0, n_samples, batch_size):
            batch = X_test_cpu[i:min(i + batch_size, n_samples)]
            batch_pred = calibrated_clf.predict(batch)
            batch_proba = calibrated_clf.predict_proba(batch)
            predictions.extend(batch_pred.tolist())
            probabilities.extend(batch_proba.tolist())

    except Exception as e:
        print(f"Error in SVM classification: {e}")
        raise
    finally:
        clear_gpu_memory()

    return predictions, probabilities, calibrated_clf.classes_.tolist()

# Step 4: Naive Bayes classifier function (updated with batch processing)
def classify_with_naive_bayes(X_train, y_train, X_test, batch_size=1000):
    """
    Fit a cuML MultinomialNB model and score the test rows chunk by chunk.

    Both feature matrices are cast to float32 before use. Returns a tuple
    ``(predictions, probabilities, classes)``: the first two are Python lists
    accumulated over all chunks, and ``classes`` is the fitted model's
    ``classes_`` as a list. Any exception is reported on stdout and re-raised;
    GPU memory is cleared on every exit path.
    """
    try:
        model = MultinomialNB()
        model.fit(X_train.astype('float32'), y_train)
        classes = model.classes_.tolist()

        # Clear GPU memory once training is done.
        clear_gpu_memory()

        test_features = X_test.astype('float32')
        total_rows = test_features.shape[0]

        predictions, probabilities = [], []

        for chunk_no, start in enumerate(range(0, total_rows, batch_size)):
            stop = min(start + batch_size, total_rows)
            chunk = test_features[start:stop]

            predictions += model.predict(chunk).tolist()
            probabilities += model.predict_proba(chunk).tolist()

            # Clear GPU memory on the first chunk and on every fifth chunk after it.
            if chunk_no % 5 == 0:
                clear_gpu_memory()

    except Exception as e:
        print(f"Error in Naive Bayes classification: {e}")
        raise
    finally:
        clear_gpu_memory()

    return predictions, probabilities, classes

# Step 5: Combine classifier predictions (remains the same)
def weighted_vote(weights, prob_lists, class_labels):
    """
    Combine several models' class probabilities with a weighted average.

    Args:
        weights: One numeric weight per entry of ``prob_lists``.
        prob_lists: Sequence of probability tables (nested lists or arrays),
            each indexed as ``[sample][class]``. All tables must have the
            same shape.
        class_labels: Sequence mapping a probability column index to its label.

    Returns:
        tuple: ``(predictions, weighted_probs)`` where ``predictions`` is a list
        holding the label of the highest averaged probability for each sample
        and ``weighted_probs`` is the averaged table as a nested list. Both are
        empty lists when the tables contain no samples.

    Raises:
        ValueError: If no tables are given, if the number of weights differs
            from the number of tables, if the tables differ in shape or are not
            two-dimensional, or if the weights sum to zero.
    """
    weights = list(weights)
    prob_arrays = [np.asarray(p) for p in prob_lists]

    if not prob_arrays:
        raise ValueError("prob_lists must contain at least one probability table")
    # zip() would silently drop the surplus entries while the normalisation
    # below still used every weight, so a mismatch is rejected up front.
    if len(weights) != len(prob_arrays):
        raise ValueError(
            f"got {len(weights)} weights for {len(prob_arrays)} probability tables"
        )

    table_shape = prob_arrays[0].shape
    if any(arr.shape != table_shape for arr in prob_arrays[1:]):
        raise ValueError("all probability tables must have the same shape")

    # No samples to vote on.
    if prob_arrays[0].size == 0:
        return [], []

    if prob_arrays[0].ndim != 2:
        raise ValueError("each probability table must be 2-D: [sample][class]")

    total_weight = sum(weights)
    if total_weight == 0:
        raise ValueError("weights must not sum to zero")

    weighted_probs = sum(w * p for w, p in zip(weights, prob_arrays)) / total_weight
    max_indices = np.argmax(weighted_probs, axis=1)
    predictions = [class_labels[idx] for idx in max_indices]

    return predictions, weighted_probs.tolist()

# Step 6: Main processing block
#
# For every directory under `datafolder` whose name starts with "4":
#   1. load the train/test labels,
#   2. for each topic count in `num_topics`, load the feature CSVs, build one
#      feature matrix per feature family, and train SVM and Naive Bayes on each,
#   3. combine per-feature probabilities into weighted-vote ensembles,
#   4. write everything to one gzipped JSON file in `resultsdatafolder`.
#
# The output file is opened only after all results for the folder have been
# computed and serialised, so a failure part-way through never leaves an empty
# or truncated .json.gz behind (and never clobbers a previous good result).
import time

print("Kaggle input data folder:", datafolder)
os.makedirs(resultsdatafolder, exist_ok=True)

if not os.path.exists(datafolder):
    print(f"Error: Data folder not found at {datafolder}")
else:
    # Get all folders to process
    folders_to_process = [f for f in os.listdir(datafolder)
                         if f.startswith("4") and os.path.isdir(os.path.join(datafolder, f))]

    # Process folders one at a time to manage memory
    for folder_idx, folder_name in enumerate(folders_to_process):
        print(f"\nProcessing folder {folder_idx + 1}/{len(folders_to_process)}: {folder_name}")
        print_gpu_memory()

        # The 2nd and 3rd "_"-separated fields are used as project name and
        # assignee count (e.g. "4_OAK_5_assignees_features" -> "OAK", "5").
        parts = folder_name.split("_")
        if len(parts) < 3:
            print(f"Unexpected folder name structure: {folder_name}")
            continue

        project, assignee_count = parts[1], parts[2]
        proj_path = os.path.join(datafolder, folder_name)
        output_path = os.path.join(resultsdatafolder, f"5_{project}_{assignee_count}_assignees_results.json.gz")

        try:
            output_data = {}

            # Load labels
            train_labels = cudf.read_csv(os.path.join(proj_path, "y_train.csv"), sep='\t').dropna(axis=1, how='all')
            test_labels = cudf.read_csv(os.path.join(proj_path, "y_test.csv"), sep='\t').dropna(axis=1, how='all')
            y_train, y_test = train_labels['assignee_id'], test_labels['assignee_id']

            # Save y_test for later
            output_data["y_test"] = y_test.to_pandas().tolist()

            # Process each topic number
            for topic_idx, topic_num in enumerate(num_topics):
                print(f"\n  Processing topic {topic_idx + 1}/{len(num_topics)}: {topic_num} topics")
                print_gpu_memory()

                # NOTE: integer keys are written as strings by json.dumps.
                output_data[topic_num] = {}

                try:
                    # Load data
                    X_train = cudf.read_csv(os.path.join(proj_path, f"X_train_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
                    X_train_dist = cudf.read_csv(os.path.join(proj_path, f"X_train_distribution_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
                    X_test = cudf.read_csv(os.path.join(proj_path, f"X_test_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
                    X_test_dist = cudf.read_csv(os.path.join(proj_path, f"X_test_distribution_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')

                    # Clean the list-like text columns: strip brackets, quotes and
                    # commas. A column that is missing is created as empty strings.
                    for col in ['labels', 'top_terms']:
                        for df in [X_train, X_test]:
                            if col in df.columns:
                                df[col] = df[col].astype(str).str.replace(r'[\[\]\',]', '', regex=True)
                            else:
                                df[col] = ''

                    X_train['labels_top_terms'] = X_train['labels'] + ' ' + X_train['top_terms']
                    X_test['labels_top_terms'] = X_test['labels'] + ' ' + X_test['top_terms']

                    # Clear memory before feature extraction
                    clear_gpu_memory()

                    # Extract features
                    print("    Extracting features...")
                    feature_sets = {}

                    # (feature name in the results, source column in the CSV)
                    feature_types = [
                        ('title', 'processed_title'),
                        ('description', 'processed_description'),
                        ('labels', 'processed_labels'),
                        ('top_terms', 'top_terms'),
                        ('labels_top_terms', 'labels_top_terms')
                    ]

                    # A text feature is skipped when its column is absent from
                    # either the train or the test frame.
                    for feature_name, column_name in feature_types:
                        if column_name in X_train.columns and column_name in X_test.columns:
                            train_feat, test_feat = transform_with_tfidf(X_train[column_name], X_test[column_name])
                            feature_sets[feature_name] = (train_feat, test_feat)
                            clear_gpu_memory()  # Clear after each feature extraction

                    # Add non-text features
                    feature_sets['topics'] = (X_train_dist.values, X_test_dist.values)
                    feature_sets['other'] = (X_train[['priority_id', 'type_id']].values, X_test[['priority_id', 'type_id']].values)

                    # Convert all features to float32
                    for key in feature_sets:
                        train_set, test_set = feature_sets[key]
                        if cp.sparse.issparse(train_set):
                            feature_sets[key] = (train_set.astype('float32'), test_set.astype('float32'))
                        else:
                            feature_sets[key] = (cp.asarray(train_set).astype('float32'), cp.asarray(test_set).astype('float32'))

                    # Process each classifier
                    for classifier_name, classifier_function in [("SVM", classify_with_svm), ("NaiveBayes", classify_with_naive_bayes)]:
                        print(f"    Training {classifier_name} for {project} with {assignee_count} assignees and {topic_num} topics")
                        output_data[topic_num][classifier_name] = {}

                        # Class order reported by the first feature set that trains successfully
                        classes = None

                        # Process each feature set
                        for feature_name, (train_set, test_set) in feature_sets.items():
                            print(f"      Processing feature: {feature_name}")
                            try:
                                preds, probs, class_list = classifier_function(train_set, y_train, test_set)
                                output_data[topic_num][classifier_name][feature_name] = {
                                    "y_pred": preds,
                                    "y_pred_proba": probs
                                }

                                if classes is None:
                                    classes = class_list
                                    if "classes" not in output_data:
                                        output_data["classes"] = classes

                                # Clear GPU memory after each feature classification
                                clear_gpu_memory()

                            except Exception as e:
                                # A failing feature is reported and skipped; the
                                # remaining features are still processed.
                                print(f"      Error processing {feature_name}: {e}")

                        # Create ensemble predictions if we have results
                        if classes is not None:
                            try:
                                # Probability tables in a fixed order; None marks a
                                # feature that was skipped or failed.
                                feature_order = ["title", "description", "labels", "top_terms", "labels_top_terms", "topics", "other"]
                                classifier_results = output_data[topic_num][classifier_name]
                                prob_groups = [
                                    classifier_results[feat]["y_pred_proba"] if feat in classifier_results else None
                                    for feat in feature_order
                                ]

                                # Weights, aligned index-for-index with feature_order
                                w = [0.6, 0.7, 0.5, 0.5, 0.5, 0.5, 0.1]

                                available = [i for i in range(7) if prob_groups[i] is not None]

                                # ensemble name -> (indices into feature_order, matching weights).
                                # "title_description_labels_top_terms" uses the combined
                                # labels_top_terms feature (index 4).
                                ensemble_configs = {
                                    "title_description": ([0, 1], [w[0], w[1]]),
                                    "title_description_labels": ([0, 1, 2], w[:3]),
                                    "title_description_labels_top_terms": ([0, 1, 4], [w[0], w[1], w[4]]),
                                    "title_description_topics": ([0, 1, 5], [w[0], w[1], w[5]]),
                                    "all": (available, [w[i] for i in available])
                                }

                                for ensemble_name, (indices, weights) in ensemble_configs.items():
                                    # An ensemble is produced only when every one of
                                    # its member features has probabilities.
                                    if not indices or any(prob_groups[i] is None for i in indices):
                                        continue

                                    combined_pred, combined_probs = weighted_vote(
                                        weights,
                                        [prob_groups[i] for i in indices],
                                        classes
                                    )
                                    output_data[topic_num][classifier_name][ensemble_name] = {
                                        "y_pred": combined_pred,
                                        "y_pred_proba": combined_probs
                                    }

                            except Exception as e:
                                print(f"      Error creating ensemble predictions: {e}")

                        # Clear memory after each classifier
                        clear_gpu_memory()

                    # Clear memory after processing all classifiers for this topic
                    clear_gpu_memory()

                except Exception as e:
                    # A failing topic is reported and skipped; its entry stays
                    # in output_data with whatever was stored before the error.
                    print(f"  Error processing topic {topic_num}: {e}")

            # Serialise first, then open the file: any error above (or while
            # serialising) leaves the results directory untouched.
            payload = json.dumps(output_data, sort_keys=False)
            with gzip.open(output_path, 'wt', encoding='utf-8') as gzfile:
                gzfile.write(payload)
            print(f"\nSaved results to: {output_path}")

        except Exception as e:
            print(f"Error processing folder {folder_name}: {e}")

        finally:
            # Clear all GPU memory after processing each folder
            clear_gpu_memory()
            print(f"Completed processing {folder_name}")
            print_gpu_memory()

            # Optional: Add a small delay to ensure resources are freed
            time.sleep(2)

print("\nAll processing complete!")

Kaggle input data folder: /kaggle/input/task-assigning-data/

Processing folder 1/19: 4_OAK_5_assignees_features
GPU Memory used: 0.00 MB
GPU Memory total: 0.00 MB

  Processing topic 1/25: 4 topics
GPU Memory used: 0.00 MB
GPU Memory total: 0.00 MB
    Extracting features...
    Training SVM for OAK with 5 assignees and 4 topics
      Processing feature: title
      Processing feature: description
      Processing feature: labels
      Processing feature: top_terms
      Processing feature: labels_top_terms
      Processing feature: topics
      Processing feature: other
    Training NaiveBayes for OAK with 5 assignees and 4 topics
      Processing feature: title
      Processing feature: description
      Processing feature: labels
      Processing feature: top_terms
      Processing feature: labels_top_terms
      Processing feature: topics
      Processing feature: other

  Processing topic 2/25: 6 topics
GPU Memory used: 0.00 MB
GPU Memory total: 0.00 MB
    Extracting features...

In [6]:
# # Step 1: Import required libraries
# import os
# import gzip
# import json
# import numpy as np
# import cudf  # Use cuDF for GPU DataFrames
# import cupy  # Import cupy to handle GPU arrays
# from cuml.feature_extraction.text import CountVectorizer, TfidfTransformer # Use cuML for TF-IDF
# from cuml.model_selection import train_test_split
# from cuml.naive_bayes import MultinomialNB # Use cuML Naive Bayes
# from cuml.svm import SVC # Use cuML SVM
# import re
# # Add this at the start of your script
# import cupy as cp
# from numba import cuda
# from cuml.svm import LinearSVC

# cp.get_default_memory_pool().free_all_blocks()  # Clear GPU memory cache
# # Assume these are defined in your Kaggle environment
# # Make sure to set these variables appropriately
# # datafolder = "/kaggle/input/task-assigning-data/"
# # resultsdatafolder = "/kaggle/working/results"
# # num_topics = [10, 20, 30] # Example topic numbers

# # Step 2: TF-IDF transformation function (updated for cuML)
# def transform_with_tfidf(train_texts, test_texts):
#     """
#     Performs TF-IDF transformation using cuML.
#     """
#     vectorizer = CountVectorizer(max_features=1000, min_df=5)
#     train_vectors = vectorizer.fit_transform(train_texts)
#     test_vectors = vectorizer.transform(test_texts)
    
#     tfidf = TfidfTransformer()
#     return tfidf.fit_transform(train_vectors), tfidf.transform(test_vectors)

# # Step 3: SVM classifier function (updated for cuML)
# def classify_with_svm(X_train, y_train, X_test):
#     """
#     Classifies data using cuML's SVC.
#     """
#     # classifier = SVC(kernel='linear', probability=True, max_iter=1000)
#     # classifier.fit(X_train, y_train)
#     classifier = LinearSVC()
# classifier.fit(X_train, y_train)
    
#     # .predict() returns a CuPy/NumPy array, call .tolist() directly.
#     predictions = classifier.predict(X_test).tolist()
#     probabilities = classifier.predict_proba(X_test).tolist()
#     classes = classifier.classes_.tolist()
    
#     return predictions, probabilities, classes

# # Step 4: Naive Bayes classifier function (updated for cuML)
# def classify_with_naive_bayes(X_train, y_train, X_test):
#     """
#     Classifies data using cuML's MultinomialNB.
#     """
#     classifier = MultinomialNB()
#     # Naive Bayes also prefers float types
#     X_train_float = X_train.astype('float32')
#     X_test_float = X_test.astype('float32')
#     classifier.fit(X_train_float, y_train)
    
#     # .predict() returns a CuPy/NumPy array, call .tolist() directly.
#     predictions = classifier.predict(X_test_float).tolist()
#     probabilities = classifier.predict_proba(X_test_float).tolist()
#     classes = classifier.classes_.tolist()
    
#     return predictions, probabilities, classes

# # Step 5: Combine classifier predictions (remains on CPU, but handles NumPy arrays)
# def weighted_vote(weights, prob_lists, class_labels):
#     """
#     Combines predictions using a weighted vote.
#     """
#     prob_arrays = [np.array(p) for p in prob_lists]
#     weighted_probs = sum(w * p for w, p in zip(weights, prob_arrays)) / sum(weights)
#     max_indices = np.argmax(weighted_probs, axis=1)
#     predictions = [class_labels[idx] for idx in max_indices]
    
#     return predictions, weighted_probs.tolist()

# # Step 6: Main processing block (updated for cuDF and dtype conversion)
# print("Kaggle input data folder:", datafolder)
# os.makedirs(resultsdatafolder, exist_ok=True)

# if not os.path.exists(datafolder):
#     print(f"Error: Data folder not found at {datafolder}")
# else:
#     for folder_name in os.listdir(datafolder):
#         if folder_name.startswith("4") and os.path.isdir(os.path.join(datafolder, folder_name)):
#             parts = folder_name.split("_")
#             if len(parts) < 3:
#                 print(f"Unexpected folder name structure: {folder_name}")
#                 continue

#             project, assignee_count = parts[1], parts[2]
#             proj_path = os.path.join(datafolder, folder_name)
#             output_path = os.path.join(resultsdatafolder, f"5_{project}_{assignee_count}_assignees_results.json.gz")

#             with gzip.open(output_path, 'wt', encoding='utf-8') as gzfile:
#                 output_data = {}
#                 train_labels = cudf.read_csv(os.path.join(proj_path, "y_train.csv"), sep='\t').dropna(axis=1, how='all')
#                 test_labels = cudf.read_csv(os.path.join(proj_path, "y_test.csv"), sep='\t').dropna(axis=1, how='all')
#                 y_train, y_test = train_labels['assignee_id'], test_labels['assignee_id']

#                 for topic_num in num_topics:
#                     output_data[topic_num] = {}
#                     X_train = cudf.read_csv(os.path.join(proj_path, f"X_train_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
#                     X_train_dist = cudf.read_csv(os.path.join(proj_path, f"X_train_distribution_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
#                     X_test = cudf.read_csv(os.path.join(proj_path, f"X_test_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
#                     X_test_dist = cudf.read_csv(os.path.join(proj_path, f"X_test_distribution_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')

#                     for col in ['labels', 'top_terms']:
#                         for df in [X_train, X_test]:
#                             if col in df.columns:
#                                df[col] = df[col].astype(str).str.replace('[\$\'\,]', '', regex=True)
#                             else:
#                                df[col] = ''

#                     X_train['labels_top_terms'] = X_train['labels'] + ' ' + X_train['top_terms']
#                     X_test['labels_top_terms'] = X_test['labels'] + ' ' + X_test['top_terms']

#                     train_title, test_title = transform_with_tfidf(X_train['processed_title'], X_test['processed_title'])
#                     train_desc, test_desc = transform_with_tfidf(X_train['processed_description'], X_test['processed_description'])
#                     train_labels_txt, test_labels_txt = transform_with_tfidf(X_train['processed_labels'], X_test['processed_labels'])
#                     train_terms, test_terms = transform_with_tfidf(X_train['top_terms'], X_test['top_terms'])
#                     train_combined, test_combined = transform_with_tfidf(X_train['labels_top_terms'], X_test['labels_top_terms'])

#                     train_topics, test_topics = X_train_dist.values, X_test_dist.values
#                     train_misc, test_misc = X_train[['priority_id', 'type_id']].values, X_test[['priority_id', 'type_id']].values
                    
#                     # Store features in a dictionary to easily iterate and convert types
#                     feature_sets = {
#                         "title": (train_title, test_title),
#                         "description": (train_desc, test_desc),
#                         "labels": (train_labels_txt, test_labels_txt),
#                         "top_terms": (train_terms, test_terms),
#                         "labels_top_terms": (train_combined, test_combined),
#                         "topics": (train_topics, test_topics),
#                         "other": (train_misc, test_misc)
#                     }
                    
#                     # FIX: Explicitly convert all feature arrays to float32
#                     for key in feature_sets:
#                         train_set, test_set = feature_sets[key]
#                         # Use cupy.sparse.csr_matrix for sparse inputs, otherwise regular astype
#                         if cupy.sparse.issparse(train_set):
#                             feature_sets[key] = (train_set.astype('float32'), test_set.astype('float32'))
#                         else:
#                             feature_sets[key] = (cupy.asarray(train_set).astype('float32'), cupy.asarray(test_set).astype('float32'))


#                     for name, model_fn in [("SVM", classify_with_svm), ("NaiveBayes", classify_with_naive_bayes)]:
#                         print(f"startign training model {name}: of {project} and {assignee_count} and num of topic {topic_num}")
#                         result_dict = {}
                        
#                         for key, (train_set, test_set) in feature_sets.items():
#                             preds, probs, class_list = model_fn(train_set, y_train, test_set)
#                             result_dict[key] = {"y_pred": preds, "y_pred_proba": probs}

#                         prob_groups = [result_dict[key]['y_pred_proba'] for key in feature_sets.keys()]
#                         w = [0.6, 0.7, 0.5, 0.5, 0.5, 0.5, 0.1]
#                         group_keys = {
#                             "title_description": ([0, 1], [w[0], w[1]]),
#                             "title_description_labels": ([0, 1, 2], w[:3]),
#                             "title_description_labels_top_terms": ([0, 1, 4], [w[0], w[1], w[4]]),
#                             "title_description_topics": ([0, 1, 5], [w[0], w[1], w[5]]),
#                             "all": (list(range(7)), w)
#                         }
#                         for group, (idxs, ws) in group_keys.items():
#                             combined_pred, combined_probs = weighted_vote(ws, [prob_groups[i] for i in idxs], class_list)
#                             result_dict[group] = {"y_pred": combined_pred, "y_pred_proba": combined_probs}
                        
#                         output_data[topic_num][name] = result_dict

#                 gzfile.write(json.dumps(output_data))
                
#     print("Processing complete.")

In [7]:
# # Step 1: Import required libraries
# import os
# import gzip
# import json
# import numpy as np
# import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.svm import LinearSVC
# from sklearn.calibration import CalibratedClassifierCV
# from scipy.sparse import csr_matrix

# import re

# def print_sparsity(matrix, name="TF-IDF"):
#     if isinstance(matrix, csr_matrix):
#         non_zeros = matrix.count_nonzero()
#         total_elements = matrix.shape[0] * matrix.shape[1]
#         sparsity = 1.0 - (non_zeros / total_elements)
#         print(f"{name} shape: {matrix.shape}, Non-zeros: {non_zeros}, Sparsity: {sparsity:.4f}")
#     else:
#         print(f"{name} is not a sparse matrix. Shape: {matrix.shape}")

# def transform_with_tfidf(train_texts, test_texts, name=""):
#     vectorizer = CountVectorizer(max_features=1000, min_df=5)
#     train_vectors = vectorizer.fit_transform(train_texts.astype('U'))
#     test_vectors = vectorizer.transform(test_texts.astype('U'))
#     tfidf = TfidfTransformer()
#     tfidf_train = tfidf.fit_transform(train_vectors)
#     tfidf_test = tfidf.transform(test_vectors)

#     print_sparsity(tfidf_train, f"TF-IDF {name} (train)")
#     print_sparsity(tfidf_test, f"TF-IDF {name} (test)")

#     return tfidf_train, tfidf_test

# # Step 3: SVM classifier function
# def classify_with_svm(X_train, y_train, X_test):
#     """Fit a calibrated linear SVM and predict on ``X_test``.
#
#     ``LinearSVC`` is wrapped in ``CalibratedClassifierCV`` (default
#     settings) so that ``predict_proba`` is available.
#
#     Returns a 3-tuple of plain lists: predicted labels, per-class
#     probabilities for each test row, and the classifier's ``classes_``.
#     """
#     classifier = CalibratedClassifierCV(LinearSVC())
#     classifier.fit(X_train, y_train)
#     return classifier.predict(X_test).tolist(), classifier.predict_proba(X_test).tolist(), classifier.classes_.tolist()

# # Step 4: Naive Bayes classifier function
# def classify_with_naive_bayes(X_train, y_train, X_test):
#     """Fit a ``MultinomialNB`` with default settings and predict on ``X_test``.
#
#     Returns a 3-tuple of plain lists: predicted labels, per-class
#     probabilities for each test row, and the classifier's ``classes_``
#     (same layout as ``classify_with_svm``).
#
#     NOTE(review): MultinomialNB expects non-negative feature values; the
#     main block also feeds it the raw topic-distribution and
#     priority_id/type_id columns - presumably non-negative, TODO confirm.
#     """
#     classifier = MultinomialNB()
#     classifier.fit(X_train, y_train)
#     return classifier.predict(X_test).tolist(), classifier.predict_proba(X_test).tolist(), classifier.classes_.tolist()

# # Step 5: Combine classifier predictions
# def weighted_vote(weights, prob_lists, class_labels):
#     """Combine several probability tables by weighted average and pick the top class.
#
#     Each entry of ``prob_lists`` is converted to an array, scaled by the
#     matching entry of ``weights``, summed, and divided by ``sum(weights)``.
#     The winning column per row (``argmax`` over axis 1) is mapped through
#     ``class_labels``.
#
#     Returns ``(predictions, weighted_probs)`` where ``predictions`` is a
#     list of labels and ``weighted_probs`` is the averaged table as nested
#     lists.
#
#     Assumes every table has the same shape and that its columns follow the
#     order of ``class_labels`` - nothing here checks it; TODO confirm at the
#     call site. ``zip`` silently drops extra weights or tables if the two
#     sequences differ in length.
#     """
#     weighted_probs = sum(w * np.array(p) for w, p in zip(weights, prob_lists)) / sum(weights)
#     max_indices = np.argmax(weighted_probs, axis=1)
#     predictions = [class_labels[idx] for idx in max_indices]
#     return predictions, weighted_probs.tolist()

# # Step 6: Main processing block
# # For every sub-folder of `datafolder` whose name starts with "4", train both
# # classifiers on each feature set for every topic count and write all
# # predictions to one gzipped JSON file per folder.
# # Relies on `datafolder`, `resultsdatafolder` and `num_topics`, none of which
# # are defined in this cell.
# print("Kaggle input data folder:", datafolder)
# os.makedirs(resultsdatafolder, exist_ok=True)

# for folder_name in os.listdir(datafolder):
#     if folder_name.startswith("4") and os.path.isdir(os.path.join(datafolder, folder_name)):
#         # Folder names are split on "_"; parts[1] is used as the project and
#         # parts[2] as the assignee count.
#         parts = folder_name.split("_")
#         if len(parts) < 3:
#             print(f"Unexpected folder name structure: {folder_name}")
#             continue

#         project, assignee_count = parts[1], parts[2]
#         proj_path = os.path.join(datafolder, folder_name)
#         output_path = os.path.join(resultsdatafolder, f"5_{project}_{assignee_count}_assignees_results.json.gz")

#         # NOTE(review): the output file is opened before any input is read, so
#         # an exception anywhere below leaves an empty/truncated .json.gz behind.
#         with gzip.open(output_path, 'w') as gzfile:
#             output_data = {}
#             # Label files are tab-separated; columns that are entirely NaN are dropped.
#             train_labels = pd.read_csv(os.path.join(proj_path, "y_train.csv"), sep='\t').dropna(axis=1, how='all')
#             test_labels = pd.read_csv(os.path.join(proj_path, "y_test.csv"), sep='\t').dropna(axis=1, how='all')
#             # y_test is assigned but not used anywhere in this block.
#             y_train, y_test = train_labels['assignee_id'], test_labels['assignee_id']

#             for topic_num in num_topics:
#                 output_data[topic_num] = {}
#                 # Load all training/testing features
#                 X_train = pd.read_csv(os.path.join(proj_path, f"X_train_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
#                 X_train_dist = pd.read_csv(os.path.join(proj_path, f"X_train_distribution_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
#                 X_test = pd.read_csv(os.path.join(proj_path, f"X_test_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
#                 X_test_dist = pd.read_csv(os.path.join(proj_path, f"X_test_distribution_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
#                 # print(X_train.columns.tolist())
#                 # print(X_train.head)
#                 # Make sure 'labels' and 'top_terms' exist as strings (a missing
#                 # column becomes all empty strings) and strip $, ' and , from them.
#                 # NOTE(review): the pattern below is a non-raw string containing
#                 # the invalid escapes \$ and \, - Python warns about these; a raw
#                 # string such as r"[$',]" matches the same characters.
#                 for col in ['labels', 'top_terms']:
#                     for df in [X_train, X_test]:
#                         df[col] = df.get(col, pd.Series([''] * len(df))).astype(str).str.replace('[\$\'\,]', '', regex=True)
#                 # print(X_train.columns.tolist())

#                 X_train['labels_top_terms'] = X_train['labels'] + ' ' + X_train['top_terms']
#                 X_test['labels_top_terms'] = X_test['labels'] + ' ' + X_test['top_terms']

#                 # Apply TF-IDF transformations
#                 # Note that the "labels" feature set is built from the
#                 # 'processed_labels' column, not from the cleaned 'labels' column.
#                 train_title, test_title = transform_with_tfidf(X_train['processed_title'], X_test['processed_title'])
#                 train_desc, test_desc = transform_with_tfidf(X_train['processed_description'], X_test['processed_description'])
#                 train_labels_txt, test_labels_txt = transform_with_tfidf(X_train['processed_labels'], X_test['processed_labels'])
#                 train_terms, test_terms = transform_with_tfidf(X_train['top_terms'], X_test['top_terms'])
#                 train_combined, test_combined = transform_with_tfidf(X_train['labels_top_terms'], X_test['labels_top_terms'])

#                 # Non-text feature sets are passed to the classifiers as raw arrays.
#                 train_topics, test_topics = X_train_dist.values, X_test_dist.values
#                 train_misc, test_misc = X_train[['priority_id', 'type_id']].values, X_test[['priority_id', 'type_id']].values

#                 for name, model_fn in [("SVM", classify_with_svm), ("NaiveBayes", classify_with_naive_bayes)]:
#                     result_dict = {}
#                     # One independent model per feature set; the three lists
#                     # below are parallel (key, training matrix, test matrix).
#                     for key, train_set, test_set in zip(
#                         ["title", "description", "labels", "top_terms", "labels_top_terms", "topics", "other"],
#                         [train_title, train_desc, train_labels_txt, train_terms, train_combined, train_topics, train_misc],
#                         [test_title, test_desc, test_labels_txt, test_terms, test_combined, test_topics, test_misc],
#                     ):
#                         # class_list is overwritten on every iteration; the value
#                         # used for voting below is the one from the last feature
#                         # set ("other"). Presumably identical for all sets since
#                         # y_train is shared - TODO confirm.
#                         preds, probs, class_list = model_fn(train_set, y_train, test_set)
#                         result_dict[key] = {"y_pred": preds, "y_pred_proba": probs}

#                     # Combine some predictions
#                     # prob_groups follows the insertion order of result_dict, i.e.
#                     # the key list above, so the indices in group_keys mean:
#                     # 0=title, 1=description, 2=labels, 3=top_terms,
#                     # 4=labels_top_terms, 5=topics, 6=other. w holds one weight
#                     # per feature set in that same order.
#                     prob_groups = [result_dict[key]['y_pred_proba'] for key in result_dict]
#                     w = [0.6, 0.7, 0.5, 0.5, 0.5, 0.5, 0.1]
#                     group_keys = {
#                         "title_description": ([0, 1], [w[0], w[1]]),
#                         "title_description_labels": ([0, 1, 2], w[:3]),
#                         "title_description_labels_top_terms": ([0, 1, 4], [w[0], w[1], w[4]]),
#                         "title_description_topics": ([0, 1, 5], [w[0], w[1], w[5]]),
#                         "all": (list(range(7)), w)
#                     }
#                     for group, (idxs, ws) in group_keys.items():
#                         # The first argument takes the first len(idxs) entries of
#                         # ws; every group above already has len(ws) == len(idxs),
#                         # so it is simply ws.
#                         combined_pred, combined_probs = weighted_vote([ws[i] for i in range(len(idxs))], [prob_groups[i] for i in idxs], class_list)
#                         result_dict[group] = {"y_pred": combined_pred, "y_pred_proba": combined_probs}

#                     output_data[topic_num][name] = result_dict
#             # Save to gzip
#             # The whole nested dict is serialised once per folder and written as
#             # UTF-8 bytes. json.dumps turns non-string keys into strings, so
#             # numeric topic_num keys (if num_topics holds ints - TODO confirm)
#             # come back as strings when the file is loaded.

#             gzfile.write(json.dumps(output_data).encode('utf-8'))