In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_6_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_test_10_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_130_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_distribution_12_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_test_25_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_distribution_20_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_distribution_60_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_test_distribution_8_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_60_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_train_distribution_25_topics.csv
/kaggle/input/task-assigning-data/4_OAK_5_assignees_features/X_test_50_topics.csv
/kaggle/input/task-assignin

In [12]:
# CELL 1: PARAMETERS

# Root folder that is scanned for per-project feature folders (read-only Kaggle input).
datafolder = "/kaggle/input/task-assigning-data/"
# Folder that receives the gzipped JSON result files (Kaggle working directory).
resultsdatafolder = "/kaggle/working/results/"

# Assignee-related settings; not referenced by the cells shown in this notebook excerpt.
num_assignees = 5
all_assignees = list(range(5, 21, 5))  # 5, 10, 15, 20

# Topic counts for which feature files "X_*_<n>_topics.csv" are loaded:
# 4..20 in steps of 2, then 25 and 30, then 40..170 in steps of 10.
num_topics = [*range(4, 21, 2), 25, 30, *range(40, 171, 10)]

# Thresholds; not referenced by the cells shown in this notebook excerpt.
min_assignees = 5
min_issues_per_assignee = 80

In [13]:
import os

# Show the top-level entries of the Kaggle input directory.
input_entries = os.listdir('/kaggle/input')
print(input_entries)

['task-assigning-data']


In [14]:
# CELL 2: OPTIONAL EXTRAS
# Kaggle usually has pandas, numpy, sklearn, etc. pre-installed.
!pip install seaborn  # Only if you get import errors for seaborn



In [18]:
# Step 1: Import required libraries
import os
import gzip
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
import re

# Step 2: TF-IDF transformation function
def transform_with_tfidf(train_texts, test_texts):
    """Convert a train/test pair of text columns into dense TF-IDF matrices.

    The vocabulary (at most 1000 terms, each occurring in at least 5 training
    documents) and the IDF weights are both fitted on ``train_texts`` only and
    then applied unchanged to ``test_texts``.

    Args:
        train_texts: Training texts; must support ``.astype('U')``.
        test_texts: Test texts; must support ``.astype('U')``.

    Returns:
        A tuple ``(train_tfidf, test_tfidf)`` of dense arrays.
    """
    bag_of_words = CountVectorizer(max_features=1000, min_df=5)
    train_counts = bag_of_words.fit_transform(train_texts.astype('U')).toarray()
    test_counts = bag_of_words.transform(test_texts.astype('U')).toarray()

    idf_weighter = TfidfTransformer()
    train_tfidf = idf_weighter.fit_transform(train_counts).toarray()
    test_tfidf = idf_weighter.transform(test_counts).toarray()
    return train_tfidf, test_tfidf

# Step 3: SVM classifier function
def classify_with_svm(X_train, y_train, X_test):
    """Fit a probability-calibrated linear SVM and predict on ``X_test``.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.

    Returns:
        A tuple ``(predictions, probabilities, classes)`` of plain Python
        lists: the predicted label per test row, the per-class probabilities
        per test row, and the class labels as reported by the fitted model's
        ``classes_`` attribute.
    """
    calibrated_svm = CalibratedClassifierCV(LinearSVC())
    calibrated_svm.fit(X_train, y_train)

    hard_labels = calibrated_svm.predict(X_test)
    class_probabilities = calibrated_svm.predict_proba(X_test)
    return hard_labels.tolist(), class_probabilities.tolist(), calibrated_svm.classes_.tolist()

# Step 4: Naive Bayes classifier function
def classify_with_naive_bayes(X_train, y_train, X_test):
    """Fit a multinomial Naive Bayes model and predict on ``X_test``.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.

    Returns:
        A tuple ``(predictions, probabilities, classes)`` of plain Python
        lists: the predicted label per test row, the per-class probabilities
        per test row, and the class labels as reported by the fitted model's
        ``classes_`` attribute.
    """
    naive_bayes = MultinomialNB()
    naive_bayes.fit(X_train, y_train)

    hard_labels = naive_bayes.predict(X_test)
    class_probabilities = naive_bayes.predict_proba(X_test)
    return hard_labels.tolist(), class_probabilities.tolist(), naive_bayes.classes_.tolist()

# Step 5: Combine classifier predictions
def weighted_vote(weights, prob_lists, class_labels):
    """Blend several classifiers' class probabilities by weighted averaging.

    Args:
        weights: One numeric weight per classifier.
        prob_lists: One probability matrix per classifier, each given as a
            nested list / array of shape (n_samples, n_classes). All matrices
            must share the same shape and class ordering.
        class_labels: Class labels, indexed in the same order as the columns
            of the probability matrices.

    Returns:
        A tuple ``(predictions, weighted_probs)`` where ``predictions`` is the
        list of labels with the highest blended probability per sample and
        ``weighted_probs`` is the blended probability matrix as nested lists.

    Raises:
        ValueError: If ``weights`` and ``prob_lists`` differ in length, are
            empty, or the weights sum to zero.
    """
    weights = list(weights)
    prob_lists = list(prob_lists)

    # zip() would silently drop the surplus entries of the longer argument.
    if len(weights) != len(prob_lists):
        raise ValueError(
            f"Got {len(weights)} weights for {len(prob_lists)} probability lists"
        )
    if not weights:
        raise ValueError("At least one classifier is required for voting")

    total_weight = sum(weights)
    # A zero total would turn every probability into NaN/inf and argmax would
    # then quietly pick the first class for every sample.
    if total_weight == 0:
        raise ValueError("Weights must not sum to zero")

    weighted_probs = sum(w * np.array(p) for w, p in zip(weights, prob_lists)) / total_weight
    max_indices = np.argmax(weighted_probs, axis=1)
    predictions = [class_labels[idx] for idx in max_indices]
    return predictions, weighted_probs.tolist()

# Step 6: Main processing block
# For every "4_<project>_<assignee_count>_..." feature folder: train one SVM and
# one Naive Bayes model per feature view and topic count, blend their class
# probabilities into several weighted ensembles, and store all predictions of
# the project in a single gzipped JSON file.
print("Kaggle input data folder:", datafolder)
os.makedirs(resultsdatafolder, exist_ok=True)

# Names of the single-view models, in the order the index-based ensemble
# definitions below refer to them.
feature_keys = ["title", "description", "labels", "top_terms", "labels_top_terms", "topics", "other"]

# One ensemble weight per feature view (same order as feature_keys).
w = [0.6, 0.7, 0.5, 0.5, 0.5, 0.5, 0.1]

# Ensemble name -> (indices into feature_keys, matching weights).
group_keys = {
    "title_description": ([0, 1], [w[0], w[1]]),
    "title_description_labels": ([0, 1, 2], w[:3]),
    "title_description_labels_top_terms": ([0, 1, 4], [w[0], w[1], w[4]]),
    "title_description_topics": ([0, 1, 5], [w[0], w[1], w[5]]),
    "all": (list(range(7)), w)
}

for folder_name in os.listdir(datafolder):
    proj_path = os.path.join(datafolder, folder_name)
    if not (folder_name.startswith("4") and os.path.isdir(proj_path)):
        continue

    parts = folder_name.split("_")
    if len(parts) < 3:
        print(f"Unexpected folder name structure: {folder_name}")
        continue

    project, assignee_count = parts[1], parts[2]
    output_path = os.path.join(resultsdatafolder, f"5_{project}_{assignee_count}_assignees_results.json.gz")

    output_data = {}
    train_labels = pd.read_csv(os.path.join(proj_path, "y_train.csv"), sep='\t').dropna(axis=1, how='all')
    test_labels = pd.read_csv(os.path.join(proj_path, "y_test.csv"), sep='\t').dropna(axis=1, how='all')
    y_train, y_test = train_labels['assignee_id'], test_labels['assignee_id']

    for topic_num in num_topics:
        output_data[topic_num] = {}
        # Load all training/testing features
        X_train = pd.read_csv(os.path.join(proj_path, f"X_train_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
        X_train_dist = pd.read_csv(os.path.join(proj_path, f"X_train_distribution_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
        X_test = pd.read_csv(os.path.join(proj_path, f"X_test_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')
        X_test_dist = pd.read_csv(os.path.join(proj_path, f"X_test_distribution_{topic_num}_topics.csv"), sep='\t').dropna(axis=1, how='all')

        # Strip '$', "'" and ',' from the label/term columns; a column that is
        # missing from the file is created as all-empty strings.
        for col in ['labels', 'top_terms']:
            for df in [X_train, X_test]:
                df[col] = df.get(col, pd.Series([''] * len(df))).astype(str).str.replace(r"[$',]", '', regex=True)

        X_train['labels_top_terms'] = X_train['labels'] + ' ' + X_train['top_terms']
        X_test['labels_top_terms'] = X_test['labels'] + ' ' + X_test['top_terms']

        # Apply TF-IDF transformations
        train_title, test_title = transform_with_tfidf(X_train['processed_title'], X_test['processed_title'])
        train_desc, test_desc = transform_with_tfidf(X_train['processed_description'], X_test['processed_description'])
        train_labels_txt, test_labels_txt = transform_with_tfidf(X_train['processed_labels'], X_test['processed_labels'])
        train_terms, test_terms = transform_with_tfidf(X_train['top_terms'], X_test['top_terms'])
        train_combined, test_combined = transform_with_tfidf(X_train['labels_top_terms'], X_test['labels_top_terms'])

        train_topics, test_topics = X_train_dist.values, X_test_dist.values
        train_misc, test_misc = X_train[['priority_id', 'type_id']].values, X_test[['priority_id', 'type_id']].values

        for name, model_fn in [("SVM", classify_with_svm), ("NaiveBayes", classify_with_naive_bayes)]:
            result_dict = {}
            for key, train_set, test_set in zip(
                feature_keys,
                [train_title, train_desc, train_labels_txt, train_terms, train_combined, train_topics, train_misc],
                [test_title, test_desc, test_labels_txt, test_terms, test_combined, test_topics, test_misc],
            ):
                # class_list is overwritten on every fit; the value left after
                # the last feature view is the one used for the ensembles below.
                preds, probs, class_list = model_fn(train_set, y_train, test_set)
                result_dict[key] = {"y_pred": preds, "y_pred_proba": probs}

            # Combine some predictions
            prob_groups = [result_dict[key]['y_pred_proba'] for key in feature_keys]
            for group, (idxs, ws) in group_keys.items():
                combined_pred, combined_probs = weighted_vote(ws, [prob_groups[i] for i in idxs], class_list)
                result_dict[group] = {"y_pred": combined_pred, "y_pred_proba": combined_probs}

            output_data[topic_num][name] = result_dict

    # Save to gzip. The file is opened only after every model has finished, so
    # an error or interrupt during training cannot leave an empty or truncated
    # results file behind.
    with gzip.open(output_path, 'w') as gzfile:
        gzfile.write(json.dumps(output_data).encode('utf-8'))

Kaggle input data folder: /kaggle/input/task-assigning-data/




KeyboardInterrupt: 