In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the run configuration from parameters.py and fail fast when it is unusable.
#
# Failures raise instead of calling exit(): inside a Jupyter kernel exit() shuts the
# kernel down (all state is lost), whereas an exception simply halts "Run All" and
# also works when the code is executed as a plain script.
try:
    from parameters import data_out_folder, min_assignees, min_issues_per_assignee, num_assignees, all_assignees
except ImportError as e:
    # `from module import name` raises ImportError both when parameters.py cannot be
    # found and when one of the requested names is not defined in it, so the original
    # exception text is included to tell the two cases apart.
    raise ImportError(
        "Could not import the configuration from 'parameters.py'. Make sure it exists in the "
        f"same directory and contains the necessary variables: {e}"
    ) from e
except AttributeError as e:
    # Only reachable if parameters.py itself raises AttributeError while being executed.
    raise AttributeError(
        f"A required variable is missing from 'parameters.py'. Ensure all are defined: {e}"
    ) from e
else:
    print("Successfully imported configuration from parameters.py")
    print(f"Data folder: {data_out_folder}")
    # Every later cell reads from / writes to this folder, so verify it up front.
    if not os.path.isdir(data_out_folder):
        raise NotADirectoryError(
            f"The data_out_folder '{data_out_folder}' specified in parameters.py does not exist."
        )

Successfully imported configuration from parameters.py
Data folder: C:\Users\hp\Desktop\Module-3-Task-assigning\data\data_output


In [3]:
# Load the tab-separated issue table from the configured data folder.
input_csv_path = os.path.join(data_out_folder, "issues_limited_user_cols.csv")
print(f"Reading data from: {input_csv_path}")
try:
    mongo_df = pd.read_csv(input_csv_path, sep='\t', encoding='utf-8')
except FileNotFoundError as e:
    # Raise (rather than exit()) so a notebook run stops here without killing the kernel.
    raise FileNotFoundError(
        f"The input file '{input_csv_path}' was not found. Please ensure it exists."
    ) from e

# Discard columns whose name contains "unnamed" (case-insensitive). pandas labels
# header-less columns "Unnamed: N" -- presumably a saved index here; verify against the export.
unnamed_columns = mongo_df.columns[mongo_df.columns.str.contains('unnamed', case=False)]
mongo_df = mongo_df.drop(columns=unnamed_columns)


Reading data from: C:\Users\hp\Desktop\Module-3-Task-assigning\data\data_output\issues_limited_user_cols.csv


In [4]:
# Derive a numeric id per assignee.
if 'assignee' not in mongo_df.columns:
    # Raise (rather than exit()) so a notebook run stops here without killing the kernel.
    raise KeyError("'assignee' column not found in the CSV. Cannot create 'assignee_id'.")

# A dense rank numbers the distinct assignee values 1..K in sorted order.
# rank() yields NaN for rows with a missing assignee, which a plain int cast cannot
# represent; the nullable Int64 dtype keeps those rows as <NA> instead of crashing.
mongo_df['assignee_id'] = mongo_df['assignee'].rank(method='dense').astype('Int64')

In [5]:
# Map the column names found in the CSV onto the names used by the rest of the notebook.
# Extend this mapping if further renames are needed.
rename_map = {'_id': 'id', 'summary': 'title', 'projectname': 'project_name'}

# Only rename columns that are actually present in the loaded frame.
existing_columns_to_rename = {
    old_name: new_name
    for old_name, new_name in rename_map.items()
    if old_name in mongo_df.columns
}
renamed_df = mongo_df.rename(columns=existing_columns_to_rename)

# Preferred presentation order; columns absent from the data are silently left out
# of the selection (after a warning), all other columns are dropped.
desired_columns_order = [
    'id', 'title', 'description', 'project_name', 'status_name',
    'priority_id', 'type_id', 'assignee_id', 'labels',
]
actual_columns_to_use = []
missing_columns = []
for column in desired_columns_order:
    if column in renamed_df.columns:
        actual_columns_to_use.append(column)
    else:
        missing_columns.append(column)

if missing_columns:
    print(f"Warning: Not all desired columns for reordering were found. Using: {actual_columns_to_use}")

mongo_df = renamed_df[actual_columns_to_use]

In [6]:
# Keep only issues that carry both a title and a description.
print("Filtering issues with no titles or descriptions...")
text_columns = ['description', 'title']
if all(column in mongo_df.columns for column in text_columns):
    # A row survives only when neither text field is missing.
    has_text = mongo_df[text_columns].notna().all(axis=1)
    # Explicit copy so later assignments on the result do not trigger SettingWithCopyWarning.
    mongo_filtered_df = mongo_df.loc[has_text].copy()
else:
    print("Warning: 'title' or 'description' column not found. Skipping this filtering step.")
    mongo_filtered_df = mongo_df.copy()
print(f"Number of issues after initial text filtering: {len(mongo_filtered_df)}")


Filtering issues with no titles or descriptions...
Number of issues after initial text filtering: 128853


In [7]:
# Select the projects that have at least `min_assignees` assignees who each handled
# at least `min_issues_per_assignee` issues. The qualifying project names are collected
# in `projects_enough_issues_per_assignee`, in order of first appearance in the data.
print("Filtering projects based on assignee activity...")
projects_enough_issues_per_assignee = []

# Both columns are required; raise (rather than exit()) so a notebook run stops here
# without shutting the kernel down.
if 'project_name' not in mongo_filtered_df.columns:
    raise KeyError("'project_name' column not found. Cannot filter projects.")
if 'assignee_id' not in mongo_filtered_df.columns:
    raise KeyError("'assignee_id' column not found. Cannot filter projects by assignee activity.")

# One grouped pass over the data instead of re-scanning the whole frame per project.
# sort=False keeps the projects in order of first appearance. Rows without a project
# name are skipped by groupby; they could never qualify anyway.
for project, project_assignee_ids in mongo_filtered_df.groupby('project_name', sort=False)['assignee_id']:
    # Issues per assignee within this project (missing assignees are not counted).
    assignee_counts = project_assignee_ids.value_counts()

    # Number of assignees that are active enough to be kept.
    active_assignee_count = int((assignee_counts >= min_issues_per_assignee).sum())

    if active_assignee_count >= min_assignees:
        projects_enough_issues_per_assignee.append(project)
        print(f"  Project '{project}' QUALIFIED with {active_assignee_count} active assignees.")
    else:
        print(f"  Project '{project}' NOT qualified. Assignees remaining: {active_assignee_count}")

print(f"\nFound {len(projects_enough_issues_per_assignee)} projects with at least {min_assignees} assignees, each having at least {min_issues_per_assignee} issues:")
for p_name in projects_enough_issues_per_assignee:
    print(f"- {p_name}")


Filtering projects based on assignee activity...
  Project 'AXIS' NOT qualified. Assignees remaining: 0
  Project 'DERBY' NOT qualified. Assignees remaining: 0
  Project 'XERCESJ' NOT qualified. Assignees remaining: 0
  Project 'NUTCH' NOT qualified. Assignees remaining: 0
  Project 'WODEN' NOT qualified. Assignees remaining: 0
  Project 'OFBIZ' NOT qualified. Assignees remaining: 4
  Project 'QPID' NOT qualified. Assignees remaining: 1
  Project 'MAPREDUCE' NOT qualified. Assignees remaining: 0
  Project 'CXF' NOT qualified. Assignees remaining: 2
  Project 'AXIS2' NOT qualified. Assignees remaining: 0
  Project 'ODE' NOT qualified. Assignees remaining: 0
  Project 'IVY' NOT qualified. Assignees remaining: 0
  Project 'HDFS' NOT qualified. Assignees remaining: 2
  Project 'UIMA' NOT qualified. Assignees remaining: 0
  Project 'HADOOP' NOT qualified. Assignees remaining: 3
  Project 'TIKA' NOT qualified. Assignees remaining: 0
  Project 'HBASE' NOT qualified. Assignees remaining: 0
  P

In [8]:
# Build and export one balanced dataset per qualified project (and per requested
# number of assignees): keep the most active assignees, cap each of them at
# UNDERSAMPLE_THRESHOLD issues, and write the result as a tab-separated CSV.

UNDERSAMPLE_THRESHOLD = 80  # maximum number of issues kept per assignee
UNDERSAMPLE_SEED = 1        # fixed seed so the sampled subset is reproducible


def undersample_per_assignee(issues_df, threshold=UNDERSAMPLE_THRESHOLD, random_state=UNDERSAMPLE_SEED):
    """Cap the number of issues per assignee.

    Assignees with at least `threshold` issues are randomly reduced to exactly
    `threshold` issues; assignees with fewer issues keep all of them.

    Parameters:
        issues_df: DataFrame with an 'assignee_id' column.
        threshold: per-assignee cap.
        random_state: seed passed to the sampler.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: sampled rows first, followed by
        the untouched rows of the smaller assignees.
    """
    group_sizes = issues_df.groupby('assignee_id')['assignee_id'].transform('size')
    over_threshold = group_sizes >= threshold
    sampled_part = (
        issues_df[over_threshold]
        .groupby('assignee_id', group_keys=False)
        .sample(n=threshold, random_state=random_state)
    )
    untouched_part = issues_df[~over_threshold]
    return pd.concat([sampled_part, untouched_part], ignore_index=True)


print("\nProcessing and exporting data for qualified projects...")
for project_name in projects_enough_issues_per_assignee:
    print(f"\nProcessing project: {project_name}")

    # FLINK is exported once per entry of `all_assignees`; every other project only
    # for the single configured `num_assignees`.
    n_assignees_list_for_project = all_assignees if project_name == "FLINK" else [num_assignees]

    # Everything below depends only on the project, so it is computed once per
    # project rather than once per requested assignee count.
    temp_project_df_all_issues = mongo_filtered_df[mongo_filtered_df['project_name'] == project_name]

    # Assignees meeting the minimum activity level within this project.
    assignee_counts_in_project = temp_project_df_all_issues['assignee_id'].value_counts()
    active_assignees_in_project = assignee_counts_in_project[
        assignee_counts_in_project >= min_issues_per_assignee
    ].index.tolist()
    issues_from_active_assignees = temp_project_df_all_issues[
        temp_project_df_all_issues['assignee_id'].isin(active_assignees_in_project)
    ]
    active_assignee_issue_counts = issues_from_active_assignees['assignee_id'].value_counts()

    for n_assignees_to_consider in n_assignees_list_for_project:
        print(f"  Considering top {n_assignees_to_consider} assignees...")

        if issues_from_active_assignees.empty:
            print(f"    No issues found for active assignees in {project_name}. Skipping for {n_assignees_to_consider} assignees.")
            continue

        # The most active assignees; may be fewer than requested if not enough are active.
        top_n_assignees_ids = active_assignee_issue_counts.nlargest(n_assignees_to_consider).index.tolist()

        if not top_n_assignees_ids:
            print(f"    Could not select top {n_assignees_to_consider} assignees for {project_name} (perhaps not enough active assignees). Skipping.")
            continue

        print(f"    Selected {len(top_n_assignees_ids)} top assignees for this iteration.")

        # Keep only the issues assigned to one of the selected assignees.
        df_for_project_and_n_assignees = issues_from_active_assignees[
            issues_from_active_assignees['assignee_id'].isin(top_n_assignees_ids)
        ].copy()

        if df_for_project_and_n_assignees.empty:
            print(f"    No issues after filtering for top {len(top_n_assignees_ids)} assignees in {project_name}. Skipping.")
            continue

        final_project_df = undersample_per_assignee(df_for_project_and_n_assignees)
        print(f"    Size of dataset for {project_name} with {n_assignees_to_consider} assignees after undersampling: {len(final_project_df)} issues.")

        # The file name carries the requested assignee count, not the number actually selected.
        output_filename = f"2_{project_name}_{n_assignees_to_consider}_assignees.csv"
        output_path = os.path.join(data_out_folder, output_filename)

        print(f"    Writing data to: {output_path}")
        try:
            final_project_df.to_csv(output_path, sep='\t', encoding='utf-8', index=False)
            print(f"    Successfully saved: {output_filename}")
        except Exception as e:
            # Best effort: report the failure and continue with the remaining exports.
            print(f"    Error writing CSV file {output_filename}: {e}")

print("\nScript finished.")


Processing and exporting data for qualified projects...

Processing project: HIVE
  Considering top 5 assignees...
    Selected 5 top assignees for this iteration.
    Size of dataset for HIVE with 5 assignees after undersampling: 400 issues.
    Writing data to: C:\Users\hp\Desktop\Module-3-Task-assigning\data\data_output\2_HIVE_5_assignees.csv
    Successfully saved: 2_HIVE_5_assignees.csv

Processing project: CASSANDRA
  Considering top 5 assignees...
    Selected 5 top assignees for this iteration.
    Size of dataset for CASSANDRA with 5 assignees after undersampling: 400 issues.
    Writing data to: C:\Users\hp\Desktop\Module-3-Task-assigning\data\data_output\2_CASSANDRA_5_assignees.csv
    Successfully saved: 2_CASSANDRA_5_assignees.csv

Processing project: MESOS
  Considering top 5 assignees...
    Selected 5 top assignees for this iteration.
    Size of dataset for MESOS with 5 assignees after undersampling: 400 issues.
    Writing data to: C:\Users\hp\Desktop\Module-3-Task-a