In [1]:
import os
import re
import spacy
import numpy
import pandas as pd
from spacy.tokens import Doc
from spacy.matcher import Matcher
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA, LEMMA
from parameters import data_out_folder

In [6]:
# --- 1. Load spaCy Model ---
# Try the installed pipeline first; on OSError, download it once and load again.
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    print("spaCy model 'en_core_web_md' not found. Downloading...")
    import spacy.cli
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")
    print("Model downloaded and loaded successfully.")
else:
    print("spaCy model 'en_core_web_md' loaded successfully.")

# Compiled once up front: matches a '<', one or more non-'>' characters, then '>'
# (i.e. an HTML/XML tag).
TAG_RE = re.compile(r'<[^>]+>')

spaCy model 'en_core_web_md' loaded successfully.


In [7]:
def preprocess_text_field(text):
    """
    Clean a single free-text field (title or description) as per
    Section 3.2 of the paper.

    Steps: strip HTML/XML tags, tokenize with spaCy, keep only alphabetic,
    non-stopword tokens whose surface text has at least three characters,
    and emit the lowercased lemma of each surviving token.

    Args:
        text: Raw field value. Anything that is not a ``str`` (e.g. NaN)
            is treated as missing.

    Returns:
        str: The surviving lemmas joined by single spaces, or "" when the
        input is not a string or no token survives the filters.
    """
    if not isinstance(text, str):
        return ""  # Missing / non-text value (e.g., NaN)

    # 1. Remove HTML and XML tags. Substitute a space rather than nothing so
    #    words separated only by markup ("foo<br>bar") do not fuse into one
    #    token; surplus whitespace is dropped by the token filter below.
    text = TAG_RE.sub(' ', text)

    # Process with spaCy
    doc = nlp(text)

    # is_alpha already rules out punctuation, whitespace, digits and special
    # characters, so only stopwords (4) and short words (5) need extra checks.
    # 6. Convert to lowercase and 7. Lemmatize
    processed_tokens = [
        token.lemma_.lower()
        for token in doc
        if token.is_alpha and not token.is_stop and len(token.text) >= 3
    ]

    return " ".join(processed_tokens)

In [8]:
def process_labels(labels_str):
    """
    Turn a comma-separated labels string into a list of clean label tokens.

    Section 3.2 states labels need no real preprocessing because they are
    'already split into clear terms/tokens'; this only undoes the joining
    (e.g. with ", ") that a previous step may have applied.

    Args:
        labels_str: Comma-separated labels, or a non-string value such as
            NaN/None when the issue has no labels.

    Returns:
        list[str]: Lowercased, whitespace-stripped labels in their original
        order, with empty entries dropped. [] for non-string input.
    """
    # Only real strings can carry labels; NaN, None, numbers, ... mean "no labels".
    if not isinstance(labels_str, str):
        return []

    trimmed = (piece.strip() for piece in labels_str.split(','))
    # Further cleaning could be added here if individual labels have noise
    return [piece.lower() for piece in trimmed if piece]


In [9]:
# Scale 1: Blocker-Critical-Major-Minor-Trivial
# Each tuple lists the spellings that share one priority level; levels are
# numbered from 1 in the order given. Add other variations to the matching
# tuple if they exist in your data.
PRIORITY_MAP = {
    alias: level
    for level, aliases in enumerate(
        (
            ("blocker", "p0"),
            ("critical", "p1"),
            ("major", "p2"),
            ("minor", "p3"),
            ("trivial", "p4"),
        ),
        start=1,
    )
    for alias in aliases
}


def convert_priority_to_id(priority_name):
    """
    Map a priority name to its numeric ID via PRIORITY_MAP.

    The lookup is case-insensitive and ignores surrounding whitespace.

    Args:
        priority_name: Priority label such as "Major" or "P2"; may be a
            non-string value (e.g. NaN) when the field is missing.

    Returns:
        int | None: The mapped ID, or None when the value is not a string
        or the name is not in PRIORITY_MAP (a default ID, e.g. 3 for Major,
        could be substituted here instead).
    """
    if isinstance(priority_name, str):
        return PRIORITY_MAP.get(priority_name.lower().strip())
    return None

# Using Table 2 from the provided PDF: the types below are listed in ID order,
# so each one's ID is its 1-based position in the tuple.
ISSUE_TYPE_MAP = {
    issue_type: type_id
    for type_id, issue_type in enumerate(
        (
            "task",
            "bug",
            "sub-task",
            "support patch",
            "feature request",
            "enhancement",
            "component upgrade",
            "quality risk",
            "patch",
            "library upgrade",
            "clarification",
            "epic",
            "tracker",
            "story",
        ),
        start=1,
    )
}
# Alternative spellings that reuse an existing ID.
# - "subtask" is a common variation of "sub-task".
# - "New Feature" was type 5 in some contexts (e.g. Figure 2 for Cassandra),
#   while the paper's Table 2 lists "Feature request" as 5.
# This mapping needs to be consistent with your data's actual type names; add
# other types from your data here, or map them to a default/unknown ID.
ISSUE_TYPE_MAP.update({"subtask": 3, "new feature": 5})


def convert_type_to_id(type_name):
    """
    Map an issue-type name to its numeric ID via ISSUE_TYPE_MAP.

    The lookup is case-insensitive and ignores surrounding whitespace.

    Args:
        type_name: Issue type such as "Bug" or "Sub-task"; may be a
            non-string value (e.g. NaN) when the field is missing.

    Returns:
        int | None: The mapped ID, or None when the value is not a string
        or the name is not in ISSUE_TYPE_MAP (a default ID could be
        substituted here instead).
    """
    if isinstance(type_name, str):
        return ISSUE_TYPE_MAP.get(type_name.lower().strip())
    return None

In [10]:
# data_out_folder is imported from the project-local `parameters` module.
print(f"Starting preprocessing for files in: {data_out_folder}")

# Ensure output subfolder exists: results go into a dedicated subfolder of
# data_out_folder. exist_ok=True makes re-running this cell harmless when the
# folder is already there.
output_subfolder = os.path.join(data_out_folder, "step3_2_processed_data")
os.makedirs(output_subfolder, exist_ok=True)
print(f"Output will be saved in: {output_subfolder}")

Starting preprocessing for files in: C:\Users\hp\Desktop\Module-3-Task-assigning\data\data_output
Output will be saved in: C:\Users\hp\Desktop\Module-3-Task-assigning\data\data_output\step3_2_processed_data


In [11]:

# Batch driver: for every step-2 CSV ("2_<PROJECT>_<N>_assignees.csv") found in
# data_out_folder, clean the text fields, tokenize labels, map categorical
# names to IDs and write the result into output_subfolder.
files_processed_count = 0
# Sorted so files are handled in a stable, reproducible order
# (os.listdir returns entries in arbitrary order).
for f in sorted(os.listdir(data_out_folder)):
    if f.startswith("2_") and f.endswith(".csv"): # Process files from step 2
        print(f"\nProcessing file: {f}")

        # Extract project_name and num_assignees from filename like "2_PROJECT_N_assignees.csv"
        parts = f.replace(".csv", "").split("_")
        if len(parts) < 4: # Basic check for filename format
            print(f"  Skipping {f}: Filename format incorrect.")
            continue
        # The length check above guarantees both indexes exist.
        project_name = parts[1]
        num_assignees = parts[2] # Only used to build the output filename

        input_filepath = os.path.join(data_out_folder, f)
        df = pd.read_csv(input_filepath, sep='\t', encoding='utf-8')

        # Drop unnamed columns if they exist
        df.drop(df.columns[df.columns.str.contains('unnamed', case=False)], axis=1, inplace=True, errors='ignore')

        # a. Apply Text Preprocessing (Title and Description)
        # Values are passed through untouched: casting with .astype(str) first
        # would turn missing values into the literal string "nan", which then
        # survives the token filters and pollutes the processed text.
        # preprocess_text_field itself maps non-string input (NaN) to "".
        print("  Preprocessing 'title'...")
        df['processed_title'] = df['title'].apply(preprocess_text_field)
        print("  Preprocessing 'description'...")
        df['processed_description'] = df['description'].apply(preprocess_text_field)

        # b. Process Labels (convert to list of clean tokens)
        # Assuming 'labels' column from "1_mongo.csv" was carried over as a string.
        # If it's named 'labels_str' from previous steps, use that.
        labels_column_name = 'labels' if 'labels' in df.columns else 'labels_str'
        if labels_column_name in df.columns:
            print(f"  Processing '{labels_column_name}'...")
            df['processed_labels'] = df[labels_column_name].apply(process_labels)
        else:
            print(f"  Warning: Label column ('labels' or 'labels_str') not found in {f}. Skipping label processing.")
            # One independent empty list per row, aligned to the frame's index.
            df['processed_labels'] = pd.Series([[] for _ in range(len(df))], index=df.index)

        # c. Convert Non-Textual Data to IDs
        #   Priority: Assumes a column like 'priority_name' exists from "1_mongo.csv"
        if 'priority_name' in df.columns:
            print("  Converting 'priority_name' to 'priority_id_mapped'...")
            df['priority_id_mapped'] = df['priority_name'].apply(convert_priority_to_id)
        else:
            print(f"  Warning: 'priority_name' column not found in {f}. Cannot map priority IDs.")
            # If 'priority_id' already exists it is picked up by the column
            # selection below; otherwise create an empty placeholder column so
            # the output schema stays consistent for downstream steps.
            if 'priority_id' not in df.columns:
                df['priority_id_mapped'] = None

        #   Type: Assumes a column like 'type_name' exists from "1_mongo.csv"
        if 'type_name' in df.columns:
            print("  Converting 'type_name' to 'type_id_mapped'...")
            df['type_id_mapped'] = df['type_name'].apply(convert_type_to_id)
        else:
            print(f"  Warning: 'type_name' column not found in {f}. Cannot map type IDs.")
            if 'type_id' not in df.columns:
                df['type_id_mapped'] = None

        #   Assignee ID: Assumed to be already present as 'assignee_id' from the "2_..." files.
        #   If only the 'assignee' name column exists, derive dense integer ranks from it.
        if 'assignee_id' not in df.columns and 'assignee' in df.columns:
            print("  Generating 'assignee_id' from 'assignee' column...")
            df['assignee_id'] = df['assignee'].rank(method='dense').astype(int)
        elif 'assignee_id' in df.columns:
            print("  'assignee_id' column already present.")
        else:
            print("  Warning: Neither 'assignee_id' nor 'assignee' found. Cannot ensure assignee ID.")

        # d. Select and reorder columns for the output
        # Keeps processed text and mapped IDs where available; a missing
        # '<name>_mapped' column falls back to the original '<name>' column.
        output_columns = []
        for col in ['id', 'project_name', 'assignee_id', # Base IDs
                    'processed_title', 'processed_description', 'processed_labels', # Processed text & labels
                    'priority_id_mapped', 'type_id_mapped', # Mapped categorical IDs
                    'status_name', # Example of other categorical data that might be useful
                   ]: # Add any other original columns you want to keep
            if col in df.columns:
                output_columns.append(col)
            elif col.replace('_mapped', '') in df.columns: # If mapped version doesn't exist, use original ID if present
                output_columns.append(col.replace('_mapped', ''))

        df_processed = df[output_columns].copy()

        # Save processed DataFrame
        output_filename = f"3.2_{project_name}_{num_assignees}_assignees_processed.csv"
        output_filepath = os.path.join(output_subfolder, output_filename)

        try:
            df_processed.to_csv(output_filepath, sep='\t', encoding='utf-8', index=False)
            print(f"  Successfully saved processed data to: {output_filepath}")
            files_processed_count += 1
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"  Error saving processed CSV file {output_filename}: {e}")

if files_processed_count > 0:
    print(f"\nSuccessfully processed and saved {files_processed_count} files to '{output_subfolder}'.")
else:
    print("\nNo files were processed. Check filenames in datafolder start with '2_' and end with '.csv'.")


Processing file: 2_AMBARI_5_assignees.csv
  Preprocessing 'title'...
  Preprocessing 'description'...
  Processing 'labels'...
  'assignee_id' column already present.
  Successfully saved processed data to: C:\Users\hp\Desktop\Module-3-Task-assigning\data\data_output\step3_2_processed_data\3.2_AMBARI_5_assignees_processed.csv

Processing file: 2_ARROW_5_assignees.csv
  Preprocessing 'title'...
  Preprocessing 'description'...
  Processing 'labels'...
  'assignee_id' column already present.
  Successfully saved processed data to: C:\Users\hp\Desktop\Module-3-Task-assigning\data\data_output\step3_2_processed_data\3.2_ARROW_5_assignees_processed.csv

Processing file: 2_CASSANDRA_5_assignees.csv
  Preprocessing 'title'...
  Preprocessing 'description'...
  Processing 'labels'...
  'assignee_id' column already present.
  Successfully saved processed data to: C:\Users\hp\Desktop\Module-3-Task-assigning\data\data_output\step3_2_processed_data\3.2_CASSANDRA_5_assignees_processed.csv

Processi