In [1]:
import os
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
MAX_NUM_WORDS = 20000  # The maximum number of words to keep in the vocabulary
MAX_SEQUENCE_LENGTH = 250 # The fixed length for all sequences
from parameters import trainable_data_folder, embeddings_folder

In [5]:
print("\\nStarting to process files for Neural Network input...")

for f in os.listdir(trainable_data_folder):
    if f.startswith("3.2_") and f.endswith("_processed.csv"):
        print(f"\\nProcessing file: {f}")

        try:
            # --- 1. Load Data ---
            input_file_path = os.path.join(trainable_data_folder, f)
            df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')

            # Extract project and assignee info from filename
            parts = f.replace(".csv", "").split("_")
            project_name = parts[1]
            num_assignees_config = int(parts[2])

            # --- Create Output Directory ---
            output_dir_name = f"4_NN_{project_name}_{num_assignees_config}_assignees_features"
            full_output_path = os.path.join(embeddings_folder, output_dir_name)
            os.makedirs(full_output_path, exist_ok=True)
            print(f"  Output directory: {full_output_path}")

            # --- 2. Combine Text and Prepare Data ---
            df['text'] = df['processed_title'].fillna('') + ' ' + df['processed_description'].fillna('')
            
            X = df['text'].values
            y = df['assignee_id'].values

            # --- 3. Train-Test Split ---
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            print(f"  Data split. Training set: {len(X_train)}, Test set: {len(X_test)}")

            # --- 4. Tokenization ---
            tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
            tokenizer.fit_on_texts(X_train)
            
            X_train_sequences = tokenizer.texts_to_sequences(X_train)
            X_test_sequences = tokenizer.texts_to_sequences(X_test)
            
            word_index = tokenizer.word_index
            print(f"  Found {len(word_index)} unique tokens.")

            # --- 5. Padding ---
            X_train_padded = pad_sequences(X_train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
            X_test_padded = pad_sequences(X_test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
            print(f"  Sequences padded to length {MAX_SEQUENCE_LENGTH}.")

            # --- 6. Save Processed Data as CSV ---
            # Convert arrays to DataFrames and save as CSV. No headers or index for feature matrices.
            pd.DataFrame(X_train_padded).to_csv(os.path.join(full_output_path, 'X_train.csv'), index=False, header=False)
            pd.DataFrame(X_test_padded).to_csv(os.path.join(full_output_path, 'X_test.csv'), index=False, header=False)
            
            # Save labels with a header for clarity
            pd.DataFrame(y_train, columns=['assignee_id']).to_csv(os.path.join(full_output_path, 'y_train.csv'), index=False)
            pd.DataFrame(y_test, columns=['assignee_id']).to_csv(os.path.join(full_output_path, 'y_test.csv'), index=False)
            
            # The word_index is a dictionary and is best saved as JSON.
            with open(os.path.join(full_output_path, 'word_index.json'), 'w') as f:
                json.dump(word_index, f)

            print(f"  Successfully saved processed data for {project_name}.")

        except Exception as e:
            print(f"  [ERROR] Failed to process {f}: {e}")

print("\\nScript finished processing all files.")


\nStarting to process files for Neural Network input...
\nProcessing file: 3.2_AMBARI_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_AMBARI_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 2000 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for AMBARI.
\nProcessing file: 3.2_ARROW_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_ARROW_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 1874 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for ARROW.
\nProcessing file: 3.2_CASSANDRA_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_CASSANDRA_5_assignees_features
  Data split. Training set: 320, Test set: 80


  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')
  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')
  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')


  Found 2359 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for CASSANDRA.
\nProcessing file: 3.2_CB_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_CB_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 2625 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for CB.
\nProcessing file: 3.2_DATALAB_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_DATALAB_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 992 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for DATALAB.
\nProcessing file: 3.2_FLINK_10_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_FLINK_10_assignees_features
  Data split. Training set: 640, Test set: 160


  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')
  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')
  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')


  Found 3179 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for FLINK.
\nProcessing file: 3.2_FLINK_15_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_FLINK_15_assignees_features
  Data split. Training set: 960, Test set: 240
  Found 4283 unique tokens.
  Sequences padded to length 250.


  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')
  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')


  Successfully saved processed data for FLINK.
\nProcessing file: 3.2_FLINK_20_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_FLINK_20_assignees_features
  Data split. Training set: 1280, Test set: 320
  Found 5081 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for FLINK.
\nProcessing file: 3.2_FLINK_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_FLINK_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 1856 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for FLINK.
\nProcessing file: 3.2_GEODE_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_GEODE_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 2191 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed

  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')
  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')
  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')


  Found 2194 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for HDDS.
\nProcessing file: 3.2_HIVE_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_HIVE_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 2406 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for HIVE.
\nProcessing file: 3.2_HUDI_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_HUDI_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 2075 unique tokens.
  Sequences padded to length 250.


  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')
  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')


  Successfully saved processed data for HUDI.
\nProcessing file: 3.2_IGNITE_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_IGNITE_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 2376 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for IGNITE.
\nProcessing file: 3.2_IMPALA_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_IMPALA_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 2414 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for IMPALA.
\nProcessing file: 3.2_IOTDB_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_IOTDB_5_assignees_features
  Data split. Training set: 320, Test set: 80


  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')
  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')
  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')


  Found 1538 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for IOTDB.
\nProcessing file: 3.2_MESOS_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_MESOS_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 2624 unique tokens.
  Sequences padded to length 250.


  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')
  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')


  Successfully saved processed data for MESOS.
\nProcessing file: 3.2_OAK_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_OAK_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 2145 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for OAK.
\nProcessing file: 3.2_SPARK_5_assignees_processed.csv
  Output directory: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_SPARK_5_assignees_features
  Data split. Training set: 320, Test set: 80
  Found 1824 unique tokens.
  Sequences padded to length 250.
  Successfully saved processed data for SPARK.
\nScript finished processing all files.


  df = pd.read_csv(input_file_path, sep='\\t', encoding='utf-8')
