In [1]:
import os
import numpy as np
import json
import pandas as pd
from parameters import trainable_data_folder, embeddings_folder 

In [2]:
# --- Configuration ---
# Dimensionality of the pre-trained vectors. The GloVe filename below is built
# from this value so the two cannot drift out of sync.
EMBEDDING_DIM = 100
# Vocabulary size for the embedding layer (upper bound on matrix rows).
MAX_NUM_WORDS = 20000
# Directory containing the GloVe text files. Defaults to the original local
# path; set the GLOVE_DIR environment variable to run on another machine
# without editing the notebook.
GLOVE_DIR = os.environ.get("GLOVE_DIR", r"C:\Users\hp\Desktop\Module-3-Task-assigning\glove")
GLOVE_FILE_PATH = os.path.join(GLOVE_DIR, f"glove.6B.{EMBEDDING_DIM}d.txt")

In [6]:
def load_glove_embeddings(glove_path, embedding_dim=None):
    """Load GloVe word vectors from a text file into a dictionary.

    Each non-blank line is expected to hold a token followed by its
    coefficients, separated by whitespace.

    Args:
        glove_path: Path to the GloVe text file (read as UTF-8).
        embedding_dim: Optional number of coefficients per line. When given,
            each line is split from the right so that tokens which themselves
            contain spaces are kept intact, and lines with too few fields are
            skipped. When ``None`` (default), every whitespace-separated
            field after the first is treated as a coefficient.

    Returns:
        A dict mapping each token to a ``float32`` NumPy array, or ``None``
        if the file does not exist.
    """
    print(f"Loading GloVe embeddings from: {glove_path}")
    embeddings_index = {}
    skipped = 0
    try:
        with open(glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) carry no vector.
                if not line.strip():
                    continue
                if embedding_dim is None:
                    values = line.split()
                    min_fields = 2
                else:
                    # Split from the right so a token containing spaces
                    # stays in one piece as values[0].
                    values = line.rstrip().rsplit(' ', embedding_dim)
                    min_fields = embedding_dim + 1
                if len(values) < min_fields:
                    skipped += 1
                    continue
                try:
                    coefs = np.asarray(values[1:], dtype='float32')
                except ValueError:
                    # Non-numeric coefficient: skip this line instead of
                    # aborting the whole load.
                    skipped += 1
                    continue
                embeddings_index[values[0]] = coefs
    except FileNotFoundError:
        print(f"[ERROR] GloVe file not found at '{glove_path}'. Please ensure the file exists.")
        return None
    if skipped:
        print(f"[Warning] Skipped {skipped} malformed line(s).")
    print(f"Found {len(embeddings_index)} word vectors.")
    return embeddings_index

In [7]:
# Read the pre-trained vectors once; the result is None when the file is missing.
embeddings_index = load_glove_embeddings(glove_path=GLOVE_FILE_PATH)

Loading GloVe embeddings from: C:\Users\hp\Desktop\Module-3-Task-assigning\glove\glove.6B.100d.txt
Found 400000 word vectors.


In [10]:
def _build_embedding_matrix(word_index, embeddings_index, max_num_words, embedding_dim):
    """Build the embedding matrix for one dataset's vocabulary.

    Args:
        word_index: Mapping of word -> integer row index.
        embeddings_index: Mapping of word -> pre-trained vector.
        max_num_words: Upper bound on the number of matrix rows.
        embedding_dim: Number of columns (length of each vector).

    Returns:
        Array of shape ``(min(max_num_words, len(word_index) + 1), embedding_dim)``.
        Rows for words without a pre-trained vector stay all-zero.
    """
    num_words = min(max_num_words, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for word, i in word_index.items():
        # Bound by the actual row count rather than max_num_words: when the
        # indices are sparse, an index can be below max_num_words yet still
        # fall outside the matrix.
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


if embeddings_index:
    print("Starting embedding matrix creation...")

    # Only directories can hold a word_index.json; sort for a reproducible order.
    feature_dirs = sorted(
        d for d in os.listdir(embeddings_folder)
        if d.startswith("4_NN_") and os.path.isdir(os.path.join(embeddings_folder, d))
    )
    saved_count = 0

    for feature_dir in feature_dirs:
        print(f"Processing: {feature_dir}")
        full_path = os.path.join(embeddings_folder, feature_dir)

        try:
            # Load the specific word_index for this dataset
            with open(os.path.join(full_path, 'word_index.json'), 'r', encoding='utf-8') as f:
                word_index = json.load(f)

            # --- Create Embedding Matrix ---
            print("  Creating embedding matrix...")
            embedding_matrix = _build_embedding_matrix(
                word_index, embeddings_index, MAX_NUM_WORDS, EMBEDDING_DIM
            )

            # --- Save the Matrix ---
            matrix_path = os.path.join(full_path, 'embedding_matrix.npy')
            np.save(matrix_path, embedding_matrix)
            saved_count += 1
            print(f"  Successfully saved embedding matrix to: {matrix_path}")

        except FileNotFoundError:
            print(f"  [Warning] 'word_index.json' not found in {feature_dir}. Skipping.")
        except Exception as e:
            # Best-effort batch: report the failure and continue with the next dataset.
            print(f"  [ERROR] An unexpected error occurred while processing {feature_dir}: {e}")

    # Only claim full success when every dataset actually produced a matrix.
    if saved_count == len(feature_dirs):
        print("--- All embedding matrices have been created. ---")
    else:
        print(f"--- Created {saved_count} of {len(feature_dirs)} embedding matrices; see messages above. ---")
else:
    print("[ERROR] No GloVe embeddings loaded; no embedding matrices were created.")

Starting embedding matrix creation...
Processing: 4_NN_AMBARI_5_assignees_features
  Creating embedding matrix...
  Successfully saved embedding matrix to: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_AMBARI_5_assignees_features\embedding_matrix.npy
Processing: 4_NN_ARROW_5_assignees_features
  Creating embedding matrix...
  Successfully saved embedding matrix to: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_ARROW_5_assignees_features\embedding_matrix.npy
Processing: 4_NN_CASSANDRA_5_assignees_features
  Creating embedding matrix...
  Successfully saved embedding matrix to: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_CASSANDRA_5_assignees_features\embedding_matrix.npy
Processing: 4_NN_CB_5_assignees_features
  Creating embedding matrix...
  Successfully saved embedding matrix to: C:\Users\hp\Desktop\Module-3-Task-assigning\data\embeddings\4_NN_CB_5_assignees_features\embedding_matrix.npy
Processing: 4_NN_DATALAB_5_assignees_f