In [1]:
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Load the labelled e-mail dataset from a CSV file in the working directory.
df = pd.read_csv("spam_or_not_spam.csv")
# Alternative input file, kept disabled:
#df = pd.read_csv("NLP.csv")

# Columns used by this notebook: 'email' (raw text) and 'label' (0 or 1).
# NOTE(review): `emails` is not referenced again in the cells visible here.
emails = df['email'].tolist()


In [2]:
import string
import nltk
import re
import random
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, words
from sklearn.utils import resample


# Fetch the NLTK resources used by the preprocessing below: tokenizer models
# ('punkt', 'punkt_tab'), the stop-word list and the English word list.
# Every download is quiet so re-running the cell does not print progress logs.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('words', quiet=True)

# Set of English words from the NLTK 'words' corpus, used by preprocess_text
# as one of its two criteria for keeping a token.
valid_words = set(words.words())

# Step 4: Preprocessing function to clean and tokenize the text
def preprocess_text(text,subsample_threshold=1e-3):
    """
    Clean one raw e-mail body and return it as a list of tokens.

    Steps, in order: lowercase; drop every character that is not an ASCII
    letter, digit or whitespace; collapse whitespace; tokenize with
    ``word_tokenize``; drop English stopwords and tokens of length <= 2;
    drop immediate repeats of the previous token; drop tokens longer than
    15 characters and tokens that are neither in ``valid_words`` nor purely
    alphabetic; finally apply the frequency-based subsampling step.

    Args:
        text: Raw e-mail text. Any non-string value (e.g. NaN) yields [].
        subsample_threshold (float): Threshold ``t`` used by the subsampling
            formula. Must be non-zero.

    Returns:
        list[str]: Cleaned tokens in their original order.
    """
    # Local import so the function does not depend on another cell's imports.
    from collections import Counter

    if not isinstance(text, str):
        return []

    text = text.lower()

    # Keep only ASCII letters, digits and whitespace. This already removes
    # every character in string.punctuation, so no separate punctuation pass
    # is required afterwards.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Collapse runs of whitespace into single spaces
    text = ' '.join(text.split())

    tokens = word_tokenize(text)

    # Drop stopwords and very short tokens (one or two characters)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    # Remove consecutive duplicate words
    tokens = [word for i, word in enumerate(tokens) if i == 0 or word != tokens[i - 1]]

    # Drop over-long tokens, and tokens that are neither dictionary words nor
    # purely alphabetic (in practice: non-dictionary tokens containing digits).
    tokens = [
        word for word in tokens
        if len(word) <= 15 and (word in valid_words or re.match(r'^[a-zA-Z]+$', word))
    ]

    # Subsampling step. Frequencies are relative to THIS e-mail only.
    # Counter gives the per-token counts in a single pass.
    word_counts = Counter(tokens)
    total_count = len(tokens)

    def subsample(word):
        freq = word_counts[word] / total_count
        # NOTE(review): sqrt(f/t) + t/f is >= 1 for every positive f and t, so
        # after the min() below this step currently keeps every token. The
        # word2vec paper's keep-probability is (sqrt(f/t) + 1) * t / f; adopting
        # it would change the produced dataset, so it is left as a deliberate
        # decision for the notebook owner.
        prob_keep = (freq / subsample_threshold) ** 0.5 + subsample_threshold / freq
        return random.random() < min(prob_keep, 1.0)

    return [word for word in tokens if subsample(word)]

# Apply the preprocessing function to the email column.
# NOTE(review): this overwrites the raw text in place. preprocess_text returns []
# for any non-string input, so running this cell a second time without reloading
# `df` turns every e-mail into an empty token list.
df['email'] = df['email'].apply(preprocess_text)

# Step 5: Balancing the dataset
# Separate the two classes by label; the variable names assume label 0 is the
# larger class and label 1 the smaller one.
df_majority = df[df['label'] == 0]
df_minority = df[df['label'] == 1]

# Option 1: Oversample the minority class (rows are duplicated, not synthesised)
df_minority_oversampled = resample(
    df_minority,
    replace=True,  # Sample with replacement
    n_samples=len(df_majority),  # Match the majority class count
    random_state=42  # Reproducibility
)

# Combine oversampled minority class with the majority class
df_balanced = pd.concat([df_majority, df_minority_oversampled])


# Shuffle the rows (fixed seed) and renumber the index from 0
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 6: Display the first 25 rows after preprocessing and balancing
print(df_balanced.head(25))
print(f"Remaining rows after cleaning and balancing: {len(df_balanced)}")



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                                email  label
0   [little, confused, running, procmail, gateway,...      0
1   [professional, effective, debt, collection, se...      1
2   [copy, dvd, burner, dvd, wizard, pro, technolo...      1
3   [thu, sep, number, numberpm, number, axel, thi...      0
4   [reminds, cheney, debates, declared, wealth, p...      0
5   [david, neary, said, francophones, among, arti...      0
6   [john, hall, wrote, stephen, williams, mailto,...      0
7   [url, loo, rolls, honour, invisible, man, pape...      0
8   [url, date, number, numbertnumber, number, new...      0
9   [date, mon, number, sep, number, edt, dayv, ga...      0
10  [timh, possibly, different, shauna, lowery, on...      0
11  [url, close, encounter, burnt, kind, came, out...      0
12  [fork, admin, url, mailto, fork, admin, url, b...      0
13  [friday, september, number, jim, whitehead, wr...      0
14  [hyperlink, url, teamed, hyperlink, foundmoney...      1
15  [dear, user, cyberag

In [3]:
# Keep only the rows whose 'email' cell is non-null and renumber them from 0.
df_balanced = df_balanced[df_balanced['email'].notnull()].reset_index(drop=True)
# Sanity check: the frame printed here is expected to be empty.
print(df_balanced[df_balanced['email'].isnull()])




Empty DataFrame
Columns: [email, label]
Index: []


In [4]:
# Gather every token of every e-mail into one flat list.
all_words = [token for email_tokens in df_balanced['email'] for token in email_tokens]

# Distinct vocabulary. The iteration order of a set of strings is not fixed
# across interpreter runs, so the ids assigned below may differ between runs.
unique_words = set(all_words)

# Forward mapping word -> id, numbered in set-iteration order.
word_to_index = dict(zip(unique_words, range(len(unique_words))))

# Reverse mapping id -> word, derived from the forward mapping.
index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))

# Encode each tokenised e-mail as the list of its word ids.
df_balanced['email_indexed'] = df_balanced['email'].apply(
    lambda email_tokens: [word_to_index[token] for token in email_tokens]
)

# Preview the tokenised and indexed e-mails
print("\nDataset with Tokenized and Indexed Emails:")
print(df_balanced[['email', 'email_indexed']].head())

# Preview the first ten entries of each mapping
print("\nWord-to-Index Mapping (First 10 words):")
print(dict(list(word_to_index.items())[:10]))

print("\nIndex-to-Word Mapping (First 10 indices):")
print(dict(list(index_to_word.items())[:10]))



Dataset with Tokenized and Indexed Emails:
                                               email  \
0  [little, confused, running, procmail, gateway,...   
1  [professional, effective, debt, collection, se...   
2  [copy, dvd, burner, dvd, wizard, pro, technolo...   
3  [thu, sep, number, numberpm, number, axel, thi...   
4  [reminds, cheney, debates, declared, wealth, p...   

                                       email_indexed  
0  [7596, 14016, 21706, 1450, 11619, 9932, 16350,...  
1  [2554, 22666, 20683, 8753, 13897, 23410, 17158...  
2  [24793, 15806, 14299, 15806, 19378, 21902, 45,...  
3  [19193, 18851, 18219, 26221, 18219, 4555, 2369...  
4  [24330, 12, 1761, 26010, 728, 26840, 17029, 24...  

Word-to-Index Mapping (First 10 words):
{'invaders': 0, 'barrett': 1, 'costly': 2, 'cobalt': 3, 'qed': 4, 'highlight': 5, 'pheasants': 6, 'mort': 7, 'finessed': 8, 'wacky': 9}

Index-to-Word Mapping (First 10 indices):
{0: 'invaders', 1: 'barrett', 2: 'costly', 3: 'cobalt', 4: 'qed', 5: 

In [5]:
import random

def create_lookup_tables(words):
    """
    Build the two vocabulary lookup tables for a sequence of words.

    Args:
        words (iterable): Hashable tokens; repeats are allowed.

    Returns:
        tuple: (vocab_to_int, int_to_vocab). Each distinct token gets an
        id in 0..V-1, assigned in the iteration order of set(words).
    """
    vocab_to_int = {}
    for position, token in enumerate(set(words)):
        vocab_to_int[token] = position

    # Invert the forward table to get id -> token
    int_to_vocab = dict(zip(vocab_to_int.values(), vocab_to_int.keys()))

    return vocab_to_int, int_to_vocab

# Flatten the tokenised e-mails of df_balanced into one continuous word stream.
# NOTE(review): e-mail boundaries are lost here, so context windows built from
# this stream can pair the last words of one e-mail with the first of the next.
all_words = [word for email in df_balanced['email'] for word in email]

# Step 1: Create lookup tables (word -> integer id, and the inverse)
vocab_to_int, int_to_vocab = create_lookup_tables(all_words)

# Step 2: Encode the word stream as integer ids
int_words = [vocab_to_int[word] for word in all_words]

# Step 3: Define the function to generate target-context pairs
def generate_target_context_pairs(words, window_size=2):
    """
    Build (target, context) pairs for every position of a word sequence.

    For each position, get_target is called once to obtain the neighbouring
    words, and one pair is emitted per neighbour.

    Args:
        words (list): Sequence of words (or word ids).
        window_size (int): Maximum window radius forwarded to get_target.

    Returns:
        list: (target, context) tuples, ordered by target position.
    """
    return [
        (center, neighbour)
        for position, center in enumerate(words)
        for neighbour in get_target(words, position, window_size)
    ]

def get_target(words, idx, window_size=2):
    """
    Return the context words around position idx.

    A radius R is drawn uniformly from 1..window_size, and the words at
    positions idx-R .. idx+R (clamped to the sequence bounds) are returned,
    leaving out the word at idx itself.

    Args:
        words (list): Sequence of words (or word ids).
        idx (int): Position of the centre word.
        window_size (int): Largest radius that may be drawn.

    Returns:
        list: Context words in sequence order.
    """
    radius = random.randint(1, window_size)

    lo = max(0, idx - radius)                 # never before the first element
    hi = min(len(words), idx + radius + 1)    # never past the last element

    neighbours = []
    for position in range(lo, hi):
        if position == idx:
            continue  # the centre word is not its own context
        neighbours.append(words[position])

    return neighbours

# Step 4: Generate target-context pairs over the whole integer-encoded stream
window_size = 2  # Maximum window radius; the actual radius is drawn per position
target_positive_pairs = generate_target_context_pairs(int_words, window_size)

# Step 5: Map the integer pairs back to words so they can be eyeballed (optional)
word_pairs = [(int_to_vocab[target], int_to_vocab[context]) for target, context in target_positive_pairs]

# Show the first 20 pairs in both representations
print(f"First 20 target-context pairs (as integers): {target_positive_pairs[:20]}")
print(f"First 20 target-context pairs (as words): {word_pairs[:20]}")


First 20 target-context pairs (as integers): [(7596, 14016), (14016, 7596), (14016, 21706), (14016, 1450), (21706, 14016), (21706, 1450), (1450, 14016), (1450, 21706), (1450, 11619), (1450, 9932), (11619, 21706), (11619, 1450), (11619, 9932), (11619, 16350), (9932, 1450), (9932, 11619), (9932, 16350), (9932, 6068), (16350, 9932), (16350, 6068)]
First 20 target-context pairs (as words): [('little', 'confused'), ('confused', 'little'), ('confused', 'running'), ('confused', 'procmail'), ('running', 'confused'), ('running', 'procmail'), ('procmail', 'confused'), ('procmail', 'running'), ('procmail', 'gateway'), ('procmail', 'sits'), ('gateway', 'running'), ('gateway', 'procmail'), ('gateway', 'sits'), ('gateway', 'external'), ('sits', 'procmail'), ('sits', 'gateway'), ('sits', 'external'), ('sits', 'sendmail'), ('external', 'sits'), ('external', 'sendmail')]


In [8]:
import random

def _draw_negative_indices(target, context, vocab_size, num_negatives):
    """Draw num_negatives ids from 0..vocab_size-1 that are neither target nor in context."""
    excluded = set(context)
    excluded.add(target)

    # Without this guard the rejection loop below would never terminate when
    # every id of the vocabulary is excluded.
    blocked = sum(1 for index in excluded if 0 <= index < vocab_size)
    if num_negatives > 0 and blocked >= vocab_size:
        raise ValueError(
            "Cannot draw negative samples: every index in the vocabulary is excluded"
        )

    negatives = []
    while len(negatives) < num_negatives:
        candidate = random.randrange(vocab_size)
        if candidate not in excluded:
            negatives.append(candidate)
    return negatives


def generate_target_negative_pairs(words, window_size, vocab_size, num_negatives=1):
    """
    Build (target, negative) pairs for every position of a word-id sequence.

    For each position the context is obtained from get_target, then
    num_negatives ids are drawn uniformly (with replacement) from the
    vocabulary, rejecting the target itself and its context ids.

    The sampling is done by a private helper in this cell, so the function
    no longer depends on a sampler that is only defined in a later cell and
    the notebook can be run top to bottom on a fresh kernel.

    Args:
        words (list): Sequence of integer word ids.
        window_size (int): Maximum window radius forwarded to get_target.
        vocab_size (int): Number of ids in the vocabulary (ids are 0..vocab_size-1).
        num_negatives (int): Negative samples drawn per position.

    Returns:
        list: (target, negative) tuples, ordered by target position.

    Raises:
        ValueError: If negatives are requested but no id is eligible.
    """
    pairs = []

    for idx, target in enumerate(words):
        context_words = get_target(words, idx, window_size)
        for negative_sample in _draw_negative_indices(target, context_words, vocab_size, num_negatives):
            pairs.append((target, negative_sample))

    return pairs

# Step 6: Generate target-negative pairs
# NOTE(review): in the cells visible here, training builds its own negatives
# from target_positive_pairs; target_negative_pairs is only printed below.
num_negatives = 2  # Number of negative samples per target word
vocab_size = len(vocab_to_int)
target_negative_pairs = generate_target_negative_pairs(int_words, window_size, vocab_size, num_negatives)

# Step 7: Map the integer pairs back to words so they can be eyeballed (optional)
negative_word_pairs = [(int_to_vocab[target], int_to_vocab[negative]) for target, negative in target_negative_pairs]

# Show the first 20 pairs in both representations
print(f"First 20 target-negative pairs (as integers): {target_negative_pairs[:20]}")
print(f"First 20 target-negative pairs (as words): {negative_word_pairs[:20]}")


First 20 target-negative pairs (as integers): [(7596, 21505), (7596, 15826), (14016, 21152), (14016, 7952), (21706, 25470), (21706, 13895), (1450, 20697), (1450, 16342), (11619, 28788), (11619, 10125), (9932, 416), (9932, 18827), (16350, 25672), (16350, 14118), (6068, 21293), (6068, 9227), (18662, 15534), (18662, 22518), (16711, 27131), (16711, 9492)]
First 20 target-negative pairs (as words): [('little', 'marketer'), ('little', 'fallacies'), ('confused', 'vipul'), ('confused', 'zstnumbernl'), ('running', 'wagfm'), ('running', 'soundtracks'), ('procmail', 'redesign'), ('procmail', 'taller'), ('gateway', 'routines'), ('gateway', 'restoring'), ('sits', 'caller'), ('sits', 'experimenting'), ('external', 'slowdown'), ('external', 'aussi'), ('sendmail', 'modest'), ('sendmail', 'matrise'), ('box', 'kmart'), ('box', 'hypocrite'), ('internal', 'shelf'), ('internal', 'rockville')]


In [7]:
import numpy as np

# Step 1: Define vocabulary size and embedding size
vocab_size = len(vocab_to_int)  # One embedding row per id in vocab_to_int
embedding_size = 10  # Number of dimensions per embedding vector

# Step 2: Initialize input and output embedding matrices
# Both are (vocab_size, embedding_size) standard-normal draws scaled by
# sqrt(1 / embedding_size). NOTE(review): no random seed is set in this cell,
# so the initial values differ from run to run.

input_embeddings = np.random.randn(vocab_size, embedding_size) * np.sqrt(1.0 / embedding_size)
output_embeddings = np.random.randn(vocab_size, embedding_size) * np.sqrt(1.0 / embedding_size)


# Step 3: Map each word to its input and output embedding rows.
# Row `idx` of each matrix belongs to the word whose id in vocab_to_int is `idx`.
word_embeddings = {
    word: {
        "input_embedding": input_embeddings[idx],
        "output_embedding": output_embeddings[idx]
    }
    for word, idx in vocab_to_int.items()
}

# Step 4: Display the (still untrained) embeddings of the first 10 words
for word, embeddings in list(word_embeddings.items())[:10]:  # Display first 10 words for brevity
    print(f"Word: {word}")
    print(f"  Input Embedding: {embeddings['input_embedding']}")
    print(f"  Output Embedding: {embeddings['output_embedding']}")
    print()


Word: invaders
  Input Embedding: [-0.1239307  -0.05769856 -0.16550575 -0.56098908  0.0765693  -0.34209149
 -0.27994673  0.13877545  0.14631424 -0.0744798 ]
  Output Embedding: [-0.22807688 -0.2117064  -0.35187176  0.29116289  0.4408401  -0.46096212
 -0.25081615  0.01564977 -0.44700113 -0.02784587]

Word: barrett
  Input Embedding: [-0.07625972  0.46759018  0.16753663 -0.54241619  0.01558911 -0.06017789
  0.3319545  -0.09089748 -0.28451984 -0.49295937]
  Output Embedding: [-0.02050714  0.1911117   0.45352963  0.14200682  0.53828197 -0.02665664
  0.22704944  0.17383509  0.41068612  0.18189175]

Word: costly
  Input Embedding: [ 0.03818918  0.06494541 -0.02471297 -0.49083691 -0.31702135  0.13251529
  0.32333804 -0.23242308 -0.37813769 -0.04063725]
  Output Embedding: [ 0.76582878  0.08553003 -0.37483804 -0.42702389  0.28187911  0.07527064
 -0.28099755  0.26458772 -0.39312731  0.06565161]

Word: cobalt
  Input Embedding: [ 0.22215819 -0.35540817 -0.27475739  0.16592222 -0.1769304   0.2315

In [9]:
import numpy as np

# Sigmoid function
def sigmoid(x):
    """
    Numerically stable logistic function 1 / (1 + exp(-x)).

    Computed as exp(-log(1 + exp(-x))) through np.logaddexp, which avoids the
    overflow that np.exp(-x) hits for large negative x.

    Args:
        x (float or ndarray): Input score(s).

    Returns:
        float or ndarray: Value(s) in [0, 1], same shape as x.
    """
    return np.exp(-np.logaddexp(0, -x))

# Binary cross-entropy loss function
def binary_cross_entropy_loss(target_embedding, context_embedding, negative_embeddings):
    """
    Negative-sampling loss for one positive pair and its negative samples.

    The loss is -log(sigmoid(t . c)) - sum_n log(1 - sigmoid(t . n)), where
    every probability is clipped to [1e-10, 1 - 1e-10] before the logarithm.

    Args:
        target_embedding (ndarray): Embedding of the target word (1D array).
        context_embedding (ndarray): Embedding of the context word (1D array).
        negative_embeddings (ndarray): Embeddings of negative samples (2D array,
            one row per negative).

    Returns:
        float: Loss value.
    """
    eps = 1e-10

    # Probability assigned to the true (target, context) pair
    pos_prob = sigmoid(np.dot(target_embedding, context_embedding))
    pos_term = np.log(np.clip(pos_prob, eps, 1 - eps))

    # Probability of rejecting each negative sample
    neg_prob = sigmoid(np.dot(negative_embeddings, target_embedding))
    neg_term = np.log(np.clip(1 - neg_prob, eps, 1 - eps))

    return -pos_term + -np.sum(neg_term)

# Gradient computation
def compute_gradients(target_embedding, context_embedding, negative_embeddings):
    """
    Gradients of the negative-sampling loss for one training sample.

    Args:
        target_embedding (ndarray): Embedding of the target word (1D array).
        context_embedding (ndarray): Embedding of the context word (1D array).
        negative_embeddings (ndarray): Embeddings of negative samples (2D array,
            one row per negative).

    Returns:
        tuple: (grad_target, grad_context, grad_negatives), where grad_negatives
        has one row per negative sample.
    """
    # Scalar error on the positive pair: sigmoid(score) - 1
    pos_coeff = sigmoid(np.dot(target_embedding, context_embedding)) - 1

    # One error per negative sample: sigmoid(score)
    neg_coeffs = sigmoid(np.dot(negative_embeddings, target_embedding))

    grad_context = pos_coeff * target_embedding
    grad_negatives = np.outer(neg_coeffs, target_embedding)
    # The target receives contributions from the context and from every negative
    grad_target = pos_coeff * context_embedding + np.dot(neg_coeffs, negative_embeddings)

    return grad_target, grad_context, grad_negatives

def train_word2vec(input_embeddings, output_embeddings, target_context_pairs, learning_rate=0.001, num_epochs=10, num_negatives=10):
    """
    Train word embeddings using skip-gram negative sampling (plain SGD).

    Both matrices are updated in place, one sample at a time. Every gradient
    entry is clipped element-wise to [-5, 5] before the update. The summed
    loss of each epoch is printed.

    Args:
        input_embeddings (ndarray): Input embedding matrix, one row per word id.
        output_embeddings (ndarray): Output embedding matrix, one row per word id.
        target_context_pairs (list): List of (target, context, negatives) samples,
            where negatives is a list of word ids.
        learning_rate (float): Learning rate for updates.
        num_epochs (int): Number of training epochs.
        num_negatives (int): Unused; kept for backward compatibility. The number
            of negatives is whatever each sample carries.

    Returns:
        tuple: The same (input_embeddings, output_embeddings) arrays, updated.
    """
    clip_value = 5.0  # element-wise bound applied to every gradient entry

    for epoch in range(num_epochs):
        total_loss = 0
        for target, context, negatives in target_context_pairs:
            # Explicit integer index array, so an empty negatives list also works
            negative_idx = np.asarray(negatives, dtype=np.intp)

            target_embedding = input_embeddings[target]
            context_embedding = output_embeddings[context]
            negative_embeddings = output_embeddings[negative_idx]

            total_loss += binary_cross_entropy_loss(target_embedding, context_embedding, negative_embeddings)

            grad_target, grad_context, grad_negatives = compute_gradients(
                target_embedding, context_embedding, negative_embeddings
            )

            # Clip gradients for stability
            grad_target = np.clip(grad_target, -clip_value, clip_value)
            grad_context = np.clip(grad_context, -clip_value, clip_value)
            grad_negatives = np.clip(grad_negatives, -clip_value, clip_value)

            input_embeddings[target] -= learning_rate * grad_target
            output_embeddings[context] -= learning_rate * grad_context
            # A negative id can occur more than once in one sample. A fancy-indexed
            # `-=` would apply only one of the repeated updates; ufunc.at
            # accumulates all of them.
            np.subtract.at(output_embeddings, negative_idx, learning_rate * grad_negatives)

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}")

    return input_embeddings, output_embeddings

# Generate negative samples
def generate_negative_samples(target, context, vocab_size, num_negatives):
    """
    Generate negative samples for a given target-context pair.

    Ids are drawn uniformly from 0..vocab_size-1 with replacement; draws equal
    to the target or to any context id are rejected and redrawn. The same id
    may therefore appear more than once in the result.

    Args:
        target (int): Target word index.
        context (list): Context word indices.
        vocab_size (int): Size of the vocabulary.
        num_negatives (int): Number of negative samples.

    Returns:
        list: Indices of negative samples.

    Raises:
        ValueError: If negatives are requested but every vocabulary id is
            excluded (the rejection loop could otherwise never terminate).
    """
    # Set membership keeps each rejection test O(1), whatever the context size
    excluded = set(context)
    excluded.add(target)

    blocked = sum(1 for index in excluded if 0 <= index < vocab_size)
    if num_negatives > 0 and blocked >= vocab_size:
        raise ValueError(
            "Cannot draw negative samples: every index in the vocabulary is excluded"
        )

    negatives = []
    while len(negatives) < num_negatives:
        negative = np.random.randint(0, vocab_size)
        if negative not in excluded:
            negatives.append(negative)
    return negatives

# Prepare target-context-negative samples: one (target, context, negatives)
# triple per positive pair, with `num_negatives` freshly drawn negative ids
# that exclude the target and that pair's context word.
target_context_pairs = [
    (target, context, generate_negative_samples(target, [context], vocab_size, num_negatives))
    for target, context in target_positive_pairs
]

# Train embeddings. The call overrides the function's default learning rate
# with 0.1, and the result is bound back to the same two names.
input_embeddings, output_embeddings = train_word2vec(
    input_embeddings, output_embeddings, target_context_pairs, learning_rate=0.1, num_epochs=10
)


Epoch 1/10, Loss: 2298468.68246611
Epoch 2/10, Loss: 1720146.7123337728
Epoch 3/10, Loss: 1572834.5771514613
Epoch 4/10, Loss: 1491811.0746487412
Epoch 5/10, Loss: 1440082.3082391026
Epoch 6/10, Loss: 1403639.7626430301
Epoch 7/10, Loss: 1376270.9652690678
Epoch 8/10, Loss: 1354801.1189488587
Epoch 9/10, Loss: 1337426.4818242886
Epoch 10/10, Loss: 1323030.9029177143


In [12]:
import numpy as np

# Sigmoid function
def sigmoid(x):
    """
    Numerically stable logistic function 1 / (1 + exp(-x)).

    Computed as exp(-log(1 + exp(-x))) through np.logaddexp, which avoids the
    overflow that np.exp(-x) hits for large negative x.

    Args:
        x (float or ndarray): Input score(s).

    Returns:
        float or ndarray: Value(s) in [0, 1], same shape as x.
    """
    return np.exp(-np.logaddexp(0, -x))

# Binary cross-entropy loss using logistic regression
def logistic_loss(target_embedding, context_embedding, negative_embeddings):
    """
    Compute the binary cross-entropy loss for logistic regression.

    The loss is -log(sigmoid(t . c)) - sum_n log(1 - sigmoid(t . n)). It is
    evaluated in the equivalent form log(1 + exp(-t . c)) + sum_n log(1 + exp(t . n))
    through np.logaddexp, so extreme scores give a large finite loss instead
    of log(0) = inf or an overflow warning.

    Args:
        target_embedding (ndarray): Embedding of the target word (1D array).
        context_embedding (ndarray): Embedding of the context word (1D array).
        negative_embeddings (ndarray): Embeddings of negative samples (2D array,
            one row per negative).

    Returns:
        float: Loss value.
    """
    # Positive pair: -log(sigmoid(s)) == log(1 + exp(-s))
    positive_score = np.dot(target_embedding, context_embedding)
    positive_loss = np.logaddexp(0, -positive_score)

    # Negative samples: -log(1 - sigmoid(s)) == log(1 + exp(s))
    negative_scores = np.dot(negative_embeddings, target_embedding)
    negative_loss = np.sum(np.logaddexp(0, negative_scores))

    return positive_loss + negative_loss

# Gradients computation for logistic regression
def compute_logistic_gradients(target_embedding, context_embedding, negative_embeddings):
    """
    Gradients of the logistic (negative-sampling) loss for one training sample.

    Args:
        target_embedding (ndarray): Embedding of the target word (1D array).
        context_embedding (ndarray): Embedding of the context word (1D array).
        negative_embeddings (ndarray): Embeddings of negative samples (2D array,
            one row per negative).

    Returns:
        tuple: (grad_target, grad_context, grad_negatives), where grad_negatives
        has one row per negative sample.
    """
    # Error terms: (sigmoid - 1) for the true pair, sigmoid for each negative
    score_pos = np.dot(target_embedding, context_embedding)
    err_pos = sigmoid(score_pos) - 1
    scores_neg = np.dot(negative_embeddings, target_embedding)
    err_neg = sigmoid(scores_neg)

    # Context and negative rows are each pushed along the target direction
    wrt_context = err_pos * target_embedding
    wrt_negatives = np.outer(err_neg, target_embedding)

    # The target collects the context term plus the weighted sum of negatives
    wrt_target = err_pos * context_embedding + np.dot(err_neg, negative_embeddings)

    return wrt_target, wrt_context, wrt_negatives

# Training function using logistic regression
def train_word2vec_logistic(input_embeddings, output_embeddings, target_context_pairs, learning_rate=0.001, num_epochs=10):
    """
    Train word embeddings using logistic regression (plain SGD, no clipping).

    Both matrices are updated in place, one sample at a time, and the summed
    loss of each epoch is printed.

    Args:
        input_embeddings (ndarray): Input embedding matrix, one row per word id.
        output_embeddings (ndarray): Output embedding matrix, one row per word id.
        target_context_pairs (list): List of (target, context, negatives) samples,
            where negatives is a list of word ids.
        learning_rate (float): Learning rate for updates.
        num_epochs (int): Number of training epochs.

    Returns:
        tuple: The same (input_embeddings, output_embeddings) arrays, updated.
    """
    for epoch in range(num_epochs):
        total_loss = 0
        for target, context, negatives in target_context_pairs:
            # Explicit integer index array, so an empty negatives list also works
            negative_idx = np.asarray(negatives, dtype=np.intp)

            target_embedding = input_embeddings[target]
            context_embedding = output_embeddings[context]
            negative_embeddings = output_embeddings[negative_idx]

            # Compute loss using logistic regression
            total_loss += logistic_loss(target_embedding, context_embedding, negative_embeddings)

            # Compute gradients using logistic regression
            grad_target, grad_context, grad_negatives = compute_logistic_gradients(
                target_embedding, context_embedding, negative_embeddings
            )

            # Update embeddings
            input_embeddings[target] -= learning_rate * grad_target
            output_embeddings[context] -= learning_rate * grad_context
            # A negative id can occur more than once in one sample. A fancy-indexed
            # `-=` would apply only one of the repeated updates; ufunc.at
            # accumulates all of them.
            np.subtract.at(output_embeddings, negative_idx, learning_rate * grad_negatives)

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}")

    return input_embeddings, output_embeddings


# Prepare target-context-negative samples: one (target, context, negatives)
# triple per positive pair, with freshly drawn negative ids.
target_context_pairs = [
    (target, context, generate_negative_samples(target, [context], vocab_size, num_negatives))
    for target, context in target_positive_pairs
]

# Train embeddings with the logistic-regression trainer defined in this cell.
# (The cell previously called train_word2vec from the earlier cell, which left
# every function defined here unused.)
input_embeddings, output_embeddings = train_word2vec_logistic(
    input_embeddings, output_embeddings, target_context_pairs, learning_rate=0.1, num_epochs=10
)



Epoch 1/10, Loss: 1432452.025717861
Epoch 2/10, Loss: 1365883.0506653918
Epoch 3/10, Loss: 1337835.1625172033
Epoch 4/10, Loss: 1319078.585979959
Epoch 5/10, Loss: 1304924.2288944614
Epoch 6/10, Loss: 1293592.3668012808
Epoch 7/10, Loss: 1284197.3004508128
Epoch 8/10, Loss: 1276227.8710016229
Epoch 9/10, Loss: 1269352.962823742
Epoch 10/10, Loss: 1263341.7671746141


In [None]:
# Compute average embedding for an email
def compute_email_embedding(email_words, word_to_index, input_embeddings):
    """
    Average the input embeddings of an e-mail's words.

    Words missing from word_to_index are skipped. If no word is known
    (or the e-mail is empty), a zero vector of the embedding width is returned.

    Args:
        email_words (list): Tokens of one e-mail.
        word_to_index (dict): Mapping word -> row of input_embeddings.
        input_embeddings (ndarray): 2D embedding matrix.

    Returns:
        ndarray: 1D vector with input_embeddings.shape[1] entries.
    """
    known_rows = [word_to_index[word] for word in email_words if word in word_to_index]

    if not known_rows:
        # Nothing to average: fall back to the zero vector
        return np.zeros(input_embeddings.shape[1])

    return np.mean([input_embeddings[row] for row in known_rows], axis=0)

# The rows of input_embeddings were assigned through vocab_to_int (that mapping
# encoded the training stream), so validation and lookup must use it too.
# word_to_index was built from a separate set() and is not guaranteed to number
# the words identically.
assert len(vocab_to_int) == input_embeddings.shape[0], "Mismatch between vocab size and embeddings"

# Debugging: Check for missing or invalid entries
for i, email_words in enumerate(df_balanced['email']):
    if not isinstance(email_words, list):
        print(f"Invalid entry at index {i}: {email_words}")
        df_balanced.at[i, 'email'] = []  # Fix invalid entries

# Compute one averaged embedding per e-mail
email_embeddings = []
for email_words in df_balanced['email']:
    avg_embedding = compute_email_embedding(email_words, vocab_to_int, input_embeddings)
    email_embeddings.append(avg_embedding)

# Add average embeddings back to the dataframe
df_balanced['email_embedding'] = email_embeddings

# Display the result (pandas truncates long frames when printing)
print(df_balanced[['email', 'email_embedding']])


                                               email  \
0  [date, wed, number, aug, number, chris, garrig...   
1  [martin, posted, tassos, papadopoulos, greek, ...   
2  [famous, ebay, marketing, course, learn, sell,...   
3  [man, threatens, explosion, moscow, thursday, ...   
4  [hello, chinese, traditional, number, mail, em...   
5  [famous, ebay, marketing, course, learn, sell,...   

                                     email_embedding  
0  [-0.45911303796340724, -0.00519616682674017, 0...  
1  [-0.4198114338272854, 0.2642645661129075, 0.56...  
2  [-1.1302753115775317, 0.021082054751102457, 0....  
3  [-0.0396018838966934, 0.3050853111678651, 0.29...  
4  [-0.47831360596207484, -0.012010683349433511, ...  
5  [-1.1302753115775317, 0.021082054751102457, 0....  


In [None]:
# Compute word-level embeddings for all words in emails
def compute_word_embeddings(email_words, word_to_index, input_embeddings):
    """
    Look up the embedding of every known word of an e-mail.

    Args:
        email_words (list): List of words in the email.
        word_to_index (dict): Mapping from words to their row in the embedding matrix.
        input_embeddings (ndarray): Word embedding matrix.

    Returns:
        list: (word, embedding) pairs in e-mail order; words that are not
        in word_to_index are skipped.
    """
    pairs = []
    for word in email_words:
        if word not in word_to_index:
            continue  # unknown word: nothing to look up
        pairs.append((word, input_embeddings[word_to_index[word]]))
    return pairs

# The rows of input_embeddings were assigned through vocab_to_int (that mapping
# encoded the training stream), so validation and lookup must use it too.
assert len(vocab_to_int) == input_embeddings.shape[0], "Mismatch between vocab size and embeddings"

# Debugging: Check for missing or invalid entries in the 'email' column
for i, email_words in enumerate(df_balanced['email']):
    if not isinstance(email_words, list):
        print(f"Invalid entry at index {i}: {email_words}")
        df_balanced.at[i, 'email'] = []  # Fix invalid entries

# Compute word-level embeddings for all emails. The result is appended directly
# so the `word_embeddings` dictionary from the initialisation cell is not overwritten.
word_level_embeddings = []
for email_words in df_balanced['email']:
    word_level_embeddings.append(
        compute_word_embeddings(email_words, vocab_to_int, input_embeddings)
    )

# Add word-level embeddings back to the dataframe
df_balanced['word_embeddings'] = word_level_embeddings

# Display a few examples only: printing every word of every e-mail floods the output
for idx, row in df_balanced.head(5).iterrows():
    print(f"Email {idx}:")
    for word, embedding in row['word_embeddings']:
        print(f"{word},{embedding[:10]}")  # Show at most the first 10 dimensions
    print("-" * 30)


Email 0:
date,[-0.505616   -0.49504978  0.2019807  -0.54439907  0.00436119  0.47132624
 -0.57735065 -0.40342266 -0.12780292 -0.14067103]
wed,[-1.07247895  0.22661092  0.55047473 -0.47228384 -0.47411944  0.15178022
  0.15362575 -0.1790877  -0.11879345 -0.39602071]
number,[-0.99577303  0.16848591  2.31647472  0.35143189 -0.8973013  -0.15632074
  1.28901751  0.82778856 -0.56283815 -1.12739626]
aug,[-0.48845706 -0.5379804   0.54013707  0.02954859 -0.94868713  0.30357584
  0.64918871 -0.30705808 -0.22165795 -0.46877713]
number,[-0.99577303  0.16848591  2.31647472  0.35143189 -0.8973013  -0.15632074
  1.28901751  0.82778856 -0.56283815 -1.12739626]
chris,[-0.61611672  0.63778487  1.02983094 -0.1246203   0.22337898  0.28262449
 -0.03418504 -0.72261612  0.67704375 -0.30222511]
garrigues,[ 0.79722469  0.22629509  0.78138889  0.29329516 -0.25001451 -0.90040349
  0.99028823  0.81014726  0.61894276  0.13036146]
cwg,[ 0.84596684 -0.32152066  0.99691684 -0.4620348  -0.72420695 -0.26332673
  0.356018

In [None]:
# Display the first few rows of the DataFrame to check how the embeddings are stored.
# A bare expression in the middle of a cell is not rendered, so print it explicitly.
print(df_balanced[['email', 'word_embeddings']].head())

# Alternatively, print out the details for a specific email to see word-level embeddings
sample_index = 1
sample_email = df_balanced.iloc[sample_index]

# Displaying the email and its corresponding word embeddings
print("Email:", sample_email['email'])
print("\nWord-level embeddings:")
for word, embedding in sample_email['word_embeddings']:
    print(f"{word}: {embedding[:5]}")  # Display the first 5 dimensions of each embedding for brevity


Email: ['martin', 'posted', 'tassos', 'papadopoulos', 'greek', 'sculptor', 'behind', 'plan', 'judged', 'limestone', 'mount', 'kerdylio', 'number', 'miles', 'east', 'salonika', 'far', 'mount', 'athos', 'monastic', 'community', 'ideal', 'patriotic', 'sculpture', 'well', 'alexander', 'granite', 'features', 'number', 'high', 'number', 'wide', 'museum', 'restored', 'amphitheatre', 'car', 'park', 'admiring', 'crowds', 'planned', 'mountain', 'limestone', 'granite', 'limestone', 'weather', 'pretty', 'fast', 'yahoo', 'groups', 'sponsor', 'number', 'dvds', 'free', 'join', 'url', 'unsubscribe', 'group', 'send', 'email', 'forteana', 'unsubscribe', 'url', 'use', 'yahoo', 'groups', 'subject', 'url']

Word-level embeddings:
martin: [-0.69449536  0.62901929 -1.18663484  0.33166925 -0.6770544 ]
posted: [-0.64239538  0.05457745 -0.64726084  0.06615058 -0.1151798 ]
tassos: [-0.27451205  0.29956435  0.24833203 -1.50349818  0.14563529]
papadopoulos: [-0.09246871  0.4168348  -0.16695189  0.05913145  0.45848

In [None]:
def display_target_context_pairs(target_context_pairs, index_to_word, num_examples=10):
    """
    Print a two-column table of target words and their context words.

    Args:
        target_context_pairs (list): List of tuples (target, context, negatives);
            the negatives are ignored here.
        index_to_word (dict): Mapping from word indices to words.
        num_examples (int): Maximum number of pairs to print. If more pairs
            exist, a final line reports how many were left out.
    """
    header = f"{'Target Word':<15} | Context Word"
    print(header)
    print("-" * 30)

    for target, context, _negatives in target_context_pairs[:num_examples]:
        print(f"{index_to_word[target]:<15} | {index_to_word[context]}")

    remaining = len(target_context_pairs) - num_examples
    if remaining > 0:
        print(f"...and {remaining} more pairs.")


# The ids inside target_context_pairs were assigned by vocab_to_int, so they are
# decoded with its inverse, int_to_vocab, rather than the separately built index_to_word.
display_target_context_pairs(target_context_pairs, int_to_vocab, num_examples=20)


Target Word     | Context Word
------------------------------
date            | wed
wed             | date
wed             | number
wed             | aug
number          | date
number          | wed
number          | aug
number          | number
aug             | number
aug             | number
number          | aug
number          | chris
chris           | number
chris           | garrigues
garrigues       | number
garrigues       | chris
garrigues       | cwg
garrigues       | dated
cwg             | garrigues
cwg             | dated
...and 3738 more pairs.


Saving the word-level embeddings to a JSON file

In [None]:
import json
import numpy as np

# Function to recursively convert numpy.ndarrays to lists (if necessary)
def convert_to_list(embedding):
    """
    Turn numpy arrays, including arrays nested inside lists, into plain lists.

    Args:
        embedding: An ndarray, a (possibly nested) list, or any other value.

    Returns:
        The ndarray as a list, a list with every item converted recursively,
        or the value itself unchanged when it is neither an ndarray nor a list.
    """
    if isinstance(embedding, np.ndarray):
        return embedding.tolist()
    if not isinstance(embedding, list):
        return embedding  # anything else (tuples, scalars, strings) passes through
    return list(map(convert_to_list, embedding))

# Convert the per-e-mail (word, embedding) pairs into JSON-serialisable values:
# each embedding goes through convert_to_list, the word is kept as is.
# Inside the comprehension, `email_embeddings` is the pair list of ONE e-mail.
word_embeddings_list = [
    [(word, convert_to_list(embedding)) for word, embedding in email_embeddings]
    for email_embeddings in df_balanced['word_embeddings']
]

# Save to a JSON file in the working directory.
# The (word, embedding) tuples are written as two-element JSON arrays.
with open('word_level_embeddings.json', 'w') as f:
    json.dump(word_embeddings_list, f)

# Optional check, kept disabled: print the first e-mail's embeddings
# print(word_embeddings_list[0])


[('date', [-0.5056159994507785, -0.4950497796284715, 0.201980702111272, -0.5443990664497075, 0.004361190097984462, 0.47132624211814667, -0.5773506547109141, -0.40342266053239584, -0.12780291996173454, -0.14067102767172018]), ('wed', [-1.0724789485427495, 0.22661091541265346, 0.5504747264700611, -0.4722838394498229, -0.47411943664296996, 0.15178022062642885, 0.15362575478075324, -0.17908769874343178, -0.11879345173231338, -0.3960207104324439]), ('number', [-0.9957730293084713, 0.16848591113541175, 2.3164747227037914, 0.3514318851662449, -0.8973013010764975, -0.15632073509642413, 1.2890175132504393, 0.8277885647375579, -0.5628381532066609, -1.1273962638290778]), ('aug', [-0.48845705686218127, -0.5379804027048165, 0.5401370739563766, 0.02954858822167389, -0.9486871325193977, 0.30357583565993657, 0.6491887090214171, -0.30705808408213714, -0.2216579491828364, -0.46877712730312465]), ('number', [-0.9957730293084713, 0.16848591113541175, 2.3164747227037914, 0.3514318851662449, -0.897301301076