**Word2Vec Implementation**
----------------------------------
__________________________________

1. [Data Preprocessing](#Data-Preprocessing)
2. [Word2Vec Implementation](#Word2Vec-Implementation)
3. [Data Augmentation (Optional)](#Data-Augmentation-(Optional))

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import re

## Data Preprocessing

In [None]:
# One-time setup: uncomment to download the NLTK stop-word list read by clearstopwords().
# nltk.download('stopwords')
# NOTE(review): `nltk` itself is not imported in the visible cells, and the
# lemmatizer presumably needs the 'wordnet' corpus as well — confirm before running.
# Shared lemmatizer instance used by lemmatizetext().
lemmatizer = WordNetLemmatizer()

# Lower-case every text in the corpus
def lowercase(corpus):
    """Return a new pandas Series with each string of `corpus` lower-cased."""
    lowered = corpus.str.lower()
    return lowered

# Characters deleted by removepunct(): ASCII punctuation (the apostrophe is
# not in the set) followed by the ten decimal digits.
punct_and_nums = '''.!"#$%&()*+,-./:;<=>?@[\\]^_{|}~`''' + '0123456789'
def removepunct(corpus):
    """Delete every character listed in `punct_and_nums` from each string of `corpus`."""
    # Mapping a character to None in a translation table removes it.
    deletion_table = str.maketrans(dict.fromkeys(punct_and_nums))

    def strip_chars(text):
        return text.translate(deletion_table)

    return corpus.apply(strip_chars)

# Remove stopwords from sequences
def clearstopwords(sequences):
    """Drop English stop words from every sequence.

    Args:
        sequences: A pandas Series, or any other iterable, of
            whitespace-separated strings.

    Returns:
        A pandas Series of the filtered strings, re-joined with single spaces.
        When `sequences` is a Series its index is preserved, so the result can
        be assigned back to the DataFrame it came from without the rows being
        re-aligned against a fresh 0..n-1 index. Other iterables get the
        default RangeIndex.
    """
    # A set gives O(1) membership tests instead of scanning the list per word.
    stopwords_ = set(stopwords.words('english'))

    def strip_stopwords(sequence):
        return " ".join(word for word in sequence.split() if word not in stopwords_)

    if isinstance(sequences, pd.Series):
        return sequences.apply(strip_stopwords)
    return pd.Series([strip_stopwords(sequence) for sequence in sequences])

def lemmatizetext(corpus):
    """Lemmatize each whitespace-separated token of every string in `corpus`.

    Uses the module-level `lemmatizer`; the lemmas are re-joined with single spaces.
    """
    def lemmatize_line(line):
        lemmas = map(lemmatizer.lemmatize, line.split())
        return ' '.join(lemmas)

    return corpus.apply(lemmatize_line)

def clearshortwords(corpus, min_len=3, max_len=17):
    """Keep only the words whose length lies in [min_len, max_len], bounds included.

    Each string of `corpus` is split on whitespace and the surviving words are
    re-joined with single spaces.
    """
    def keep_in_range(text):
        kept = []
        for token in text.split():
            if len(token) < min_len or len(token) > max_len:
                continue
            kept.append(token)
        return " ".join(kept)

    return corpus.apply(keep_in_range)

def clearnonascii(corpus):
    """Remove every character with a code point of 128 or above from each string."""
    def ascii_only(text):
        # Encoding with errors='ignore' silently drops anything outside ASCII.
        return text.encode('ascii', 'ignore').decode('ascii')

    return corpus.apply(ascii_only)

def remove_urls(corpus):
    """Delete URL-like text from each string of `corpus`.

    Every match of the pattern is removed: "http" or "www" followed by at
    least one non-whitespace character, up to the next whitespace.
    Surrounding whitespace is left in place.
    """
    url_pattern = re.compile(r'http\S+|www\S+|https\S+')

    def strip_urls(text):
        return url_pattern.sub('', text)

    return corpus.apply(strip_urls)

def drop_short(corpus, min_length=2):
    """Drop incomplete rows, then rows whose `email` has at most `min_length` words.

    `corpus` is a DataFrame with an `email` column of strings; rows containing
    any missing value are removed first.
    """
    def is_long_enough(text):
        return len(text.split()) > min_length

    complete_rows = corpus.dropna()
    keep_mask = complete_rows.email.apply(is_long_enough)
    return complete_rows[keep_mask]

def stats(corpus):
    """Tokenize `corpus` and build a word-to-index vocabulary.

    Args:
        corpus: Iterable of whitespace-separated strings.

    Returns:
        A `(vocab, tokens)` tuple. `tokens` is the flat list of all words in
        corpus order. `vocab` maps each distinct word to a unique index,
        assigned in order of first occurrence.
    """
    tokens = [word for sentence in corpus for word in sentence.split()]
    # dict.fromkeys de-duplicates while keeping first-seen order. Enumerating a
    # set instead would hand out indices that change between interpreter runs
    # (string hashing is randomized), making the vocabulary irreproducible.
    vocab = {word: idx for idx, word in enumerate(dict.fromkeys(tokens))}
    return vocab, tokens

# Collapse runs of whitespace into single spaces and trim both ends
def removespaces(corpus):
    """Normalize whitespace in each string of `corpus`.

    Leading/trailing whitespace is dropped and every internal run of
    whitespace becomes one space.
    """
    def squeeze(text):
        words = text.split()
        return " ".join(words)

    return corpus.apply(squeeze)
# Flatten a collection of strings into a single list of words
def flatten(corpus):
    """Return every whitespace-separated word of every string in `corpus`, in order."""
    words = []
    for text in corpus:
        words.extend(text.split())
    return words

# Main preprocessing pipeline
def preprocess(corpus):
    """Run the full text-cleaning pipeline over `corpus` and return the result.

    The steps run in the fixed order listed below; a progress line is printed
    after each one completes.
    """
    steps = (
        (lowercase, "Lowercased..."),
        (removepunct, "Punctuations Removed..."),
        (clearshortwords, "Short words Removed..."),
        (clearnonascii, "Non ascii chars Removed..."),
        (remove_urls, "URLs Removed..."),
        (removespaces, "Extra Spaces Removed..."),
        (clearstopwords, "Stopwords Removed..."),
        (lemmatizetext, "Lemmatized Text..."),
    )
    for step, message in steps:
        corpus = step(corpus)
        print(message)
    return corpus

In [None]:
# Load the raw dataset, then drop duplicate emails and rows with missing values.
file = pd.read_csv('dataset/spam_or_not_spam.csv.csv')
# reset_index gives the frame a contiguous 0..n-1 index. The assignment
# `file['email'] = sequences` below aligns on index labels, so if a
# preprocessing step returns a Series with a fresh default index, the gaps left
# by drop_duplicates()/dropna() would attach texts to the wrong rows (and
# leave NaN in others).
file = file.drop_duplicates(subset='email', keep='first').dropna().reset_index(drop=True)
sequences, labels = file['email'], file['label']
sequences = preprocess(sequences)
# Corpus statistics are computed on all preprocessed texts, before short ones are dropped.
vocab,tokens = stats(sequences)
file['email'] = sequences
file = drop_short(file)
print(len(tokens))
print(len(vocab))
print(len(file))
file.head()

Lowercased...
Punctuations Removed...
Short words Removed...
Non ascii chars Removed...
URLs Removed...
Extra Spaces Removed...
Stopwords Removed...
Lemmatized Text...
412737
26311
3038


Unnamed: 0,email,label
0,date wed number aug number number number numbe...,0
1,martin posted tasso papadopoulos greek sculpto...,0
2,man threatens explosion moscow thursday august...,0
3,klez virus die already prolific virus ever kle...,0
4,adding cream spaghetti carbonara effect pasta ...,0


In [None]:
# Write the preprocessed DataFrame to CSV without the index column.
# NOTE(review): the augmentation cell further down reads
# '/kaggle/input/email-sns/email_spam.csv', a different file name — confirm which file is intended.
file.to_csv("dataset/email_spam_augp.csv", index=False)

## Word2Vec Implementation

In [None]:
from collections import Counter
class Word2Vec():
    """Skip-gram Word2Vec with negative sampling, implemented with NumPy.

    Typical use: construct the model, call `fit(corpus)` with an iterable of
    whitespace-tokenized strings, then read vectors with `get_embedding(word)`.
    """

    def __init__(self, embedding_size=10, window_size=2, num_negative=5, learning_rate=0.01):
        """Store hyperparameters and create empty vocabulary/embedding placeholders.

        Args:
            embedding_size: Dimensionality of each word vector.
            window_size: Number of words on each side of the target used as context.
            num_negative: Negative samples drawn per (target, context) pair.
            learning_rate: Step size of each gradient update.
        """
        self.window_size = window_size
        self.num_negative = num_negative
        self.learning_rate = learning_rate
        self.embedding_size = embedding_size

        # Filled in by build_vocab().
        self.vocab_size = 0
        self.word_to_idx = {}
        self.idx_to_word = {}
        self.target_embeds = None
        self.context_embeds = None
        self.word_freqs = None
        self.sampling_weights = None

        # Lookup table: entry i holds sigmoid(i/100 - 5), i.e. the sigmoid
        # sampled on [-5, 5) in steps of 0.01.
        self.sigmoid_table = 1 / (1 + np.exp(-(np.arange(1000) / 100 - 5)))

    def fast_sigmoid(self, x):
        """Approximate sigmoid(x) by table lookup.

        Accepts a scalar or an array. Inputs outside [-5, 5) are clamped to the
        first/last table entry.
        """
        x = 100 * x + 500  # Map [-5, 5) onto table positions [0, 1000)
        x = np.clip(x, 0, 999).astype(int)
        return self.sigmoid_table[x]

    def build_vocab(self, sequences):
        """Build the vocabulary, word frequencies, sampling weights and embeddings.

        Args:
            sequences: Iterable of whitespace-separated strings.
        """
        all_words = ' '.join(sequences).split()
        word_counts = Counter(all_words)

        # Indices follow first-occurrence order (Counter preserves insertion order).
        self.word_to_idx = {word: idx for idx, word in enumerate(word_counts.keys())}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}
        self.vocab_size = len(word_counts)

        # Relative frequency of every word, indexed like the vocabulary.
        total_words = len(all_words)
        self.word_freqs = np.zeros(self.vocab_size)
        for word, idx in self.word_to_idx.items():
            self.word_freqs[idx] = word_counts[word] / total_words

        # Small uniform initialization, scaled down with the embedding size.
        scale = 0.5/self.embedding_size
        self.target_embeds = np.random.uniform(-scale, scale, (self.vocab_size, self.embedding_size))
        self.context_embeds = np.random.uniform(-scale, scale, (self.vocab_size, self.embedding_size))

        # Negative-sampling distribution: frequency ** 0.75, normalized to sum to 1.
        self.sampling_weights = np.power(self.word_freqs, 0.75)
        self.sampling_weights /= np.sum(self.sampling_weights)

    def negative_sampling(self, context_word):
        """Draw `num_negative` word indices from the smoothed frequency distribution.

        `context_word` is accepted for interface compatibility but is not used:
        the draw does not exclude any particular word.
        """
        return np.random.choice(
            self.vocab_size,
            size=self.num_negative,
            p=self.sampling_weights
        )

    def train_step(self, target_idx, context_idx):
        """Apply one SGD update for a (target, context) pair and return its loss.

        The loss is -log(sigmoid(t.c)) for the positive pair plus
        -sum(log(1 - sigmoid(t.n))) over the sampled negatives.
        """
        # Forward pass for the positive pair.
        target_embedding = self.target_embeds[target_idx]
        context_embedding = self.context_embeds[context_idx]
        pos_score = np.dot(target_embedding, context_embedding)
        pos_sigmoid = self.fast_sigmoid(pos_score)
        pos_loss = -np.log(pos_sigmoid + 1e-10)  # epsilon guards log(0)

        # Gradient of the positive term w.r.t. the score is (sigmoid - 1).
        pos_grad = pos_sigmoid - 1
        target_grad = pos_grad * context_embedding
        context_grad = pos_grad * target_embedding

        # Draw all negative samples at once and score them in one dot product.
        neg_indices = self.negative_sampling(target_idx)
        neg_embeddings = self.context_embeds[neg_indices]
        neg_scores = np.dot(target_embedding, neg_embeddings.T)
        neg_sigmoids = self.fast_sigmoid(neg_scores)
        neg_loss = -np.sum(np.log(1 - neg_sigmoids + 1e-10))

        # Negatives are updated one at a time; a repeated index therefore sees
        # the result of its earlier update.
        for i, neg_idx in enumerate(neg_indices):
            neg_grad = neg_sigmoids[i]
            target_grad += neg_grad * self.context_embeds[neg_idx]
            self.context_embeds[neg_idx] -= self.learning_rate * (neg_grad * target_embedding)

        # Apply the accumulated target gradient and the positive context gradient.
        self.target_embeds[target_idx] -= self.learning_rate * target_grad
        self.context_embeds[context_idx] -= self.learning_rate * context_grad

        return pos_loss + neg_loss

    def fit(self, corpus, epochs=5, batch_size=256):
        """Train the model on `corpus`, printing progress as it goes.

        Args:
            corpus: Iterable of whitespace-separated strings. The vocabulary is
                built from it if none exists yet.
            epochs: Number of passes over the corpus.
            batch_size: Number of sequences per reporting batch (updates are
                still applied pair by pair).

        Raises:
            ValueError: If `batch_size` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if not self.word_to_idx:
            self.build_vocab(corpus)

        # Convert every sequence to vocabulary indices, skipping unknown words.
        corpus_indices = [
            [self.word_to_idx[word] for word in seq.split() if word in self.word_to_idx]
            for seq in corpus
        ]
        # A sequence needs at least two words to form a (target, context) pair.
        corpus_indices = [seq for seq in corpus_indices if len(seq) >= 2]
        n_sequences = len(corpus_indices)

        # Ceiling division: the final, possibly smaller, batch must be trained
        # too (floor division silently skipped the trailing sequences).
        n_batches = -(-n_sequences // batch_size)
        report_every = max(1, n_batches // 10)

        for epoch in range(epochs):
            epoch_loss = 0
            print(f"\nEpoch {epoch+1}/{epochs}")

            for batch in range(n_batches):
                batch_sequences = corpus_indices[batch * batch_size:(batch + 1) * batch_size]
                batch_loss = 0

                for sequence in batch_sequences:
                    for i, target_idx in enumerate(sequence):
                        # Context window: up to window_size words on each side of the target.
                        start = max(0, i - self.window_size)
                        end = min(len(sequence), i + self.window_size + 1)
                        for context_idx in sequence[start:i] + sequence[i+1:end]:
                            batch_loss += self.train_step(target_idx, context_idx)

                epoch_loss += batch_loss
                if batch % report_every == 0:
                    # Average over the sequences actually in this batch, so the
                    # last (partial) batch is reported on the same scale.
                    print(f"Batch {batch+1}/{n_batches} | Loss: {batch_loss/len(batch_sequences):.4f}")

            # Guard against an empty training set instead of dividing by zero.
            average_loss = epoch_loss / n_sequences if n_sequences else 0.0
            print(f"Epoch {epoch+1} complete | Average Loss: {average_loss:.4f}")

    def get_embedding(self, word):
        """Return the target embedding of `word`, or None if it is not in the vocabulary."""
        if word not in self.word_to_idx: return None
        return self.target_embeds[self.word_to_idx[word]]
    
# Create the model: 10-dimensional embeddings, context window of 2 words on each side.
w2v = Word2Vec(embedding_size=10, window_size=2)

In [None]:
# Train for 5 epochs; fit() builds the vocabulary from `sequences` if the model has none yet.
# NOTE(review): `sequences` is presumably the preprocess() output from the cell above,
# which was not filtered by drop_short() (only `file` was) — confirm this is intended.
w2v.fit(sequences, epochs=5, batch_size=256)


Epoch 1/5
Batch 1/11 | Loss: 2072.5327
Batch 2/11 | Loss: 2502.3020
Batch 3/11 | Loss: 3685.1877
Batch 4/11 | Loss: 2408.3652
Batch 5/11 | Loss: 1332.7333
Batch 6/11 | Loss: 1282.1414
Batch 7/11 | Loss: 1442.9730
Batch 8/11 | Loss: 609.7445
Batch 9/11 | Loss: 508.1130
Batch 10/11 | Loss: 1283.5275
Batch 11/11 | Loss: 2566.7053
Epoch 1 complete | Average Loss: 1756.7064

Epoch 2/5
Batch 1/11 | Loss: 1482.3738
Batch 2/11 | Loss: 1817.2706
Batch 3/11 | Loss: 2736.1248
Batch 4/11 | Loss: 1846.1073
Batch 5/11 | Loss: 1003.8028
Batch 6/11 | Loss: 1016.0433
Batch 7/11 | Loss: 1179.7409
Batch 8/11 | Loss: 523.3938
Batch 9/11 | Loss: 443.7926
Batch 10/11 | Loss: 1095.4424
Batch 11/11 | Loss: 2228.1720
Epoch 2 complete | Average Loss: 1371.1846

Epoch 3/5
Batch 1/11 | Loss: 1333.8813
Batch 2/11 | Loss: 1636.2117
Batch 3/11 | Loss: 2487.8980
Batch 4/11 | Loss: 1697.7216
Batch 5/11 | Loss: 913.8961
Batch 6/11 | Loss: 934.6386
Batch 7/11 | Loss: 1099.5998
Batch 8/11 | Loss: 497.0571
Batch 9/11 | L

3888842.880670882

In [None]:
# Save a {word: embedding} dictionary for a trained model to disk
def save_embeddings(model, file_path='embeddings.npy'):
    """Write each vocabulary word's target embedding to `file_path` with np.save.

    The data is stored as a single dict mapping word -> embedding vector.
    """
    embeddings = {}
    for word, idx in model.word_to_idx.items():
        embeddings[word] = model.target_embeds[idx]
    np.save(file_path, embeddings)
    print(f"Embeddings saved to {file_path}")

# Load a {word: embedding} dictionary written by save_embeddings
def load_embeddings(file_path='embeddings.npy'):
    """Read `file_path` with np.load and return the stored dictionary.

    NOTE: allow_pickle=True unpickles the file's contents, which can execute
    arbitrary code — only load files from a trusted source.
    """
    stored = np.load(file_path, allow_pickle=True)
    return stored.item()

# Save the model's target embeddings to the default path, 'embeddings.npy'.
save_embeddings(w2v)

## Data Augmentation (Optional)

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import torch

# Load the processed corpus as plain NumPy arrays
file = pd.read_csv('/kaggle/input/email-sns/email_spam.csv')
sequences = file.email.values
labels = file.label.values

# Split the texts by class: label 1 is the minority class, label 0 the majority
minority_class_texts = [text for text, label in zip(sequences, labels) if label == 1]
majority_class_texts = [text for text, label in zip(sequences, labels) if label == 0]
# Run on the GPU when one is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(len(majority_class_texts), len(minority_class_texts))

In [None]:
# Load translation models for back-translation
src_model_name = "Helsinki-NLP/opus-mt-en-fr"  # English to French
tgt_model_name = "Helsinki-NLP/opus-mt-fr-en"  # French to English


def _load_marian(model_name):
    """Return (tokenizer, model) for `model_name`, with the model moved to `device`."""
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)
    return tokenizer, model


src_tokenizer, src_model = _load_marian(src_model_name)
tgt_tokenizer, tgt_model = _load_marian(tgt_model_name)

def back_translate(text):
    """Paraphrase `text` by translating it English -> French -> English.

    Uses the module-level Marian tokenizers and models. Tokenization runs with
    truncation and padding enabled, generation runs without gradient
    tracking, and only the first generated sequence of each pass is decoded.
    """
    def translate(source, tokenizer, model):
        encoded = tokenizer(source, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            generated = model.generate(**encoded)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    # First pass: English to French
    french_text = translate(text, src_tokenizer, src_model)
    # Second pass: French back to English
    return translate(french_text, tgt_tokenizer, tgt_model)
    
# Back-translate every minority-class text, collecting the paraphrases.
augmented_texts = []
total_texts = len(minority_class_texts)
# Report progress roughly every 10%. max(1, ...) keeps the modulo below from
# dividing by zero when there are fewer than 10 texts.
progress_step = max(1, total_texts // 10)
for i, text in enumerate(minority_class_texts):
    try:
        # Apply back-translation
        back_translated = back_translate(text)
    except Exception as e:
        # Best-effort augmentation: report the failure and skip this text.
        print(f"Error processing text: {text}, {e}")
    else:
        augmented_texts.append(back_translated)
    if i % progress_step == 0:
        print(f"{i}/{total_texts}")

In [None]:
# Every augmented text belongs to class 1
augmented_labels = [1 for _ in augmented_texts]

# Append the augmented samples after the original ones
all_texts = [*sequences.tolist(), *augmented_texts]
all_labels = [*labels.tolist(), *augmented_labels]
# Print the resulting class balance: non-zero labels first, then zero labels
pos = np.count_nonzero(all_labels)
neg = len(all_labels) - pos
print(pos, neg)

# Write the combined dataset to CSV without the index column
augmented_df = pd.DataFrame({"email": all_texts, "label": all_labels})
augmented_df.to_csv("email_spam_aug.csv", index=False)