In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import os

In [None]:
# Read in the datasets.
# ham-spam CSV: later cells read its "Text" and "IsSpam" columns.
ham_spam_pd = pd.read_csv('ham-spam.csv')

# Read in the wiki texts: keep every non-blank line of every file found under `path`.
wiki_text = []
path = './wikitext-2/'
# Sort the listing: os.listdir() returns entries in arbitrary order, and the line order
# feeds the seeded train/test split further down.
fileList = sorted(os.listdir(path))
for i in fileList:
    # `with` closes each file as soon as it has been read (the handles used to leak),
    # and the path is built from `path` so the listing and the open() always agree.
    with open(os.path.join(path, i), 'r', encoding='utf-8') as file:
        for line in file:
            # Skip the blank (whitespace-only) lines.
            if not line.isspace():
                wiki_text.append(line)
    


In [None]:
#SANITY CHECK: confirm the types of the two loaded datasets
print(type(wiki_text)) #Type list (one string per non-blank line), not a single string
print(type(ham_spam_pd)) #Type pandas Dataframe

In [None]:
#SANITY CHECK
print(len(wiki_text))
# Preview only the first few lines: printing the whole corpus floods the cell output
# and bloats the saved notebook.
for line in wiki_text[:5]:
    print(line)

In [None]:
#preprocessing dataset
import re
import inflect

# inflect engine; its number_to_words() is used by preprocessing() to spell out digit runs.
p = inflect.engine()
# English stop-word list from NLTK, stored as a set for fast membership tests.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded already — confirm.
stop_words: set[str] = set(stopwords.words("english"))

#Reusing code from assignment3 (Altering it because there are numbers with words and it doesn't fully change numbers to words)
def preprocessing(text_list: list[str]) -> list[list[str]]:
    """Clean and tokenize each text into a list of lowercase content words.

    For every input string: punctuation is replaced by spaces, whitespace is
    collapsed, the text is tokenized with NLTK, tokens are lower-cased, digit
    runs are spelled out with inflect, and English stop words are dropped.

    Args:
        text_list: Raw text strings (one per line/document).

    Returns:
        One token list per input string, in the same order. A string with no
        surviving tokens yields an empty list.
    """
    processed_text = []
    for text in text_list:
        text = re.sub(r'[^A-Za-z0-9\s]', ' ', text) #Get rid of punctuation completely
        text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace

        #Get rid of the stop words in the wiki_text
        filtered = []
        for word in word_tokenize(text):
            word_lower = word.lower()

            # Plain word: keep it unless it is a stop word.
            if not re.search(r'\d', word_lower):
                if word_lower not in stop_words:
                    filtered.append(word_lower)
                continue

            # Token contains digits: replace each digit run with its spelled-out form.
            # Letters glued to the digits stay attached to the spelled-out number.
            spelled = re.sub(r'\d+',
                             lambda match: p.number_to_words(match.group()),
                             word_lower)
            # number_to_words() writes commas and hyphens
            # ("one thousand, two hundred and thirty-four"); strip them again so no
            # punctuation tokens leak back into the cleaned output.
            spelled = re.sub(r'[^a-z\s]', ' ', spelled)
            for token in word_tokenize(spelled):
                # 'and' often appears in number conversions so it is dropped explicitly.
                if token not in stop_words and token != 'and':
                    filtered.append(token)

        #append the filtered words to the list
        processed_text.append(filtered)
    return processed_text
    
# Tokenize and clean every wiki line; the result holds one token list per input line.
wiki_text_dataset: list[list[str]] = preprocessing(wiki_text)

In [None]:
#SANITY CHECK
print(type(wiki_text_dataset))
# The loop variable must not be named `list`: that rebinds the builtin for the whole
# kernel, so re-running any cell that uses `list[str]` or `list(...)` would fail.
# Only a small preview is printed to keep the cell output readable.
for tokens in wiki_text_dataset[:5]:
    print(tokens)

In [None]:
#Preprocessing the ham_spam dataframe
import re
def preprocess_dataframe(text):
    """Tokenize one email body into lowercase, alphabetic, non-stop-word tokens.

    Args:
        text: The cell value to process. Non-string values are converted with
            ``str()``; missing values (``None`` / NaN) are treated as empty.

    Returns:
        A list of tokens. The list is empty for blank or missing input and also
        when tokenization fails, so the resulting column never contains ``None``.
    """
    regex = '[^A-Za-z0-9]+'
    try:
        # A missing cell carries no text; without this check NaN would become the token "nan".
        if text is None or (isinstance(text, float) and text != text):
            return []

        if not isinstance(text, str):
            text = str(text)

        if not text.strip():
            return []

        #Tokenize the words, lower the words, and remove stopwords
        tokens = []
        for word in word_tokenize(text):
            if not (isinstance(word, str) and word.isalpha()):
                continue
            # isalpha() also accepts non-ASCII letters; the regex strips those, which can
            # leave an empty string behind, so empty results are dropped.
            cleaned = re.sub(regex, '', word.lower())
            if cleaned and cleaned not in stop_words:
                tokens.append(cleaned)

        return tokens
    except Exception as e:
        # Best-effort: report the failure (the message used to be missing its f-prefix)
        # and fall back to an empty token list instead of an implicit None.
        print(f"Error processing {text}, Error: {e}")
        return []

# Replace the raw "Text" column with its token lists.
# NOTE: this overwrites the column in place, so re-running this cell would feed the
# already-tokenized lists back through str() and the tokenizer; reload the CSV first.
ham_spam_pd["Text"] = ham_spam_pd["Text"].apply(preprocess_dataframe)

In [None]:
#SANITY CHECK: preview the first rows of the dataframe after tokenization
ham_spam_pd.head()

In [None]:
#Apply word2vec to the ham spam csv
import gensim
import gensim.downloader as api
import numpy as np
#I need to apply this to the second column ONLY!!!
#first column is just labels

#I used the skip-gram model because the assignment didn't specify using CBOW or Skip-gram
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): presumably this downloads the model on first use and returns a
# KeyedVectors object — confirm against the gensim documentation.
wv = api.load('word2vec-google-news-300')

#function to get the vectors
def get_word_vectors(text, model):
    """Average the embeddings of the tokens in ``text``.

    Args:
        text: Iterable of word tokens for one document.
        model: Embedding lookup supporting ``word in model``, ``model[word]``
            and a ``vector_size`` attribute.

    Returns:
        A 1-D array of length ``model.vector_size``. Tokens missing from the
        model contribute a zero vector to the average; an empty ``text``
        yields an all-zero vector.
    """
    # Read the dimensionality from the model itself. The empty-text fallback used to
    # read `model.wv.vector_size`, which fails when `model` already is the vector table.
    dim = model.vector_size
    #If the word can be found in the model use its vector, otherwise make it a 0
    vectors = [model[word] if word in model else np.zeros(dim) for word in text]
    if not vectors:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)

# Store one averaged embedding vector per email in a new column, computed from the token lists in "Text".
ham_spam_pd["average email embedding"] = ham_spam_pd["Text"].apply(lambda x: get_word_vectors(x, wv))


In [None]:
# SANITY CHECK Check to see what the size of the input is
# len() of each stored vector gives that email's embedding dimensionality.
embedding_lengths = ham_spam_pd["average email embedding"].apply(len)
# Note: this prints the per-row Series of lengths, not a single total.
print(f"Total length of email word embeddings: {embedding_lengths}")

In [None]:
import sklearn 
from sklearn.model_selection import train_test_split

#Split the dataset where the independent variable is the average email embeddings and the dependent variable is the spam classification
#20% of the rows are held out for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(ham_spam_pd["average email embedding"], ham_spam_pd["IsSpam"], test_size=0.2,random_state=42)

#The feature splits are pandas Series whose elements are vectors, so AFTER splitting
#they are converted to plain numpy arrays (via lists) to make the matrix math easier.
# NOTE(review): expected result is a 2-D (n_samples, embedding_dim) array — confirm.
X_train = np.array(X_train.tolist())
X_test = np.array(X_test.tolist())
#The labels become plain numpy arrays as well.
y_train = y_train.values
y_test = y_test.values

In [None]:
import numpy as np
# Seed NumPy's global RNG so the random weight initialisation and dropout masks are repeatable.
np.random.seed(42)

class SimpleNeuralNetwork:
    """One-hidden-layer binary classifier (ReLU hidden layer, sigmoid output).

    Trained with full-batch gradient descent on binary cross-entropy. Inverted
    dropout is applied to the hidden activations during training only.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Number of hidden units.
        output_size: Number of output units.
        dropout_rate: Probability of dropping a hidden unit, in ``[0, 1)``.

    Raises:
        ValueError: If ``dropout_rate`` is outside ``[0, 1)``.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        # A rate of 1 (or more) would divide by zero in the inverted-dropout scaling.
        if not 0 <= dropout_rate < 1:
            raise ValueError(f"dropout_rate must be in [0, 1), got {dropout_rate}")

        #Initialization of the weights and biases (random normal scaled by sqrt(2 / fan_in))
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, output_size))
        self.dropout_rate = dropout_rate
        # Divisor used by the most recent forward pass (1.0 means no dropout was applied).
        self._keep_prob = 1.0

    def forward(self, X, training=True):
        """Run a forward pass and cache the intermediate values for ``backward``.

        Args:
            X: Input matrix with one sample per row.
            training: When True and ``dropout_rate > 0``, apply inverted dropout
                to the hidden activations.

        Returns:
            The sigmoid outputs, one row per sample.
        """
        self.Z1 = np.dot(X, self.W1) + self.b1
        self.A1 = np.maximum(0, self.Z1)  # ReLU

        # DROPOUT HERE - Only to hidden layer (A1)
        if training and self.dropout_rate > 0:
            self._keep_prob = 1 - self.dropout_rate
            self.D1 = (np.random.rand(*self.A1.shape) > self.dropout_rate).astype(float)
            self.A1 = self.A1 * self.D1  # Apply dropout mask
            self.A1 = self.A1 / self._keep_prob  # Inverted dropout scaling
        else:
            self._keep_prob = 1.0
            self.D1 = np.ones_like(self.A1)  # No dropout during inference

        self.Z2 = np.dot(self.A1, self.W2) + self.b2
        # Clip the logits so np.exp cannot overflow for very negative values.
        self.A2 = 1 / (1 + np.exp(-np.clip(self.Z2, -500, 500)))  # Sigmoid
        return self.A2

    def backward(self, X, y, learning_rate):
        """Backpropagate from the cached forward pass and update the parameters.

        Args:
            X: The same input matrix that was given to the preceding ``forward``.
            y: Binary labels, one per sample (reshaped internally to a column).
            learning_rate: Gradient-descent step size.
        """
        m = X.shape[0]

        # Gradient calculations (sigmoid + cross-entropy gives A2 - y)
        dZ2 = self.A2 - np.asarray(y).reshape(-1, 1)
        dW2 = np.dot(self.A1.T, dZ2) / m
        db2 = np.sum(dZ2, axis=0, keepdims=True) / m

        dA1 = np.dot(dZ2, self.W2.T)
        # Mirror exactly what forward() did: the mask and divisor are those of the
        # last forward pass, so no rescaling happens after an inference pass.
        dA1 = dA1 * self.D1 / self._keep_prob

        dZ1 = dA1 * (self.Z1 > 0)  # ReLU derivative

        dW1 = np.dot(X.T, dZ1) / m
        db1 = np.sum(dZ1, axis=0, keepdims=True) / m

        # Update with gradient clipping (weights only, element-wise to [-1, 1])
        for grad in [dW1, dW2]:
            np.clip(grad, -1.0, 1.0, out=grad)

        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2

    def compute_loss(self, y_pred, y_true):
        """Return the mean binary cross-entropy between predictions and labels.

        Args:
            y_pred: Predicted probabilities (e.g. the column returned by ``forward``).
            y_true: Binary labels with the same number of elements.

        Returns:
            The scalar mean loss.
        """
        y_pred = np.asarray(y_pred)
        # Align the labels with the predictions: a flat (m,) label vector against an
        # (m, 1) prediction column would otherwise broadcast into an (m, m) matrix.
        y_true = np.asarray(y_true).reshape(y_pred.shape)
        return np.mean(-y_true * np.log(y_pred + 1e-8) - (1 - y_true) * np.log(1 - y_pred + 1e-8))

    def train(self, X, y, epochs, learning_rate):
        """Train with full-batch gradient descent.

        Args:
            X: Training inputs, one sample per row.
            y: Binary training labels.
            epochs: Number of passes over the full training set.
            learning_rate: Gradient-descent step size.

        Returns:
            The list of per-epoch training losses.
        """
        losses = []
        for epoch in range(epochs):
            # Forward pass
            y_pred = self.forward(X)

            # Compute loss
            loss = self.compute_loss(y_pred, y)
            losses.append(loss)

            # Backward pass
            self.backward(X, y, learning_rate)

            # Report every 10th epoch, labelled with its 1-based epoch number.
            if epoch % 10 == 0:
                print(f"Epoch {epoch + 1}: Loss = {loss:.4f}")

        return losses

    def predict(self, X):
        """Return hard 0/1 predictions (threshold 0.5) with dropout disabled."""
        y_pred = self.forward(X, training=False)
        return (y_pred > 0.5).astype(int)


In [None]:
# Train the spam classifier.
# Both splits are standardized with the training split's (global) mean and std only.
train_mean = np.mean(X_train)
train_scale = np.std(X_train) + 1e-8
X_train_normalized = (X_train - train_mean) / train_scale
X_test_normalized = (X_test - train_mean) / train_scale

# Network dimensions: one input per embedding feature, 128 hidden units, 1 output.
input_size = X_train_normalized.shape[1]
hidden_size = 128
output_size = 1

# The last argument is the dropout rate, so training runs with dropout applied.
model = SimpleNeuralNetwork(input_size, hidden_size, output_size, 0.2)
losses = model.train(
    X_train_normalized,
    y_train,
    epochs=50,
    learning_rate=0.01,
)

# predict() runs the forward pass with dropout switched off.
y_pred = model.predict(X_test_normalized)
matches = y_pred.flatten() == y_test
accuracy = np.mean(matches)
print(f"Test Accuracy With Dropout Applied to model: {accuracy:.4f}")

In [None]:
# Precision, recall and F1 for the spam class (label 1), from the confusion counts.
flat_pred = y_pred.flatten()
actual_spam = y_test == 1
predicted_spam = flat_pred == 1

tp = np.sum(actual_spam & predicted_spam)       # spam correctly flagged
fp = np.sum((y_test == 0) & predicted_spam)     # ham wrongly flagged
fn = np.sum(actual_spam & (flat_pred == 0))     # spam that was missed

# Each ratio falls back to 0 when its denominator is empty.
precision = 0
if (tp + fp) > 0:
    precision = tp / (tp + fp)

recall = 0
if (tp + fn) > 0:
    recall = tp / (tp + fn)

f1 = 0
if (precision + recall) > 0:
    f1 = 2 * (precision * recall) / (precision + recall)

print("Final Test Metrics:")
print(f"Accuracy:  {accuracy*100:.2f}")
print(f"Precision: {precision*100:.2f}")
print(f"Recall:    {recall*100:.2f}")
print(f"F1-Score:  {f1*100:.2f}")

Model's performance 
The model's accuracy 
The precision means that the model can 
The recall means that this feedforward neural network finds almost all of the actual spam emails in the test set.

The F1-score of the model, which is the harmonic mean of the model's precision and recall, summarizes how well the neural network balances the two.

This is where the wiki-text dataset is sent through a neural network for next-word prediction.

In [None]:
#Create a Vocab for the wiki-text dataset
#Assign a unique index to each word in the vocabulary.
def build_dataset_vocab(wikitext_dataset):
    """Build the two vocabulary lookup tables for a tokenized dataset.

    Args:
        wikitext_dataset: Iterable of sentences, each an iterable of word tokens.

    Returns:
        A ``(word_to_index, index_to_word)`` pair of dicts. Indices are assigned
        to the distinct words in sorted order, starting at 0.
    """
    # A set comprehension removes the duplicates; sorting fixes the index order.
    vocabulary = sorted({token for sentence in wikitext_dataset for token in sentence})

    # Both directions: word -> index and index -> word.
    word_to_index = {token: position for position, token in enumerate(vocabulary)}
    index_to_word = dict(enumerate(vocabulary))
    return word_to_index, index_to_word

In [None]:
#SANITY CHECK
#Build both lookup tables; `vocab` is the number of distinct tokens (the vocabulary size).
word_index, index_word = build_dataset_vocab(wiki_text_dataset)
vocab = len(word_index)
print(vocab)

In [None]:
#Need a way to prepare the dataset where I have the 3 previous words and the target word
def prepare_dataset_for_nn(wiki_text_dataset, word_to_index):
    """Turn tokenized sentences into (3-word context, next word) index pairs.

    Every sentence is scanned with a sliding window of four words: the first
    three become the context and the fourth is the target. Sentences shorter
    than four words produce no samples. Words missing from ``word_to_index``
    are encoded as index 0.

    Args:
        wiki_text_dataset: Iterable of sentences, each a list of word tokens.
        word_to_index: Mapping from word to its integer index.

    Returns:
        ``(X, y)`` numpy arrays: ``X`` holds one row of three context indices per
        sample and ``y`` the matching target index. Targets stay as indices
        rather than one-hot rows; the original author noted that a full one-hot
        matrix killed the kernel.
    """
    sequences = []
    targets = []

    for sentence in wiki_text_dataset:
        # Encode the sentence once; .get() maps unknown words to 0.
        encoded = [word_to_index.get(word, 0) for word in sentence]
        # The range is empty for sentences with fewer than 4 words.
        for start in range(len(encoded) - 3):
            sequences.append(encoded[start:start + 3])   # words at start, start+1, start+2
            targets.append(encoded[start + 3])           # the word that follows them

    #Convert these into numpy arrays for the neural network
    return np.array(sequences), np.array(targets)

#Build the (context, target) index arrays for the whole wiki dataset and report their shapes
X,y_indices = prepare_dataset_for_nn(wiki_text_dataset,word_index)
print(f"X shape: {X.shape}")
print(f"y shape: {y_indices.shape}")

In [None]:
# Hold out 20% of the (context, target) pairs for evaluation; random_state fixes the shuffle.
# NOTE: this rebinds X_train / X_test, which previously held the spam-classifier features.
X_train,X_test,y_train_indices,y_test_indices = train_test_split(X,y_indices,test_size=0.2,random_state=42)

In [None]:
# Seed NumPy's global RNG so the random weight initialisation of the word predictor is repeatable.
np.random.seed(42)
class word_predictor_neural_network:
    """Feed-forward next-word predictor over a fixed window of 3 context words.

    Architecture: embedding lookup for each of the 3 context words ->
    concatenation -> ReLU hidden layer -> softmax over the vocabulary.
    Training uses mini-batch gradient descent on cross-entropy.

    Args:
        vocab_size: Number of distinct words (size of the softmax output).
        embedding_dim: Length of each word embedding.
        hidden_dim: Number of hidden units.
        batch_size: Mini-batch size used by ``train_model``.
        learning_rate: Gradient-descent step size.
        epochs: Number of passes over the training data.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, batch_size= 64, learning_rate = 0.01, epochs = 10):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.epochs = epochs

        #EMBEDDING matrix: one row of length embedding_dim per vocabulary word
        self.W_embed = np.random.randn(vocab_size, embedding_dim) * 0.01

        #HIDDEN LAYER: its input is the 3 concatenated embeddings (3 * embedding_dim)
        self.W1 = np.random.randn(3 * embedding_dim, hidden_dim) * 0.01
        self.b1 = np.zeros((1, hidden_dim))

        #OUTPUT LAYER: hidden_dim -> vocab_size
        # Drawn with randn like the other weight matrices; this used to be rand(),
        # which made every output weight positive.
        self.W2 = np.random.randn(hidden_dim, vocab_size) * 0.01
        self.b2 = np.zeros((1, vocab_size))

    def one_hot_encoding(self, X_indices):
        """Convert a batch of context indices into one-hot vectors.

        Args:
            X_indices: Integer array with one row of 3 word indices per sample.

        Returns:
            Array of shape ``(batch, 3, vocab_size)`` with a single 1 per word.
        """
        X_indices = np.asarray(X_indices)
        batch_size = X_indices.shape[0]
        X_one_hot = np.zeros((batch_size, 3, self.vocab_size))
        # Set all entries at once: sample i, slot j, column X_indices[i, j].
        rows = np.arange(batch_size)[:, None]
        slots = np.arange(3)[None, :]
        X_one_hot[rows, slots, X_indices[:, :3]] = 1
        return X_one_hot

    def softmax(self, x):
        """Row-wise softmax; the row maximum is subtracted first for numerical stability."""
        exp_x = np.exp(x - np.max(x,axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def compute_loss(self, y, probs):
        """Mean cross-entropy between one-hot targets ``y`` and predicted ``probs``.

        1e-8 is added inside the log for numerical stability.
        """
        return -np.mean(np.sum(y * np.log(probs + 1e-8), axis=1))

    def backpropagation(self, X_indices, y, probs):
        """Backpropagate one batch and update every parameter in place of the old value.

        Must be called right after ``forward`` on the same batch, because it
        reads the activations that ``forward`` cached on the instance.

        Args:
            X_indices: Integer array with one row of 3 context indices per sample.
            y: One-hot targets, one row per sample.
            probs: Softmax outputs returned by ``forward`` for this batch.
        """
        X_indices = np.asarray(X_indices)
        batch_size = X_indices.shape[0]

        #Gradients for the output layer (softmax + cross-entropy gives probs - y)
        dz2 = probs - y
        dW2 = np.dot(self.a1.T, dz2) / batch_size
        db2 = np.sum(dz2, axis=0, keepdims=True) / batch_size

        #Backpropagation for the hidden layer
        da1 = np.dot(dz2, self.W2.T)
        dz1 = da1 * (self.z1 > 0) #ReLU derivative
        # Averaged over the batch like dW2 (the division used to be missing).
        dW1 = np.dot(self.concatenated.T, dz1) / batch_size
        # Sum over the batch axis (axis=0) so db1 keeps b1's (1, hidden_dim) shape;
        # summing over axis=1 turned b1 into a (batch, hidden_dim) array after one update.
        db1 = np.sum(dz1, axis=0, keepdims=True) / batch_size

        #Backpropagation through the concatenated and embedded layer
        d_concatenated = np.dot(dz1, self.W1.T)
        d_embeddings = d_concatenated.reshape(batch_size, 3, self.embedding_dim)

        # Scatter-add each context word's gradient into its embedding row. This equals
        # summing outer(one_hot, gradient) over all samples and slots, without building
        # a (vocab_size, embedding_dim) matrix per word; repeated indices accumulate.
        dW_embed = np.zeros_like(self.W_embed)
        np.add.at(dW_embed, X_indices[:, :3], d_embeddings)
        dW_embed = dW_embed / batch_size

        #Update the weights and the biases (parameters)
        self.W_embed = self.W_embed - self.learning_rate * dW_embed
        self.W1 = self.W1 - self.learning_rate * dW1
        self.b1 = self.b1 - self.learning_rate * db1
        self.W2 = self.W2 - self.learning_rate * dW2
        self.b2 = self.b2 - self.learning_rate * db2
        return

    def forward(self, X_indices):
        """Run a forward pass and cache the activations for ``backpropagation``.

        Args:
            X_indices: Integer array with one row of 3 context indices per sample.

        Returns:
            Array of shape ``(batch, vocab_size)`` with next-word probabilities.
        """
        X_indices = np.asarray(X_indices)
        batch_size = X_indices.shape[0]

        #Embedding layer: selecting row k of W_embed equals multiplying the one-hot
        #vector for word k by W_embed, without materializing the one-hot matrix.
        self.embeddings = self.W_embed[X_indices[:, :3]]

        #Concatenate the 3 embeddings into a single vector per sample
        self.concatenated = self.embeddings.reshape(batch_size, -1) #[batch_size, 3 * embedding_dim]

        #Hidden layer with ReLU
        self.z1 = np.dot(self.concatenated, self.W1) + self.b1
        self.a1 = np.maximum(0, self.z1)

        #Output layer with softmax
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        probs = self.softmax(self.z2)
        return probs

    #Model training
    def train_model(self, X, y_indices):
        """Train with mini-batch gradient descent and print per-epoch statistics.

        Args:
            X: Integer array with one row of 3 context indices per sample.
            y_indices: Integer array with the target word index of each sample.

        Returns:
            The list of average training losses, one per epoch.

        Raises:
            ValueError: If ``X`` contains no samples.
        """
        losses = []
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("train_model needs at least one training sample")

        for epoch in range(self.epochs):
            epoch_loss = 0
            epoch_accuracy = 0
            num_batches = 0
            for i in range(0, n_samples, self.batch_size):
                end_idx = min(i + self.batch_size, n_samples)
                X_batch = X[i:end_idx]
                y_batch_indices = y_indices[i:end_idx]

                # Convert ONLY this batch to one-hot (much smaller memory footprint)
                y_batch_onehot = np.zeros((len(X_batch), self.vocab_size))
                y_batch_onehot[np.arange(len(X_batch)), y_batch_indices] = 1

                #Forward pass
                probs = self.forward(X_batch)
                batch_loss = self.compute_loss(y_batch_onehot, probs)

                # Accuracy of this batch, measured before the parameter update
                pred_indices = np.argmax(probs, axis=1)
                batch_accuracy = np.mean(pred_indices == y_batch_indices)

                epoch_loss += batch_loss
                epoch_accuracy += batch_accuracy
                num_batches += 1

                #backpropagation
                self.backpropagation(X_batch, y_batch_onehot, probs)

            #Average the per-batch loss and accuracy over the epoch
            avg_loss = epoch_loss / num_batches
            avg_accuracy = epoch_accuracy/ num_batches
            losses.append(avg_loss)
            print(f"EPOCH: {epoch +1} | Training Loss = {avg_loss:.4f} | Average Accuracy = {avg_accuracy:.4f} ")

        return losses

    def prediction(self, X_test_indices, index_to_word, top_k=1):
        """Return the ``top_k`` most probable next words for each context.

        Args:
            X_test_indices: Either one context (3 indices) or a 2-D array with
                one context per row.
            index_to_word: Mapping from word index back to the word.
            top_k: Number of candidates to return per context.

        Returns:
            For a single context, a list of ``(word, probability)`` tuples in
            descending probability order; for several contexts, a list of such
            lists.
        """
        X_test_indices = np.asarray(X_test_indices)
        # Inspect the argument itself; this used to read the notebook-level X_test.
        if X_test_indices.ndim == 1:
            X_test_indices = X_test_indices.reshape(1,-1)

        all_probs = self.forward(X_test_indices)
        all_predictions = []

        for i in range(len(X_test_indices)):
            probs = all_probs[i]
            # argsort is ascending: take the last top_k entries and reverse them.
            top_k_indices = np.argsort(probs)[-top_k:][::-1]
            predictions = [(index_to_word[idx], probs[idx]) for idx in top_k_indices]
            all_predictions.append(predictions)
        #If the input was a single context, unwrap the outer list
        if len(all_predictions) == 1:
            return all_predictions[0]
        else:
            return all_predictions

In [None]:
#Finds the perplexity of the neural network   
def calculate_cross_entropy_loss(y_true, y_pred_probs):
    """Return the mean cross-entropy of predicted probabilities against one-hot targets.

    Args:
        y_true: One-hot target matrix, one row per sample.
        y_pred_probs: Predicted probabilities with the same shape.

    Returns:
        The average over samples of ``-sum(y_true * log(y_pred_probs + 1e-8))``.
    """
    # 1e-8 keeps the log finite when a predicted probability is exactly 0.
    log_probs = np.log(y_pred_probs + 1e-8)
    per_sample = -np.sum(y_true * log_probs, axis=1)
    return np.mean(per_sample)

def calculate_perplexity(word_pred_model, X_test, y_test_indices, vocab_size):
    """Compute the model's perplexity and average cross-entropy on a test set.

    The test set is processed in chunks of 1000 samples to limit memory use;
    each chunk's mean loss is weighted by the chunk size before averaging.

    Args:
        word_pred_model: Object whose ``forward(X_batch)`` returns per-sample
            probability rows.
        X_test: Array of test inputs, one sample per row.
        y_test_indices: Array with the target index of each sample.
        vocab_size: Width of the one-hot targets built for each chunk.

    Returns:
        ``(perplexity, avg_loss)`` where ``perplexity = exp(avg_loss)``.
    """
    chunk = 1000
    n_samples = X_test.shape[0]
    total_loss = 0

    start = 0
    while start < n_samples:
        stop = min(start + chunk, n_samples)
        contexts = X_test[start:stop]
        target_ids = y_test_indices[start:stop]

        # One-hot encode only this chunk's targets.
        rows = len(contexts)
        one_hot = np.zeros((rows, vocab_size))
        one_hot[np.arange(rows), target_ids] = 1

        probs = word_pred_model.forward(contexts)
        # Weight the chunk's mean loss by its sample count.
        total_loss += calculate_cross_entropy_loss(one_hot, probs) * (stop - start)
        start = stop

    avg_loss = total_loss / n_samples
    return np.exp(avg_loss), avg_loss

def calculate_accuracy(model, X, y_indices, batch_size=1000):
    """Calculate top-1 accuracy using the target indices.

    The inputs are evaluated in chunks of ``batch_size`` samples so that the
    probability matrix for the whole set never has to exist at once.

    Args:
        model: Object whose ``forward(X_batch)`` returns per-sample probability rows.
        X: Array of inputs, one sample per row.
        y_indices: Array with the target index of each sample.
        batch_size: Number of samples per forward pass.

    Returns:
        The fraction of samples whose most probable index equals the target,
        or NaN when ``X`` is empty (accuracy is undefined without samples).
    """
    n_samples = X.shape[0]
    if n_samples == 0:
        return float("nan")

    correct = 0
    for start in range(0, n_samples, batch_size):
        stop = min(start + batch_size, n_samples)
        probs = model.forward(X[start:stop])
        pred_indices = np.argmax(probs, axis=1)
        correct += int(np.sum(pred_indices == y_indices[start:stop]))
    return correct / n_samples

In [None]:
#Run the words through the model
#Only the sizes are passed here; batch_size, learning_rate and epochs use the constructor defaults.
wp_model = word_predictor_neural_network(vocab_size=vocab,embedding_dim=50, hidden_dim=128)
#NOTE: this rebinds `losses`, which previously held the spam classifier's training losses.
losses = wp_model.train_model(X_train, y_train_indices)

# Calculate perplexity and accuracy on the held-out split
test_perplexity, test_loss = calculate_perplexity(wp_model, X_test, y_test_indices, vocab_size=vocab)
test_accuracy = calculate_accuracy(wp_model, X_test, y_test_indices)

#Report the three evaluation numbers
print("=== NEURAL NETWORK WORD PREDICTION EVALUATION ===")
print(f"Test Cross-Entropy Loss: {test_loss:.4f}")
print(f"Test Perplexity: {test_perplexity:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")