In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import os

In [2]:
#Read in the datasets
#ham_spam csv

ham_spam_pd = pd.read_csv('ham-spam.csv')
    
#read in wiki texts
wiki_text = []
path = './wikitext-2/'
fileList = os.listdir(path)
for i in fileList:
    #Need a way to skip the blank lines
    file = open(os.path.join('wikitext-2/'+ i), 'r')
    for line in file:
        if not line.isspace():
            wiki_text.append(line)
        continue
    


In [None]:
#SANITY CHECK
print(type(wiki_text)) #Type String
print(type(ham_spam_pd)) #Type pandas Dataframe

In [None]:
#SANITY CHECK
print(len(wiki_text))
for line in wiki_text:
    print(line)

In [3]:
#preprocessing dataset
import re
import inflect

p = inflect.engine()
stop_words: set[str] = set(stopwords.words("english"))

#Reusing code from assignment3
def preprocessing(text_list: list[str]):
    processed_text = []
    for text in text_list:
        text = re.sub('[^A-Za-z0-9]+', ' ', text)
        tokens = word_tokenize(text)
        
        #Get rid of the stop words in the wiki_text
        filtered  = []
        for word in tokens:
            word_lower = word.lower()
            if word_lower.isdigit():
                word_lower = p.number_to_words(word_lower)
            if word_lower not in stop_words:
                filtered.append(word_lower)
            
            #append the filtered words to the list
        processed_text.append(filtered)
    return processed_text
    
wiki_text_dataset: list[list[str]] = preprocessing(wiki_text)

In [None]:
#SANITY CHECK
print(type(wiki_text_dataset))
for list in wiki_text_dataset:
    print(list)

In [4]:
#Preprocessing the ham_spam dataframe
import re
def preprocess_dataframe(text):
    regex = '[^A-Za-z0-9]+'
    try:
        if not isinstance(text,str):
            text = str(text)
            
        if not text.strip():
            return []
        
        #Tokenize the words, lower the words, and remove stopwords
        tokens = word_tokenize(text)
        tokens = [word.lower() for word in tokens if isinstance(word, str) and word.isalpha()]
        tokens = [re.sub(regex,'', word) for word in tokens]
        tokens = [word for word in tokens if word not in stop_words]
        
        return tokens
    except Exception as e:
        print("Error processing {text}, Error: {e}")

ham_spam_pd["Text"] = ham_spam_pd["Text"].apply(preprocess_dataframe)

In [None]:
#SANITY CHECK
ham_spam_pd.head()

In [5]:
#Apply word2vec to the ham spam csv
import gensim
import gensim.downloader as api
import numpy as np
#I need to apply this to the second column ONLY!!!
#first column is just labels

#I used the skip-gram model because the assignment didn't specify using CBOW or Skip-gram
wv = api.load('word2vec-google-news-300')

#function to get the vectors
def get_word_vectors(text, model):
    vectors = []
    for word in text:
        #If the word can be found in the model, append it
        if word in model:
            vectors.append(model[word])
        else:
            #Otherwise, make it a 0
            vectors.append(np.zeros(model.vector_size))
    result = np.mean(vectors, axis=0) if vectors else np.zeros(model.wv.vector_size)
    return result

ham_spam_pd["average email embedding"] = ham_spam_pd["Text"].apply(lambda x: get_word_vectors(x, wv))


In [None]:
# Check to see what the size of the input is
embedding_lengths = ham_spam_pd["average email embedding"].apply(len)
print(f"Total length of email word embeddings: {embedding_lengths}")

In [6]:
import sklearn 
from sklearn.model_selection import train_test_split

#Split the dataset where the independent variable is the average email embeddings and the dependent variable is the spam classification
X_train, X_test, y_train, y_test = train_test_split(ham_spam_pd["average email embedding"], ham_spam_pd["IsSpam"], test_size=0.2,random_state=42)

#I'm getting into issues here
#Make it into a numpy array before splitting to make it easier
X_train = np.array(X_train.tolist())
X_test = np.array(X_test.tolist())
y_train = y_train.values
y_test = y_test.values

In [None]:
import numpy as np
np.random.seed(42)

class SimpleNeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        #Initialization of the weights and biases
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, output_size))
        self.dropout_rate = dropout_rate
    
    def forward(self, X, training=True):
        # Simple forward pass - no dropout initially
        self.Z1 = np.dot(X, self.W1) + self.b1
        self.A1 = np.maximum(0, self.Z1)  # ReLU
        
        # DROPOUT HERE - Only to hidden layer (A1)
        if training and self.dropout_rate > 0:
            self.D1 = (np.random.rand(*self.A1.shape) > self.dropout_rate).astype(float)
            self.A1 = self.A1 * self.D1  # Apply dropout mask
            self.A1 = self.A1 / (1 - self.dropout_rate)  # Inverted dropout scaling
        else:
            self.D1 = np.ones_like(self.A1)  # No dropout during inference
            
        self.Z2 = np.dot(self.A1, self.W2) + self.b2
        self.A2 = 1 / (1 + np.exp(-self.Z2))  # Sigmoid
        return self.A2
    
    def backward(self, X, y, learning_rate):
        m = X.shape[0]
        
        # Gradient calculations
        dZ2 = self.A2 - y.reshape(-1, 1)
        dW2 = np.dot(self.A1.T, dZ2) / m
        db2 = np.sum(dZ2, axis=0, keepdims=True) / m
        
        dA1 = np.dot(dZ2, self.W2.T)
        # APPLYING DROPOUT DURING BACKPROP
        if self.dropout_rate > 0:
            dA1 = dA1 * self.D1  # Only backprop through active neurons
            dA1 = dA1 / (1 - self.dropout_rate)  # Scale gradients
            
        dZ1 = dA1 * (self.Z1 > 0)  # ReLU derivative
        
        dW1 = np.dot(X.T, dZ1) / m
        db1 = np.sum(dZ1, axis=0, keepdims=True) / m
        
        # Update with gradient clipping
        for grad in [dW1, dW2]:
            np.clip(grad, -1.0, 1.0, out=grad)
        
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2
    
    def compute_loss(self, y_pred, y_true):
        return np.mean(-y_true * np.log(y_pred + 1e-8) - (1 - y_true) * np.log(1 - y_pred + 1e-8))
    
    def train(self, X, y, epochs, learning_rate):
        losses = []
        for epoch in range(epochs):
            # Forward pass
            y_pred = self.forward(X)
            
            # Compute loss
            loss = self.compute_loss(y_pred, y)
            losses.append(loss)
            
            # Backward pass
            self.backward(X, y, learning_rate)
            
            if epoch % 10 == 0:
                print(f"Epoch {epoch+10}: Loss = {loss:.4f}")
        
        return losses
    
    def predict(self, X):
        y_pred = self.forward(X, training=False)
        return (y_pred > 0.5).astype(int)


In [None]:
#Training the network 
X_train_normalized = (X_train - np.mean(X_train)) / (np.std(X_train) + 1e-8)
X_test_normalized = (X_test - np.mean(X_train)) / (np.std(X_train) + 1e-8)

input_size = X_train_normalized.shape[1]
hidden_size = 128
output_size = 1

model = SimpleNeuralNetwork(input_size, hidden_size, output_size, 0.2)

#Training with dropout applied now
losses = model.train(X_train_normalized, y_train, epochs=50, learning_rate=0.01)

#Predict without dropout
y_pred = model.predict(X_test_normalized)
accuracy = np.mean(y_pred.flatten() == y_test)
print(f"Test Accuracy With Dropout Applied to model: {accuracy:.4f}")

=== TRAINING MINIMAL NETWORK ===
Epoch 10: Loss = 0.8895
Epoch 20: Loss = 0.8072
Epoch 30: Loss = 0.8563
Epoch 40: Loss = 0.9113
Epoch 50: Loss = 0.9662
Test Accuracy: 0.8700


In [9]:
# Compute precision, recall, F1
tp = np.sum((y_test == 1) & (y_pred.flatten() == 1))
fp = np.sum((y_test == 0) & (y_pred.flatten() == 1))
fn = np.sum((y_test == 1) & (y_pred.flatten() == 0))

precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"Final Test Metrics:")
print(f"Accuracy:  {accuracy*100:.2f}")
print(f"Precision: {precision*100:.2f}")
print(f"Recall:    {recall*100:.2f}")
print(f"F1-Score:  {f1*100:.2f}")

Final Test Metrics:
Accuracy:  87.00
Precision: 91.49
Recall:    82.69
F1-Score:  86.87


Model's performance 
The model's accuracy 
The precision means that the model can 
The recall means that this feedforward neural network is finding out almost all the spam compared to the non-spam emails.

The F1-score of the model, which indicates the harmonic mean between model's precision and recall, means that the neural network model.can

This is where the wiki-text dataset sent through a neural network for prediction

In [None]:
#Create a Vocab for the wiki-text dataset
#Assign a unique index to each word in the vocabulary.
def build_dataset_vocav(wikitext_dataset):
    #Keep a set of unique words in the wiki-text dataset
    unique_words = set()
    for sentence in wikitext_dataset:
        for word in sentence:
            unique_words.add(word) #This way, a set doesn't keep duplicates
    
    #Index the words in the dataset
    word_to_index = {}
    for i,word in enumerate(unique_words):
        word_to_index[word] = i
        
    #create one-hot encoding vectors for the neural network
    one_hot_vectors = []
     
    
    
    return 

In [None]:
#Split the dataset
X_train,X_test,y_train,y_test = train_test_split(,test_size=0.2,random_state=42)

In [None]:
class neural_network:
    def __init__(self, batch_size = 64, learning_rate = 0.01, epochs = 10):
        self.batch = batch_size
        self.learning_rate = learning_rate
        self.epoch = epochs
        
    def softmax():
        return 0
    
    #Model training
    def train_model(self, X_train, y_train,):
        return 0


In [None]:
#Finds the perplexity of the neural network   
def perplexity():
    model_perplexity = 0
    return model_perplexity