In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import os

In [2]:
#Read in the datasets
#ham_spam csv

ham_spam_pd = pd.read_csv('ham-spam.csv')

#read in wiki texts
# wiki_text ends up as a flat list with one string per non-blank line,
# collected across every regular file found directly inside `path`.
wiki_text = []
path = './wikitext-2/'
# os.listdir returns entries in arbitrary order; sort so that a
# Restart & Run All always builds wiki_text in the same order.
fileList = sorted(os.listdir(path))
for file_name in fileList:
    file_path = os.path.join(path, file_name)
    # Skip sub-directories (e.g. notebook checkpoint folders) instead of crashing in open().
    if not os.path.isfile(file_path):
        continue
    # `with` closes the handle even if reading fails (the handle used to leak).
    # NOTE(review): utf-8 is assumed for the wiki text files -- confirm against the data.
    with open(file_path, 'r', encoding='utf-8') as wiki_file:
        # Keep only lines that contain something other than whitespace.
        wiki_text.extend(line for line in wiki_file if not line.isspace())
    


In [None]:
#SANITY CHECK
print(type(wiki_text)) #Expect a list (one string per non-blank wiki line)
print(type(ham_spam_pd)) #Expect a pandas DataFrame

In [None]:
#SANITY CHECK
print(len(wiki_text))
# Preview only the first few lines: printing every entry floods the
# notebook output and bloats the saved file.
for line in wiki_text[:5]:
    print(line)

In [3]:
#preprocessing dataset
import re
import inflect

# Shared helpers for both preprocessing functions in this notebook.
p = inflect.engine()
stop_words: set[str] = set(stopwords.words("english"))

#Reusing code from assignment3
def preprocessing(text_list: list[str]) -> list[list[str]]:
    """Tokenize and normalise each string in ``text_list``.

    For every input string: replace runs of non-alphanumeric characters
    with a space, tokenize, lower-case, spell out all-digit tokens as
    words, and drop English stop words.

    Args:
        text_list: Raw text lines.

    Returns:
        One list of cleaned tokens per input string, in the same order.
        A string with no surviving tokens yields an empty list.
    """
    non_alnum = re.compile('[^A-Za-z0-9]+')
    processed_text = []
    for text in text_list:
        tokens = word_tokenize(non_alnum.sub(' ', text))

        #Get rid of the stop words in the wiki_text
        filtered = []
        for word in tokens:
            word_lower = word.lower()
            if word_lower.isdigit():
                # number_to_words returns ONE string such as
                # "one thousand, two hundred and thirty-four". Split it back
                # into clean single-word tokens so the commas, hyphens and the
                # stop word "and" go through the same filtering as other text.
                pieces = non_alnum.sub(' ', p.number_to_words(word_lower)).split()
            else:
                pieces = [word_lower]
            filtered.extend(piece for piece in pieces if piece not in stop_words)

        #append the filtered words to the list
        processed_text.append(filtered)
    return processed_text

wiki_text_dataset: list[list[str]] = preprocessing(wiki_text)

In [None]:
#SANITY CHECK
print(type(wiki_text_dataset))
# Do not name the loop variable `list`: that rebinds the builtin for the whole
# kernel and breaks later uses such as the `list[str]` annotations when cells
# are re-run. Only a small preview is printed to keep the output readable.
for token_list in wiki_text_dataset[:5]:
    print(token_list)

In [4]:
#Preprocessing the ham_spam dataframe
import re
def preprocess_dataframe(text):
    """Turn one raw email body into a list of cleaned tokens.

    Steps: tokenize, keep alphabetic tokens only, lower-case, strip any
    character outside ``[A-Za-z0-9]``, then drop empty strings and English
    stop words.

    Args:
        text: Cell value from the "Text" column. Non-string values are
            converted with ``str()``; ``None``/NaN are treated as empty.

    Returns:
        A list of tokens. Always a list: empty for missing/blank input and
        also when processing fails (the error is printed, not raised).
    """
    regex = '[^A-Za-z0-9]+'
    try:
        # A missing cell (None / float NaN) would otherwise be stringified
        # and end up as the bogus token "nan".
        if text is None or (isinstance(text, float) and text != text):
            return []

        if not isinstance(text,str):
            text = str(text)

        if not text.strip():
            return []

        #Tokenize the words, lower the words, and remove stopwords
        tokens = word_tokenize(text)
        tokens = [word.lower() for word in tokens if isinstance(word, str) and word.isalpha()]
        tokens = [re.sub(regex,'', word) for word in tokens]
        # isalpha() accepts non-ASCII letters that the regex then strips, which
        # can leave '' behind -- drop those together with the stop words.
        tokens = [word for word in tokens if word and word not in stop_words]

        return tokens
    except Exception as e:
        # The message was missing its f-prefix, so it printed the placeholders literally.
        print(f"Error processing {text}, Error: {e}")
        # Return an empty token list rather than an implicit None so later
        # cells can still iterate over every row.
        return []

# NOTE: this overwrites the raw "Text" column with token lists, so re-running
# this cell would preprocess the already-tokenized lists again.
ham_spam_pd["Text"] = ham_spam_pd["Text"].apply(preprocess_dataframe)

In [None]:
#SANITY CHECK
ham_spam_pd.head()

In [5]:
#Apply word2vec to the ham spam csv
import gensim
import gensim.downloader as api
import numpy as np
#The embeddings are applied to the second column ONLY
#(the first column just holds the labels)

#I used the skip-gram model because the assignment didn't specify using CBOW or Skip-gram
#NOTE(review): the vectors below are loaded by name through gensim's downloader rather
#than trained in this notebook, so no CBOW/skip-gram choice is made here -- confirm
#which architecture the published vectors were trained with.
wv = api.load('word2vec-google-news-300')

#function to get the vectors
def get_word_vectors(text, model):
    """Average the embeddings of the tokens in ``text``.

    Args:
        text: Iterable of token strings (``None`` is treated as empty).
        model: Embedding lookup supporting ``word in model``,
            ``model[word]`` and a ``vector_size`` attribute.

    Returns:
        A 1-D array of length ``model.vector_size``: the mean over all
        tokens, where tokens missing from ``model`` count as zero vectors.
        Empty or ``None`` input yields an all-zero vector.
    """
    if text is None:
        return np.zeros(model.vector_size)

    #If the word can be found in the model use its vector, otherwise make it a 0
    vectors = [
        model[word] if word in model else np.zeros(model.vector_size)
        for word in text
    ]

    if not vectors:
        # The fallback used to read `model.wv.vector_size`, an attribute the
        # rest of this function never relies on; it failed for every
        # token-less email. Use the same `vector_size` as above.
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# Store one averaged embedding per row, computed from that row's token list in "Text".
ham_spam_pd["average email embedding"] = ham_spam_pd["Text"].apply(lambda x: get_word_vectors(x, wv))


In [None]:
# Check to see what the size of the input is
# embedding_lengths holds len() of each row's averaged embedding, so the print
# below shows the whole Series of per-row lengths rather than a single total.
embedding_lengths = ham_spam_pd["average email embedding"].apply(len)
print(f"Total length of email word embeddings: {embedding_lengths}")

In [6]:
import sklearn 
from sklearn.model_selection import train_test_split

#Split the dataset where the independent variable is the average email embeddings and the dependent variable is the spam classification
#80/20 split with a fixed random_state so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(ham_spam_pd["average email embedding"], ham_spam_pd["IsSpam"], test_size=0.2,random_state=42)

#The split returns pandas objects whose cells each hold one embedding vector.
#Convert AFTER splitting: tolist() + np.array stacks the per-row vectors into a
#single array (2-D, assuming every embedding has the same length -- the next
#cells read .shape[1]); the labels become plain numpy arrays via .values.
X_train = np.array(X_train.tolist())
X_test = np.array(X_test.tolist())
y_train = y_train.values
y_test = y_test.values

In [None]:
import numpy as np
np.random.seed(42)

class SimpleNeuralNetwork:
    """One-hidden-layer binary classifier trained with full-batch gradient descent.

    Architecture: input -> dense + ReLU -> dense + sigmoid. The loss is
    binary cross-entropy. ``backward`` reshapes the labels to a single
    column, so the class is meant to be used with ``output_size=1``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the weights (He-scaled normal draws) and zero biases.

        Weights come from numpy's global random state, so seed it beforehand
        for reproducible runs.
        """
        # He initialization
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, output_size))

    def forward(self, X, training=True):
        """Run a forward pass and cache the intermediates used by ``backward``.

        Args:
            X: Array of shape (n_samples, input_size).
            training: Accepted for call compatibility; currently unused
                (there is no dropout or other train-only behaviour).

        Returns:
            Sigmoid activations of shape (n_samples, output_size).
        """
        self.Z1 = np.dot(X, self.W1) + self.b1
        self.A1 = np.maximum(0, self.Z1)  # ReLU
        self.Z2 = np.dot(self.A1, self.W2) + self.b2
        # Clip the logits so np.exp cannot overflow for very negative values;
        # the sigmoid is already saturated far inside this range.
        self.A2 = 1 / (1 + np.exp(-np.clip(self.Z2, -500, 500)))  # Sigmoid
        return self.A2

    def backward(self, X, y, learning_rate):
        """Apply one gradient-descent step using the activations cached by ``forward``.

        Args:
            X: The same inputs that were passed to the preceding ``forward`` call.
            y: Binary labels, one per sample (any shape with n_samples elements).
            learning_rate: Step size.
        """
        m = X.shape[0]

        # Gradient calculations (sigmoid + cross-entropy gives A2 - y at the output)
        dZ2 = self.A2 - np.asarray(y).reshape(-1, 1)
        dW2 = np.dot(self.A1.T, dZ2) / m
        db2 = np.sum(dZ2, axis=0, keepdims=True) / m

        dA1 = np.dot(dZ2, self.W2.T)
        dZ1 = dA1 * (self.Z1 > 0)  # ReLU derivative

        dW1 = np.dot(X.T, dZ1) / m
        db1 = np.sum(dZ1, axis=0, keepdims=True) / m

        # Update with gradient clipping: only the weight gradients are clipped
        # (element-wise to [-1, 1]); the bias gradients are applied unclipped.
        for grad in [dW1, dW2]:
            np.clip(grad, -1.0, 1.0, out=grad)

        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2

    def compute_loss(self, y_pred, y_true):
        """Mean binary cross-entropy between predictions and labels.

        Args:
            y_pred: Predicted probabilities, e.g. shape (n_samples, 1).
            y_true: Binary labels with the same number of elements.

        Returns:
            The scalar mean loss.
        """
        y_pred = np.asarray(y_pred, dtype=float)
        # Align the labels with the predictions. Without this, (m,) labels
        # against (m, 1) predictions broadcast to an (m, m) matrix and the
        # mean is taken over every label/prediction PAIR -- a wrong loss.
        y_true = np.asarray(y_true, dtype=float).reshape(y_pred.shape)
        eps = 1e-8  # keeps log() finite when a prediction is exactly 0 or 1
        return np.mean(-y_true * np.log(y_pred + eps) - (1 - y_true) * np.log(1 - y_pred + eps))

    def train(self, X, y, epochs, learning_rate):
        """Train on the full batch for ``epochs`` iterations.

        Prints the loss every 10th epoch.

        Returns:
            List with the loss measured at each epoch before that epoch's update.
        """
        losses = []
        for epoch in range(epochs):
            # Forward pass
            y_pred = self.forward(X)

            # Compute loss
            loss = self.compute_loss(y_pred, y)
            losses.append(loss)

            # Backward pass
            self.backward(X, y, learning_rate)

            if epoch % 10 == 0:
                print(f"Epoch {epoch}: Loss = {loss:.4f}")

        return losses

    def predict(self, X):
        """Return hard 0/1 predictions of shape (n_samples, output_size), thresholded at 0.5."""
        y_pred = self.forward(X, training=False)
        return (y_pred > 0.5).astype(int)


In [None]:
#Training the network
# Standardise with statistics computed on the TRAINING split only, and reuse the
# same values for the test split so no test information leaks into the scaling.
# A single global mean/std (over all features) is used, as before.
train_mean = np.mean(X_train)
train_std = np.std(X_train) + 1e-8  # epsilon guards against division by zero
X_train_normalized = (X_train - train_mean) / train_std
X_test_normalized = (X_test - train_mean) / train_std

# 1. Create simple network
input_size = X_train_normalized.shape[1]
hidden_size = 64  # Start small
output_size = 1

model = SimpleNeuralNetwork(input_size, hidden_size, output_size)

# 2. Train with conservative parameters
print("=== TRAINING MINIMAL NETWORK ===")
losses = model.train(X_train_normalized, y_train, epochs=50, learning_rate=0.01)

# 3. Evaluate
y_pred = model.predict(X_test_normalized)
# Named test_accuracy (not `accuracy`) so this cell never rebinds the
# accuracy() metric function defined further down when it is re-run.
test_accuracy = np.mean(y_pred.flatten() == y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

Epoch 0: Gradient norm = 3.687358
Weight change: 0.000387
Epoch:1 | training_loss:1.424614480793975
Epoch 1: Gradient norm = 7.181062
Weight change: 0.000588
Epoch:2 | training_loss:2.0530370217919645
Epoch 2: Gradient norm = 10.123245
Weight change: 0.000590
Epoch:3 | training_loss:2.0741169496541785
Epoch 3: Gradient norm = 6.772403
Weight change: 0.002818
Epoch:4 | training_loss:0.9329015911844468
Epoch 4: Gradient norm = 2.091256
Weight change: 0.001115
Epoch:5 | training_loss:1.0040384277400725
Epoch 5: Gradient norm = 1.082719
Weight change: 0.000909
Epoch:6 | training_loss:0.999381395054661
Epoch 6: Gradient norm = 0.724196
Weight change: 0.001109
Epoch:7 | training_loss:1.057331860836463
Epoch 7: Gradient norm = 0.609772
Weight change: 0.001028
Epoch:8 | training_loss:1.0935649063398913
Epoch 8: Gradient norm = 0.563113
Weight change: 0.001042
Epoch:9 | training_loss:1.1396612403052986
Epoch 9: Gradient norm = 0.532981
Weight change: 0.001011
Epoch:10 | training_loss:1.18053796

In [19]:
def compute_tp_fp_tn_fn(y_true, y_prediction):
    """Count confusion-matrix cells for binary labels (1 = positive, 0 = negative).

    Args:
        y_true: Ground-truth labels (array-like of 0/1).
        y_prediction: Predicted labels with the same number of elements;
            column vectors such as shape (n, 1) are accepted.

    Returns:
        Tuple ``(tp, fp, tn, fn)``.

    Raises:
        ValueError: If the two inputs do not contain the same number of elements.
    """
    # Flatten both inputs first: comparing (n,) labels with (n, 1) predictions
    # would broadcast to an (n, n) grid and count every label/prediction pair.
    y_true = np.asarray(y_true).ravel()
    y_prediction = np.asarray(y_prediction).ravel()
    if y_true.shape != y_prediction.shape:
        raise ValueError(
            f"y_true has {y_true.size} elements but y_prediction has {y_prediction.size}"
        )

    tp = np.sum((y_true == 1) & (y_prediction == 1))
    fp = np.sum((y_true == 0) & (y_prediction == 1))
    tn = np.sum((y_true == 0) & (y_prediction == 0))
    fn = np.sum((y_true == 1) & (y_prediction == 0))
    return tp, fp, tn, fn

def accuracy(tp, fp, tn, fn):
    """Fraction of correct predictions: (tp + tn) / (tp + fp + tn + fn).

    Returns 0.0 when all four counts are zero instead of dividing by zero.
    """
    total = tp + fp + tn + fn
    if total == 0:
        return 0.0
    return (tp + tn) / total

def precision(tp, fp):
    """Share of predicted positives that are correct: tp / (tp + fp).

    Returns 0.0 when nothing was predicted positive instead of dividing by zero.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive
    
def recall(tp, fn):
    """Share of actual positives that were found: tp / (tp + fn).

    Returns 0.0 when there are no actual positives instead of dividing by zero.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive

def f1_score(p, r):
    """Harmonic mean of precision ``p`` and recall ``r``: 2pr / (p + r).

    Returns 0.0 when both inputs are zero instead of dividing by zero.
    """
    denominator = p + r
    if denominator == 0:
        return 0.0
    return (2 * p * r) / denominator
    

In [20]:
# Score the held-out split. `y_test` comes from the train/test split cell and
# `y_pred` from the training cell; the names used here before (`y_true`,
# `fnn_predict`) are not defined anywhere in this notebook, so the cell failed
# on a fresh kernel. Predictions are flattened to 1-D to line up with the labels.
true_positive, false_positive, true_negative, false_negative = compute_tp_fp_tn_fn(y_test, y_pred.flatten())
model_accuracy = accuracy(true_positive, false_positive, true_negative, false_negative)
model_precision = precision(true_positive, false_positive)
model_recall = recall(true_positive, false_negative)
model_f1_score = f1_score(model_precision, model_recall)

# Metrics are fractions in [0, 1]; print them as percentages.
print(f"MODEL'S ACCURACY: {model_accuracy * 100:.2f}")
print(f"MODEL'S PRECISION: {model_precision * 100:.2f}")
print(f"MODEL'S RECALL: {model_recall * 100:.2f}")
print(f"MODEL'S F1 SCORE: {model_f1_score * 100:.2f}")

MODEL'S ACCURACY: 48.22
MODEL'S PRECISION: 52.00
MODEL'S RECALL: 5.50
MODEL'S F1 SCORE: 9.95


This is where the wiki-text dataset is sent through a neural network for prediction.

In [None]:
#Create a Vocab for the wiki-text dataset


In [None]:
#Split the dataset

In [None]:
class neural_network:
    """Skeleton of the wiki-text neural network.

    Only the hyper-parameters are stored so far; ``softmax``, ``train_model``
    and ``perplexity`` are placeholders that return 0.
    """

    def __init__(self, batch_size = 64, learning_rate = 0.01, epochs = 10):
        """Store the training hyper-parameters.

        Args:
            batch_size: Kept as ``self.batch``.
            learning_rate: Kept as ``self.learning_rate``.
            epochs: Kept as ``self.epoch``.
        """
        self.batch = batch_size
        self.learning_rate = learning_rate
        self.epoch = epochs

    # The methods below take `self` so they can be called on an instance;
    # without it, `net.softmax()` raised TypeError because Python passes the
    # instance as the first positional argument.
    def softmax(self):
        """Placeholder: not implemented yet, returns 0."""
        return 0

    #Model training
    def train_model(self):
        """Placeholder: not implemented yet, returns 0."""
        return 0

    #Finds the perplexity of the neural network
    def perplexity(self):
        """Placeholder: not implemented yet, returns 0."""
        model_perplexity = 0
        return model_perplexity