In [2]:
# Importing all necesary libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu, sigmoid
from sklearn.model_selection import train_test_split

In [3]:
#Reading the CSV file
df = pd.read_csv('spam_ham_dataset.csv')

#Check for duplicates in dataframe
df.drop_duplicates(inplace = True)

#Downloading stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akils\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
#Removes punctuation from words as well as stopwords

def process_text(text):
    #1 Remoe punctuation
    #2 Remove stopwords(useless words)
    #3 return a list of clean text words

    #1 

    #List compression, where loops through each charecter to see if its a punctuation. This is still 
    #a string. 
    nopunc = [char for char in text if char not in string.punctuation]

    #Attaches a space at beggining of nopunc
    nopunc = ''.join(nopunc)

    #2 

    #Loops through every word in nopunc(split seperates text )
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    return clean_words





In [5]:
#Defines the vectorization layer, processes training data text, and adapts it to model
vectorize_layer = tf.keras.layers.TextVectorization(max_tokens = 45000, output_mode = "count", pad_to_max_tokens=45000)
pre_processed_text = process_text(df['text'])
vectorize_layer.adapt(pre_processed_text)

In [6]:
# Defines training and testing data, splitting 67:33
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label_num'], test_size = 0.33)
print(X_train)
print(y_train)
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)
X_test = np.asarray(X_test)
y_test = np.asarray(y_test)

lambda_values = np.asarray([0, 0.001, 0.002, 0.004, 0.008, 0.01, 0.02, 0.04, 0.08, 0.1, 0.2, 0.4])
models = []

1633    Subject: phillips petroleum\r\ni wanted to upd...
3045    Subject: hpl meter # 986290 indian hills plant...
354     Subject: submision result\r\nhave you heard ?\...
4269    Subject: shelby why can ' t you call me back ?...
3965    Subject: new nomination\r\n- - - - - - - - - -...
                              ...                        
1579    Subject: we have vicodin and anything else\r\n...
1962    Subject: mobil february , 2000 activity\r\ndo ...
5080    Subject: hpl nom for august 17 , 2000\r\n( see...
2238    Subject: hpl nom for april 21 - 23 , 2001\r\n(...
557     Subject: re : inquiry ?\r\nbecky ,\r\ncan we g...
Name: text, Length: 3464, dtype: object
1633    0
3045    0
354     1
4269    1
3965    0
       ..
1579    1
1962    0
5080    0
2238    0
557     0
Name: label_num, Length: 3464, dtype: int64


In [12]:
#Building of model: takes in a string input, converts it to a vector, and makes it go through four layers

def create_new_model(lambdaValue):
    model =  tf.keras.models.Sequential(
        [
        tf.keras.Input(shape=(1,), dtype=tf.string),
        vectorize_layer,
        Dense(units = 25, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(lambdaValue)),
        Dense(units = 15, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(lambdaValue)),
        Dense(units = 10, activation = 'relu', kernel_regularizer=tf.keras.regularizers.l2(lambdaValue)),
        Dense(units = 1, activation = 'sigmoid') 

    ]
    ) 
    return model


In [13]:
# Compiles and fits the model

for i in range(len(lambda_values)):
    models.append(create_new_model(lambda_values[i]))
for model in models:
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    )
    model.fit(
        X_train, y_train,
        epochs=40
    )

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epo

In [20]:
#Training data error calculation
metrics = []
predictions = []
m_train = len(y_train)
m_test = len(y_test)
tracker = 0

for mod in models:
    train_prediction = mod.predict(X_train)
    current_train_predictions = train_prediction >= 0.5
    training_correct = 0
    for i in range(m_train):
        if current_train_predictions[i] == y_train[i]:
           training_correct = training_correct + 1
    training_percentage = training_correct/m_train
    test_prediction = mod.predict(X_test)
    current_test_prediction = test_prediction >= 0.5
    testing_correct = 0
    true_positive = 0
    false_positive = 0
    false_neg = 0
    for i in range(m_test):
        if current_test_prediction[i] == y_test[i]:
            testing_correct = testing_correct + 1
    #else:
     #   print(str(prediction[i]) + " " + str(current_predictions[i]) + " " + str(y_test[i]))
        if y_test[i] == 1:
            if(current_test_prediction[i] == 1):
                true_positive = true_positive + 1
            else: 
                false_neg = false_neg + 1
        elif y_test[i] == 0: 
            if(current_test_prediction[i] == 1):
                false_positive = false_positive + 1
    testing_percentage = testing_correct/m_test
    if true_positive + false_positive == 0:
        precision = -1
    else:
        precision = true_positive/(true_positive + false_positive)
    recall = true_positive/(true_positive + false_neg)
    metrics.append((lambda_values[tracker],testing_percentage, training_percentage, precision, recall))
    predictions.append(train_prediction)
    tracker = tracker + 1
    

for x in metrics:
    print(x)
    print("\n")




#print(correct)

#for i in range(m):
#    print(str(prediction[i]) + " " + str(current_predictions[i]) + " " + str(y_train[i]))

(0.0, 0.9853544229642648, 1.0, 0.9778225806451613, 0.9719438877755511)


(0.001, 0.9830111306385472, 1.0, 0.9644268774703557, 0.9779559118236473)


(0.002, 0.9830111306385472, 1.0, 0.9662698412698413, 0.9759519038076152)


(0.004, 0.9794961921499707, 0.9988452655889145, 0.9734693877551021, 0.9559118236472945)


(0.008, 0.9824253075571178, 0.9997113163972287, 0.9571150097465887, 0.9839679358717435)


(0.01, 0.9789103690685413, 0.996824480369515, 0.9548133595284872, 0.9739478957915831)


(0.02, 0.9830111306385472, 0.9971131639722863, 0.9681274900398407, 0.9739478957915831)


(0.04, 0.9794961921499707, 0.995958429561201, 0.9566929133858267, 0.9739478957915831)


(0.08, 0.9578207381370826, 0.9740184757505773, 0.9797752808988764, 0.87374749498998)


(0.1, 0.9390743995313415, 0.9627598152424942, 0.9759036144578314, 0.811623246492986)


(0.2, 0.7076742823667252, 0.7113163972286374, -1, 0.0)


(0.4, 0.7076742823667252, 0.7113163972286374, -1, 0.0)




In [36]:
print(predictions[3])

[[1.8953878e-03]
 [2.1968722e-06]
 [9.9836594e-01]
 ...
 [1.3630579e-03]
 [1.0798749e-03]
 [4.6668983e-08]]


In [92]:
#Testing data prediction

prediction = model.predict(X_test)
current_predictions = prediction >= 0.5
m = len(y_test)
correct = 0
true_positive = 0
false_positive = 0
false_neg = 0
for i in range(m):
    if current_predictions[i] == y_test[i]:
        correct = correct + 1
    else:
        print(str(prediction[i]) + " " + str(current_predictions[i]) + " " + str(y_test[i]))
    if y_test[i] == 1:
        if(current_predictions[i] == 1):
            true_positive = true_positive + 1
        else: 
            false_neg = false_neg + 1
    elif y_test[i] == 0: 
        if(current_predictions[i] == 1):
            false_positive = false_positive + 1

#print("Precision: " + str(true_positive/(true_positive + false_positive)))
#print("Recall: " + str(true_positive/(true_positive + false_neg)))
#print(correct/m)
#print(m - correct)



[0.99827325] [ True] 0
[0.8357998] [ True] 0
[0.9999998] [ True] 0
[0.8446621] [ True] 0
[0.99419993] [ True] 0
[0.02697792] [False] 1
[0.9984414] [ True] 0
[0.99901587] [ True] 0
[6.213976e-08] [False] 1
[0.98169065] [ True] 0
[0.13938086] [False] 1
[1.9749943e-06] [False] 1
[0.51889974] [ True] 0
[0.70899504] [ True] 0
[0.3506647] [False] 1
[0.00074368] [False] 1
[0.94985217] [ True] 0
[0.00724217] [False] 1
[0.8656552] [ True] 0
[1.8059904e-32] [False] 1
[2.8101109e-05] [False] 1
[0.00088663] [False] 1
[1.3561309e-08] [False] 1
[0.8948978] [ True] 0
[0.56893057] [ True] 0
[0.82568794] [ True] 0
[0.00016525] [False] 1
[0.49308693] [False] 1
[0.56937325] [ True] 0
[0.41436535] [False] 1
[1.] [ True] 0
[0.7243506] [ True] 0
[0.06220069] [False] 1
[0.95988274] [ True] 0
[0.16102728] [False] 1
[0.19289881] [False] 1
[0.9999035] [ True] 0
[0.05568629] [False] 1
[0.01268796] [False] 1
[0.06797132] [False] 1
