In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer   # Term Frequency - Inverse Document Frequency
import spacy
import tensorflow as tf
nlp = spacy.load('en_core_web_sm')

# Location of the labelled e-mail corpus (the path looks like a Colab Drive mount;
# change it here when running elsewhere).
DATA_PATH = "/content/drive/MyDrive/Datasets/spam_ham_dataset.csv"

# The default C parser fails on this file with
# "ParserError: Error tokenizing data. C error: EOF inside string starting at row 1989",
# so the more tolerant Python engine is used instead.
data = pd.read_csv(DATA_PATH, engine="python")

# dropna() returns a new frame, so its result has to be assigned; calling it
# without assignment removes nothing. Only the two columns used below are
# checked, so a gap in an unused column does not discard a usable e-mail.
df = data.dropna(subset=["text", "label_num"])

X = df["text"]        # e-mail text
y = df["label_num"]   # numeric label; the prediction cell maps 1 to 'spam'

# The train/test split is done after vectorization, further down.

X

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object

In [None]:
# 1. Lowercase every e-mail so that differently-cased spellings of a word
#    are treated as the same token by the later steps.
X = X.str.lower()

X

0       subject: enron methanol ; meter # : 988291\r\n...
1       subject: hpl nom for january 9 , 2001\r\n( see...
2       subject: neon retreat\r\nho ho ho , we ' re ar...
3       subject: photoshop , windows , office . cheap ...
4       subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    subject: put the 10 on the ft\r\nthe transport...
5167    subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    subject: calpine daily gas nomination\r\n>\r\n...
5169    subject: industrial worksheets for august 2000...
5170    subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object

In [None]:
# 2. Remove special characters: every character outside the class below is
#    replaced by a single space (runs are not collapsed).
# NOTE(review): the '+' sits inside the brackets, so literal plus signs are
# kept; it does not act as a "one or more" quantifier -- confirm this is intended.
spl_chars_removed = [re.sub('[^A-Za-z0-9+]', " ", email_text) for email_text in X]
X = spl_chars_removed

X[0]


'subject  enron methanol   meter     988291  this is a follow up to the note i gave you on monday   4   3   00   preliminary  flow data provided by daren      please override pop   s daily volume   presently zero   to reflect daily  activity you can obtain from gas control    this change is needed asap for economics purposes  '

In [None]:
# Sanity check of the two spaCy token flags used by the stop-word/whitespace filter.
# A cell only displays its final expression, so both flags are shown as one
# tuple instead of silently discarding the first result.
nlp('the')[0].is_stop, nlp('   ')[0].is_space

True

In [None]:
# 4. Lemmatize and remove stop words
# Reuses the `nlp` pipeline loaded in the first cell; importing spaCy and
# loading the model again here only repeated that work.
# nlp.pipe() feeds the e-mails through the pipeline in batches rather than
# invoking it once per document.
tokenized_emails = [
    " ".join(
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_space)   # skip stop words and whitespace-only tokens
    )
    for doc in nlp.pipe(X)
]

X = tokenized_emails


'\n  Lemmatization is not required for email classification because the model is going to predict on certain keywords.\n  The test data is going to be lemmatized too, so lemmatization may help. It\'s required.\n'

In [None]:
# Tokenizer scratch-pad.
# A blank English pipeline and a sample document are created here, but the
# vocabulary below is produced with the full `nlp` pipeline, not with `nlp2`.
nlp2 = spacy.blank('en')
doc2 = nlp2('I am fred eat\'s')

sample_doc = nlp('I am fred let"s')
vocab = [tok.text for tok in sample_doc]
vocab

['I', 'am', 'fred', 'let"s']

The TF-IDF vectorizer builds a vocabulary from the corpus and turns each email into a vector with one TF-IDF weight per vocabulary term.

In [None]:
# 5. Vectorize
from sklearn.feature_extraction.text import TfidfVectorizer

# fit_transform() learns the vocabulary and weights and encodes X in one call;
# for new text, call vectorizer.transform(...) on the already fitted vectorizer.
# NOTE(review): the vectorizer is fitted on every e-mail before the split
# below, so the held-out rows also influence the learned weights -- consider
# fitting on the training portion only.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

# Dense feature matrix and label array that are split below.
X_tfidf_array = X_tfidf.toarray()
y = np.array(y)

# 90/10 split with a fixed seed so the partition is repeatable.
split_parts = train_test_split(X_tfidf_array, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = split_parts


(5171, 45991)
(4653, 45991)
[0 0 0 ... 0 0 1]


In [None]:
X_train.shape, y_train.shape         # ((training rows, vocabulary size), (training rows,)) - columns are the TF-IDF features

# for token in nlp(X[0]):
#   print(token, token.pos_)

((4653, 45991), (4653,))

In [None]:
# `tf` is imported in the first cell; the duplicate import is not needed here.

# Seed TensorFlow so weight initialisation is repeatable between runs.
tf.random.set_seed(42)

# Take the input width from the data instead of hard-coding the vocabulary
# size, which changes whenever the corpus or the preprocessing changes.
n_features = X_train.shape[1]

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_features,)),
    # Without an activation, this layer and the output layer compose into a
    # single linear map; ReLU makes the hidden layer add real capacity.
    tf.keras.layers.Dense(32, activation='relu'),
    # One sigmoid unit: a single score in [0, 1] per e-mail.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])


model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)


# `history` keeps the per-epoch metrics returned by fit().
history = model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Score the trained model on the held-out split. The model was compiled with
# one loss and metrics=['accuracy'], so evaluate() is unpacked into (loss, accuracy).
test_loss, test_accuracy = model.evaluate(X_test, y_test)

# print(X_test.shape)
# print(X_test.shape[0])



### Model prediction function

In [None]:
def model_predictions(model, X_predict, threshold=0.5):
  """Label every row of ``X_predict`` as ``'spam'`` or ``'ham'``.

  Args:
    model: Object whose ``predict(X_predict)`` returns one score per row,
      shaped ``(n, 1)`` or ``(n,)``.
    X_predict: Feature rows to score. Passed to ``model.predict`` unchanged,
      so it may be an array or a plain nested list; it does not need a
      ``.shape`` attribute.
    threshold: Scores strictly greater than this value are labelled
      ``'spam'``. The default of 0.5 gives the same labels as rounding a
      score in [0, 1] and testing for 1 (a score of exactly 0.5 is 'ham').

  Returns:
    A list of ``'spam'`` / ``'ham'`` strings, one per input row.

  Raises:
    ValueError: If the model returns more than one score per row.
  """
  scores = np.atleast_1d(np.asarray(model.predict(X_predict)))
  # Flatten (n, 1) -> (n,). The row count comes from the model output itself,
  # and the reshape fails loudly if there is not exactly one score per row.
  scores = scores.reshape((scores.shape[0],))

  # Direct comparison instead of comparing a rounded float to 1 with ==.
  return np.where(scores > threshold, 'spam', 'ham').tolist()


def preprocess_emails(texts):
  """Clean raw e-mail strings the same way the training text was cleaned.

  Applies, in order: lowercasing, replacement of special characters with
  spaces, and spaCy lemmatization with stop-word / whitespace tokens removed.
  Skipping these steps at prediction time would feed the vectorizer tokens
  in a different form from the ones it was fitted on.

  Args:
    texts: iterable of raw e-mail strings.

  Returns:
    A list of cleaned strings, one per input, ready for ``vectorizer.transform``.
  """
  cleaned = [re.sub('[^A-Za-z0-9+]', " ", text.lower()) for text in texts]
  return [
      " ".join(tok.lemma_ for tok in doc if not (tok.is_stop or tok.is_space))
      for doc in nlp.pipe(cleaned)
  ]


sample_emails = [
    'Dinner tonight in my house',
    'free money lottery coupons now',
    'youve won the lottery click the link',
    'Breakfast today in John\'s house',
    'The principal asked us to leave',
]

# transform() (not fit_transform) keeps the vocabulary learned during training.
test_emails = vectorizer.transform(preprocess_emails(sample_emails)).toarray()
model_predictions(model, test_emails)



['ham', 'spam', 'spam', 'ham', 'ham']

In [None]:
predictions = model_predictions(model, X_test)

# Printing every label floods the output; show a short sample and the
# per-class counts instead. The full list stays available in `predictions`.
print(predictions[:20])
pd.Series(predictions).value_counts()

['ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham'