# Spam Classifier with Keras


In [None]:
import os
import pathlib
import pandas as pd
import pickle

In [None]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Model, Sequential

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Base directories: exported datasets and the companion guide notebooks.
EXPORT_DIR = pathlib.Path("/datasets/exports/")
GUIDES_DIR = pathlib.Path("/guides/spam-classifier/")

# Files that live under EXPORT_DIR.
DATASET_CSV_PATH = EXPORT_DIR.joinpath("spam-dataset.csv")
TRAINING_DATA_PATH = EXPORT_DIR.joinpath("spam-training-data.pkl")

# Part two of the guide series, stored under GUIDES_DIR.
PART_TWO_GUIDE_PATH = GUIDES_DIR.joinpath("2 - Convert Dataset into Vectors.ipynb")

In [None]:
!mkdir -p "$EXPORT_DIR"
!mkdir -p "$GUIDES_DIR"
!curl "https://raw.githubusercontent.com/codingforentrepreneurs/AI-as-an-API/main/datasets/exports/spam-dataset.csv" -o "$DATASET_CSV_PATH"
!curl "https://raw.githubusercontent.com/codingforentrepreneurs/AI-as-an-API/main/guides/spam-classifier/2%20-%20Convert%20Dataset%20into%20Vectors.ipynb" -o "$PART_TWO_GUIDE_PATH"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  729k  100  729k    0     0  2023k      0 --:--:-- --:--:-- --:--:-- 2021k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 15408  100 15408    0     0  61431      0 --:--:-- --:--:-- --:--:-- 61632


In [None]:
# The CSV stores its row index as an unnamed first column; read it as the
# index so it does not appear as a spurious "Unnamed: 0" data column.
df = pd.read_csv(DATASET_CSV_PATH, index_col=0)
df.head()

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",uci-spam-sms
1,ham,Ok lar... Joking wif u oni...,uci-spam-sms
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,uci-spam-sms
3,ham,U dun say so early hor... U c already then say...,uci-spam-sms
4,ham,"Nah I don't think he goes to usf, he lives aro...",uci-spam-sms


In [None]:
# Execute the downloaded Part Two notebook; its run reports the number of
# unique tokens it found.
# NOTE(review): presumably this is also what writes the pickle at
# TRAINING_DATA_PATH that is loaded afterwards — confirm against that notebook.
%run "$PART_TWO_GUIDE_PATH"

BASE_DIR is /
Random Index 4319
Found 9538 unique tokens.


In [None]:
# Load the pickled training bundle. No empty-dict placeholder is assigned
# first: if the load fails, `data` should stay undefined rather than turn
# into a misleading KeyError in a later cell.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load files that come from a trusted source.
with TRAINING_DATA_PATH.open('rb') as f:
    data = pickle.load(f)

## Transform Extracted Dataset

In [None]:
# Unpack the loaded bundle into notebook-level names.

# Train / test splits.
X_train, y_train = data['X_train'], data['y_train']
X_test, y_test = data['X_test'], data['y_test']

# Label lookup tables.
legend, labels_legend_inverted = data['legend'], data['labels_legend_inverted']

# Fitted tokenizer and the size limits stored alongside it.
tokenizer = data['tokenizer']
max_words, max_sequence = data['max_words'], data['max_sequence']

## Create our LSTM Model

In [None]:
# Model hyperparameters.
embed_dim = 128  # size of each embedding vector
lstm_out = 196   # number of LSTM units

# Embedding -> SpatialDropout1D -> LSTM -> 2-way softmax classifier.
model = Sequential()
# Use the vocabulary size loaded from the training bundle (`max_words`)
# rather than MAX_NUM_WORDS, which this notebook never defines and which was
# only available as state leaked from the %run of the Part Two notebook.
# NOTE(review): assumes data['max_words'] holds the same value as
# MAX_NUM_WORDS — confirm against the Part Two notebook.
model.add(Embedding(max_words, embed_dim, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 280, 128)          35840     
                                                                 
 spatial_dropout1d (Spatial  (None, 280, 128)          0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 2)                 394       
                                                                 
Total params: 291034 (1.11 MB)
Trainable params: 291034 (1.11 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [None]:
# Training settings.
batch_size = 32
epochs = 1

# Fit on the training split and validate against the test split.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_test, y_test),
)



<keras.src.callbacks.History at 0x7842e6f8bf70>

In [None]:
# Save the trained model to a `.h5` file; the path is relative, so it
# resolves against the current working directory.
MODEL_EXPORT_PATH = 'spam-model.h5'
model.save(MODEL_EXPORT_PATH)

  saving_api.save_model(


## Predict new data

In [None]:
import numpy as np

def predict(text_str, max_words=280, max_sequence=280, tokenizer=None):
    """Score a single text with the notebook-level `model`.

    Args:
        text_str: Raw text to classify.
        max_words: Unused by this function; kept so existing callers that
            pass it keep working.
        max_sequence: Length passed to `pad_sequences` as `maxlen`.
        tokenizer: Object providing `texts_to_sequences`. When it is falsy
            (e.g. None) no prediction is attempted.

    Returns:
        None when no tokenizer is given. Otherwise a list with one
        single-entry dict per model output, in output order, mapping the
        label found in `labels_legend_inverted[str(index)]` to its score.
    """
    if not tokenizer:
        return None
    sequences = tokenizer.texts_to_sequences([text_str])
    x_input = pad_sequences(sequences, maxlen=max_sequence)
    # Only one text was submitted, so take the first (only) prediction row.
    preds = model.predict(x_input)[0]
    return [
        {f"{labels_legend_inverted[str(i)]}": score}
        for i, score in enumerate(preds)
    ]

In [None]:
# Try the classifier on a sample message, using the tokenizer and limits
# loaded from the training bundle.
predict(
    "i am spam",
    tokenizer=tokenizer,
    max_sequence=max_sequence,
    max_words=max_words,
)