In [8]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as pt 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [9]:
# Load the SMS data from spam.csv.
# Only the first two columns are read; the next cell accesses them as
# 'labels' and 'data'. The file is decoded as ISO-8859-1.
dataset = pd.read_csv('spam.csv', usecols=[0, 1], encoding='ISO-8859-1')

# A bare expression is rendered only when it is the last line of a cell,
# so the preview has to be printed explicitly to show up in the output.
print(dataset.head())

# Number of rows loaded (rendered as the cell's output).
len(dataset)

5572

In [10]:
# Clean the label text and encode it as 0 (not spam) / 1 (spam).
# NOTE: this cell overwrites dataset['labels'] with integers, so re-run the
# load cell before running it a second time (.str only works on text columns).
label_map = {'not spam': 0, 'spam': 1}

# Remove the extra whitespace around each label
cleaned_labels = dataset['labels'].str.strip()

# Series.map() silently turns every value missing from the dict into NaN,
# so reject unexpected labels up front instead of propagating NaN targets.
unmapped = cleaned_labels[~cleaned_labels.isin(list(label_map))]
if not unmapped.empty:
    raise ValueError(
        f"Unexpected label values in 'labels' column: {unmapped.unique().tolist()}"
    )

# Map the labels to binary values
dataset['labels'] = cleaned_labels.map(label_map)

# Convert to NumPy arrays
sentences = dataset['data'].values
labels = dataset['labels'].values

print(sentences[:10])
print(labels[:10])

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 'U dun say so early hor... U c already then say...'
 "Nah I don't think he goes to usf, he lives around here though"
 "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"
 'Even my brother is not like to speak with me. They treat me like aids patent.'
 "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"
 'WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'
 'Had your mobil

In [11]:
# Split the data into training and testing sets.
# The split is sequential (no shuffling): the first 70% of the rows are used
# for training and the remaining rows for testing.
training_size = int(0.7 * len(sentences))

training_sentences, testing_sentences = (
    sentences[:training_size],
    sentences[training_size:],
)
training_labels, testing_labels = (
    labels[:training_size],
    labels[training_size:],
)

# Preview the first five items of each split
print("Training Sentences:", training_sentences[:5])
print("Training Labels:", training_labels[:5])
print("Testing Sentences:", testing_sentences[:5])
print("Testing Labels:", testing_labels[:5])

Training Sentences: ['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 'U dun say so early hor... U c already then say...'
 "Nah I don't think he goes to usf, he lives around here though"]
Training Labels: [0 0 1 0 0]
Testing Sentences: ['That depends. How would you like to be treated? :)'
 'Right on brah, see you later'
 'Waiting in e car 4 my mum lor. U leh? Reach home already?'
 'Your 2004 account for 07XXXXXXXXX shows 786 unredeemed points. To claim call 08719181259 Identifier code: XXXXX Expires 26.03.05'
 'Do you want a new video handset? 750 anytime any network mins? Half Price Line Rental? Camcorder? Reply or call 08000930705 for delivery tomorrow']
Testing Labels: [0 0 0 1 1]


In [12]:
# Tokenize the training sentences.
# The vocabulary is built from the training split only; num_words caps how
# many of the most frequent words are kept when texts are converted later.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(training_sentences)

# word_index maps every word seen during fitting to its integer index
print("Number of unique words:", len(tokenizer.word_index))


Number of unique words: 7439


In [13]:
# Convert every sentence into its sequence of integer tokens, using the
# tokenizer fitted on the training sentences for both splits.
train_sequences, test_sequences = map(
    tokenizer.texts_to_sequences,
    (training_sentences, testing_sentences),
)

# Preview the first five encoded sentences of each split
print("Training Sequences:", train_sequences[:5])
print("Testing Sequences:", test_sequences[:5])



Training Sequences: [[54, 426, 3510, 711, 786, 640, 65, 8, 1138, 85, 121, 309, 1297, 131, 2360, 1024, 66, 59, 3511, 136], [46, 331, 2361, 562, 6, 1817], [50, 427, 8, 22, 4, 859, 934, 2, 170, 1513, 1025, 534, 1514, 1818, 253, 1819, 71, 1513, 2, 1820, 2, 347, 427, 490, 935, 72, 443, 185, 602, 388, 2362], [6, 254, 152, 25, 320, 3512, 6, 142, 148, 55, 152], [860, 1, 92, 99, 69, 428, 2, 1139, 69, 1821, 203, 102, 491]]
Testing Sequences: [[19, 1225, 52, 172, 3, 58, 2, 29, 6154], [145, 18, 7353, 96, 3, 120], [244, 8, 131, 300, 44, 11, 775, 86, 6, 528, 371, 83, 148], [13, 1546, 356, 12, 3061, 383, 2420, 2421, 809, 2, 123, 17, 1046, 492, 6671, 1047, 732, 3145], [30, 3, 73, 4, 100, 440, 2492, 825, 826, 101, 444, 373, 350, 573, 305, 1213, 942, 88, 27, 17, 943, 12, 567, 150]]


In [14]:
# Pad the sequences so that they all have the same length.
# The training sequences are padded first; the resulting width is then
# reused as maxlen for the test sequences so both arrays have matching shapes.
padded_train = pad_sequences(train_sequences)

# padded_train is 2-D: (number of sequences, padded sequence length)
no_of_sequences, length_of_sequence = padded_train.shape

padded_test = pad_sequences(test_sequences, maxlen=length_of_sequence)

print("No. of training sequences:", no_of_sequences)
print("Length of training sequence:", length_of_sequence)
print("Shape of padded test sequences:", padded_test.shape)
print(padded_train[0])



No. of training sequences: 3900
Length of training sequence: 189
Shape of padded test sequences: (1672, 189)
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0   54  426 3510  711  786  640   6