In [1]:
import pathlib
import pandas as pd
import random

# Project layout, derived from the current working directory: the project
# root is taken to be the parent of the directory the notebook runs in.
BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR.joinpath("datasets")
EXPORT_DIR = DATASET_DIR.joinpath("exports")
# Create the exports folder (and any missing parents); no error if it already exists.
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Input: the spam dataset CSV read by this notebook.
SPAM_DATASET_PATH = EXPORT_DIR.joinpath("spam-dataset.csv")

# Outputs: pickled training bundle and the tokenizer JSON.
METADATA_EXPORT_PATH = EXPORT_DIR.joinpath("spam-metadata.pkl")
TOKENIZER_EXPORT_PATH = EXPORT_DIR.joinpath("spam-tokenizer.json")
print(f"BASE_DIR is {BASE_DIR}")

BASE_DIR is C:\Users\a\Dev\ai-api


In [2]:
# Load the labelled spam/ham dataset from the exports folder into a DataFrame.
df = pd.read_csv(SPAM_DATASET_PATH)

In [3]:
# Preview the first rows to confirm the expected columns (label, text, source).
df.head()

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(7528, 3)

In [5]:
# Pull the label and text columns out as plain Python lists (same row order as df).
labels = df['label'].tolist()
texts = df['text'].tolist()

In [6]:
# Class name -> integer id used for training.
label_legend = dict(ham=0, spam=1)
# Reverse lookup; keys are the *string* form of the id ('0', '1'), so callers
# must wrap the integer id in str() before indexing.
label_legend_inverted = {str(class_id): name for name, class_id in label_legend.items()}
label_legend_inverted

{'0': 'ham', '1': 'spam'}

In [7]:
# Encode every string label as its integer class id via label_legend.
labels_as_int = [label_legend[label_name] for label_name in labels]

In [8]:
# labels_as_int

In [9]:
# Spot-check the reverse lookup on one arbitrary sample (index 120):
# the integer id must be converted to str because the inverted legend has string keys.
label_legend_inverted[str(labels_as_int[120])]

'spam'

In [10]:
# Spot-check one random row: the extracted lists and the integer encoding
# must agree with the source DataFrame.
# randrange(n) draws from 0..n-1. The previous randint(0, n) is inclusive of n
# and could pick an index one past the end of the lists (IndexError).
random_idx = random.randrange(len(labels))
print('Random Index', random_idx)

# Positional (.iloc) access throughout, so the check does not depend on index labels.
assert labels[random_idx] == df['label'].iloc[random_idx]
assert texts[random_idx] == df.iloc[random_idx]['text']
assert label_legend_inverted[str(labels_as_int[random_idx])] == df.iloc[random_idx]['label']

Random Index 3609


In [11]:
import numpy as np

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
# Vocabulary cap handed to the Keras Tokenizer as `num_words`.
max_num_words = 280

In [13]:
# Fit a word-level tokenizer on the full corpus, then encode every text as a
# list of integer word ids.
# NOTE(review): num_words appears to cap only the ids emitted by
# texts_to_sequences (most frequent words), not the size of word_index —
# confirm against the Keras Tokenizer docs.
tokenizer = Tokenizer(num_words = max_num_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [14]:
# sequences
# tokenizer.word_index
# Number of distinct words in the tokenizer's word_index after fitting.
len(tokenizer.word_index)

12077

In [15]:
# Token count of the longest encoded message.
max(map(len, sequences))

153

In [16]:
# print(sequences[15])

In [17]:
# Target sequence length passed to pad_sequences.
maxlen = 300

In [18]:
# Bring every encoded sequence to length `maxlen`, producing the feature matrix X.
# NOTE(review): with the default arguments the padding zeros appear to be added
# at the front of each row — confirm against the pad_sequences docs.
X = pad_sequences(sequences, maxlen=maxlen)

In [19]:
# After padding, the longest row should equal maxlen.
max(len(row) for row in X)

300

In [20]:
# Convert the list of integer class ids into a NumPy array for to_categorical.
labels_as_int_arr = np.asarray(labels_as_int)
labels_as_int_arr

array([0, 0, 1, ..., 0, 0, 0])

In [21]:
# One-hot encode the integer labels: 0 (ham) -> [1, 0], 1 (spam) -> [0, 1].
y = to_categorical(labels_as_int_arr)
y

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [22]:
# Sanity check: exactly one encoded sequence and one label per input text.
assert len({len(sequences), len(texts), len(labels_as_int)}) == 1

In [23]:
from sklearn.model_selection import train_test_split
import pickle

In [24]:
# Hold out 25% of the samples for evaluation; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2505,
)

In [26]:
# Everything needed downstream goes into one bundle: the split arrays, the
# padding length, both label legends, the fitted tokenizer and its vocabulary cap.
training_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    maxlen=maxlen,
    label_legend=label_legend,
    label_legend_inverted=label_legend_inverted,
    tokenizer=tokenizer,
    num_words=max_num_words,
)

# JSON string form of the fitted tokenizer, exported separately from the pickle.
tokenizer_json = tokenizer.to_json()

In [27]:
# Report the size of the tokenizer's word index.
print(f'Found {len(tokenizer.word_index)} unique tokens.')

Found 12077 unique tokens.


In [28]:
# Persist the training bundle as a pickle next to the dataset.
with METADATA_EXPORT_PATH.open('wb') as metadata_file:
    pickle.dump(training_data, metadata_file)

# Save the tokenizer JSON; write_text returns the number of characters
# written, which becomes the cell's output.
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

1090335

In [29]:
# Round-trip check: read the exported pickle back into memory.
# NOTE: pickle.load can execute arbitrary code — only load files produced by
# a trusted run of this notebook.
data = {}

with METADATA_EXPORT_PATH.open('rb') as metadata_file:
    data = pickle.load(metadata_file)

In [30]:
# Display the reloaded bundle to confirm its keys and values survived the round trip.
data

{'X_train': array([[  0,   0,   0, ...,  40,  10,   4],
        [  0,   0,   0, ...,   0,   0,   0],
        [  0,   0,   0, ...,  12,   1, 140],
        ...,
        [  0,   0,   0, ..., 201,  78,   9],
        [  0,   0,   0, ...,   6,  53, 224],
        [  0,   0,   0, ...,  37, 235, 117]]),
 'X_test': array([[  0,   0,   0, ...,   1,  29, 222],
        [  0,   0,   0, ...,  33,  41,  77],
        [  0,   0,   0, ...,  81, 119,  46],
        ...,
        [  0,   0,   0, ..., 168,   6, 262],
        [  0,   0,   0, ...,  98,   2,  18],
        [  0,   0,   0, ...,   3,  49,  19]]),
 'y_train': array([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]], dtype=float32),
 'y_test': array([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]], dtype=float32),
 'maxlen': 300,
 'label_legend': {'ham': 0, 'spam': 1},
 'label_legend_inverted': {'0': 'ham', '1': 'spam'},
 't