In [1]:
import numpy as np
np.set_printoptions(precision=2)
import pandas as pd
import tensorflow as tf
from tensorflow import keras

In [2]:
sms_data = pd.read_csv("sms.tsv",names = ['label','message'],sep = '\t')
sms_data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Quick way of converting string label to numeric targets
sms_data['target'] = sms_data.label.map({'ham':0, 'spam':1})

In [4]:
sms_data.head()

Unnamed: 0,label,message,target
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
X = sms_data.message
y = sms_data.target

print(X.shape, y.shape)

(5572,) (5572,)


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,stratify = y,random_state =42)

In [8]:
kTokernizer = keras.preprocessing.text.Tokenizer()

In [9]:
kTokernizer.fit_on_texts(X_train)

In [10]:
print(kTokernizer.word_counts)

OrderedDict([('he', 147), ('will', 308), ('you', 1690), ('guys', 30), ('close', 14), ('can', 312), ('i', 1878), ('please', 108), ('come', 187), ('up', 248), ('now', 406), ('imin', 1), ('town', 25), ('dontmatter', 1), ('if', 304), ('urgoin', 1), ('outl8r', 1), ('just', 287), ('reallyneed', 1), ('2docd', 1), ('dontplease', 1), ('dontignore', 1), ('mycalls', 1), ('u', 939), ('no', 276), ('thecd', 1), ('isv', 1), ('important', 22), ('tome', 1), ('4', 264), ('2moro', 3), ('ok', 220), ('k', 123), ('sry', 3), ('knw', 13), ('2', 414), ('siva', 2), ('tats', 1), ('y', 38), ('askd', 9), ("i'll", 138), ('see', 121), ('but', 346), ('prolly', 1), ('yeah', 77), ('swing', 12), ('by', 146), ('in', 731), ('a', 1147), ('bit', 40), ('got', 207), ('some', 98), ('things', 40), ('to', 1787), ('take', 120), ('care', 60), ('of', 476), ('here', 95), ('firsg', 1), ('shall', 25), ('book', 20), ('chez', 1), ('jules', 1), ('for', 565), ('half', 29), ('eight', 3), ("that's", 43), ('with', 294), ('thanks', 66), ('you

In [15]:
# mode: one of "binary", "count", "tfidf", "freq"
vect_kTokenizer = kTokernizer.texts_to_matrix(X_train, mode='tfidf')

# 0 is a reserved index that won't be assigned to any word.
print(vect_kTokenizer)

[[0.   0.   0.   ... 0.   0.   0.  ]
 [0.   1.47 0.   ... 0.   0.   0.  ]
 [0.   2.49 0.   ... 0.   0.   0.  ]
 ...
 [0.   0.   1.46 ... 7.71 0.   0.  ]
 [0.   0.   3.06 ... 0.   0.   0.  ]
 [0.   3.09 0.   ... 0.   7.71 7.71]]


# Convert tfidf to pandas DataFrame

In [16]:
pd_vect_kTokenizer = pd.DataFrame(data=vect_kTokenizer[:,1:], columns=[key for key in kTokernizer.word_index])

In [17]:
pd_vect_kTokenizer

Unnamed: 0,i,to,you,a,the,u,and,in,is,my,...,yhl,09058097189,thus,attended,anand,duchess,008704050406,arrow,grandmas,hungover
0,0.000000,0.000000,1.543390,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.471014,0.000000,0.000000,0.000000,0.000000,2.040206,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2.490644,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,1.471014,1.459528,0.000000,1.746841,0.000000,0.000000,0.000000,2.053381,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,1.471014,0.000000,1.543390,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,1.471014,0.000000,0.000000,1.746841,3.168112,0.000000,0.000000,2.053381,0.000000,2.297552,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,1.471014,1.459528,0.000000,0.000000,0.000000,4.868532,0.000000,0.000000,2.133069,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,1.471014,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.297552,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


# Using WordEmbedding - a dense vector representation

In [18]:
sms_data['message'].shape

(5572,)

In [19]:
X_train_seq = kTokernizer.texts_to_sequences(X_train)
X_test_seq = kTokernizer.texts_to_sequences(X_test)

In [34]:
print(kTokernizer.word_index['he'])

71


In [29]:
print(X_train[:1])
print(X_train_seq[:1])

184    He will, you guys close?
Name: message, dtype: object
[[71, 32, 3, 338, 669]]


In [30]:
maxlen = 50
X_train_pad = keras.preprocessing.sequence.pad_sequences(sequences=X_train_seq, maxlen=maxlen)
X_test_pad = keras.preprocessing.sequence.pad_sequences(sequences=X_test_seq, maxlen=maxlen)

In [37]:
#print(X_train)
#print(X_train_seq)
#print(X_train_pad)

In [39]:
model_embed = keras.models.Sequential()
model_embed.add(keras.layers.Embedding(input_dim=9009, output_dim=10, input_length=50))
model_embed.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 10)            90090     
Total params: 90,090
Trainable params: 90,090
Non-trainable params: 0
_________________________________________________________________


In [40]:
model_embed.compile('adam', 'mse')
#model_embed.fit(X_train_pad, epochs=2)

Instructions for updating:
Use tf.cast instead.


In [41]:
pd_vect_kTokenizer

Unnamed: 0,i,to,you,a,the,u,and,in,is,my,...,yhl,09058097189,thus,attended,anand,duchess,008704050406,arrow,grandmas,hungover
0,0.000000,0.000000,1.543390,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.471014,0.000000,0.000000,0.000000,0.000000,2.040206,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2.490644,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,1.471014,1.459528,0.000000,1.746841,0.000000,0.000000,0.000000,2.053381,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,1.471014,0.000000,1.543390,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,1.471014,0.000000,0.000000,1.746841,3.168112,0.000000,0.000000,2.053381,0.000000,2.297552,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,1.471014,1.459528,0.000000,0.000000,0.000000,4.868532,0.000000,0.000000,2.133069,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,1.471014,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.297552,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [43]:
#print(X_test)
#print(X_test_seq)
#print(X_test_pad)

model_embed.predict(X_test_pad)

array([[[ 0.  ,  0.03, -0.02, ...,  0.04, -0.04,  0.04],
        [ 0.  ,  0.03, -0.02, ...,  0.04, -0.04,  0.04],
        [ 0.  ,  0.03, -0.02, ...,  0.04, -0.04,  0.04],
        ...,
        [-0.05, -0.  ,  0.04, ...,  0.03, -0.  ,  0.03],
        [-0.05,  0.04, -0.05, ...,  0.05, -0.01, -0.04],
        [-0.  ,  0.05, -0.04, ..., -0.01, -0.03,  0.03]],

       [[ 0.  ,  0.03, -0.02, ...,  0.04, -0.04,  0.04],
        [ 0.  ,  0.03, -0.02, ...,  0.04, -0.04,  0.04],
        [ 0.  ,  0.03, -0.02, ...,  0.04, -0.04,  0.04],
        ...,
        [ 0.01, -0.02,  0.02, ..., -0.03,  0.03, -0.05],
        [-0.05, -0.  ,  0.04, ...,  0.03, -0.  ,  0.03],
        [-0.03,  0.04,  0.05, ...,  0.01,  0.03,  0.05]],

       [[ 0.  ,  0.03, -0.02, ...,  0.04, -0.04,  0.04],
        [ 0.  ,  0.03, -0.02, ...,  0.04, -0.04,  0.04],
        [ 0.  ,  0.03, -0.02, ...,  0.04, -0.04,  0.04],
        ...,
        [ 0.05,  0.01, -0.02, ..., -0.  ,  0.02,  0.02],
        [-0.03,  0.  ,  0.03, ..., -0.04, -0.

# DNN 