# LSTM for SMS Spam (Fraud) Detection

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

# Method 1: read the SMS corpus and derive a numeric `target` column from the
# textual `label` column through a dictionary lookup (spam -> 1, ham -> 0).
df = pd.read_csv('./spam.csv').assign(
    target=lambda frame: frame['label'].map({'spam': 1, 'ham': 0})
)
print(df.head())

# Method 2 (disabled alternative): build the encoding with np.select instead.
# Note that, as written, it would store the string values '0'/'1' in a column
# named 'tier' rather than integers in 'target'.
# conditions = [
#     (df['label']=='ham'),
#     (df['label']=='spam')
#     ]
# values = ['0', '1']
# df['tier'] = np.select(conditions, values)


  label                                                sms  target
0   ham  Go until jurong point, crazy.. Available only ...       0
1   ham                      Ok lar... Joking wif u oni...       0
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...       1
3   ham  U dun say so early hor... U c already then say...       0
4   ham  Nah I don't think he goes to usf, he lives aro...       0


In [2]:
from sklearn.model_selection import train_test_split

# Inputs are the raw message texts; labels are the 0/1 `target` column.
X, y = df['sms'].values, df['target'].values

# Hold out 20% of the messages; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Peek at the first few training samples and their labels ...
print(X_train[:5], y_train[:5], sep='\n')
print('==============')
# ... followed by the sizes of the resulting arrays.
print(X_train.shape, X_test.shape, y_train.shape, sep='\n')

['Sleeping nt feeling well'
 'Come aftr  &lt;DECIMAL&gt; ..now i m cleaning the house'
 'Almost there, see u in a sec' 'Yeah, probably earlier than that'
 'Hello, my love. What are you doing? Did you get to that interview today? Are you you happy? Are you being a good boy? Do you think of me?Are you missing me ?']
[0 0 0 0 0]
(4457,)
(1115,)
(4457,)


In [3]:
# Learn the vocabulary from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Mapping from integer index to word, as exposed by the fitted tokenizer.
word_dict = tokenizer.index_word

# Vocabulary size first, then the complete index -> word mapping.
print(len(word_dict), word_dict, sep='\n')

8004


In [4]:
# List every vocabulary entry as "<index> <word>", one per line.
for idx, word in word_dict.items():
    print(idx, word)

1 i
2 to
3 you
4 a
5 the
6 u
7 and
8 in
9 is
10 me
11 my
12 for
13 your
14 it
15 of
16 call
17 have
18 that
19 on
20 now
21 2
22 so
23 are
24 but
25 or
26 do
27 can
28 not
29 if
30 i'm
31 with
32 at
33 ur
34 get
35 will
36 just
37 be
38 no
39 we
40 this
41 up
42 4
43 gt
44 lt
45 when
46 go
47 ok
48 from
49 how
50 free
51 all
52 know
53 what
54 out
55 then
56 like
57 good
58 got
59 come
60 was
61 am
62 its
63 day
64 time
65 only
66 love
67 he
68 there
69 want
70 text
71 send
72 as
73 i'll
74 lor
75 by
76 need
77 one
78 see
79 back
80 going
81 txt
82 about
83 still
84 r
85 sorry
86 today
87 k
88 home
89 stop
90 dont
91 please
92 our
93 her
94 she
95 mobile
96 take
97 any
98 reply
99 don't
100 n
101 da
102 tell
103 they
104 hi
105 pls
106 think
107 new
108 later
109 been
110 here
111 some
112 ì
113 did
114 much
115 msg
116 week
117 an
118 too
119 has
120 who
121 dear
122 oh
123 night
124 well
125 1
126 d
127 phone
128 great
129 give
130 way
131 work
132 had
133 c
134 make
135 claim
136 mo

In [5]:
# Convert every message into the list of integer indices of its words.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Sanity check: show the first encoded message, then decode it back to words
# (each word followed by a single space, no trailing newline).
print(X_train_seq[0])
print(''.join(word_dict[idx] + ' ' for idx in X_train_seq[0]), end='')

[709, 808, 604, 124]
sleeping nt feeling well 

In [6]:
# Bring every sequence to exactly 20 entries, appending zeros at the end of
# shorter ones (padding='post'); both splits get the same treatment.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=20, padding='post')
    for seqs in (X_train_seq, X_test_seq)
)

# First padded training sample and the overall shape of the training matrix.
print(X_train_pad[0], X_train_pad.shape, sep='\n')

[709 808 604 124   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
(4457, 20)


In [7]:
# Length of the padded input sequences fed to the network.
laenge_pads = 20

# Embedding -> LSTM -> single sigmoid unit (binary spam/ham output).
# The vocabulary size is len(word_dict) + 1 because word indices start at 1
# and index 0 is used for padding.
lstm_model = Sequential([
    Embedding(
        input_dim=len(word_dict) + 1,
        output_dim=20,
        input_length=laenge_pads,
    ),
    LSTM(400),
    Dense(1, activation='sigmoid'),
])

lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
lstm_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 20)            160100    
                                                                 
 lstm (LSTM)                 (None, 400)               673600    
                                                                 
 dense (Dense)               (None, 1)                 401       
                                                                 
Total params: 834,101
Trainable params: 834,101
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train for 10 epochs in mini-batches of 64; the held-out test split doubles as
# the validation set that is evaluated after every epoch.
history = lstm_model.fit(
    X_train_pad,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test_pad, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
# Classify a single hand-written message with the trained model.
sms_test = ['Hi Paul, would you come around tonight']
sms_seq = tokenizer.texts_to_sequences(sms_test)

# Same preprocessing as for the training data: 20 entries, zeros appended.
sms_pad = pad_sequences(sms_seq, maxlen=20, padding='post')
print(sms_pad)

# The network ends in a single sigmoid unit, so predict() yields one
# probability per message with shape (n_messages, 1). Taking argmax over
# axis 1 of a one-column array always returns 0, i.e. "ham" no matter what the
# model predicted. Threshold the probability at 0.5 instead (1 = spam, 0 = ham).
predict_x = lstm_model.predict(sms_pad)
classes_x = (predict_x.ravel() > 0.5).astype(int)
print(classes_x)

[[ 104 7302  163    3   59  215  231    0    0    0    0    0    0    0
     0    0    0    0    0    0]]
[0]


In [15]:
# Classify a second hand-written message, this time a typical spam phrase.
sms_test = ['Free SMS service for anyone']
sms_seq = tokenizer.texts_to_sequences(sms_test)

# Same preprocessing as for the training data: 20 entries, zeros appended.
sms_pad = pad_sequences(sms_seq, maxlen=20, padding='post')
print(sms_pad)

# The network ends in a single sigmoid unit, so predict() yields one
# probability per message with shape (n_messages, 1). Taking argmax over
# axis 1 of a one-column array always returns 0, i.e. "ham" no matter what the
# model predicted. Threshold the probability at 0.5 instead (1 = spam, 0 = ham).
predict_x = lstm_model.predict(sms_pad)
classes_x = (predict_x.ravel() > 0.5).astype(int)
print(classes_x)

[[ 50 252 219  12 645   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]]
[0]
