In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary that maps words to integer indexes and back.

    Indexes 0 and 1 are reserved for the "SOS" and "EOS" markers; every new
    word gets the next free index. Occurrence counts are tracked per word.
    """

    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Starts at 2 because SOS and EOS already occupy indexes 0 and 1.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every element of the iterable `sentence` as a word."""
        for token in sentence:
            self.addWord(token)

    def addWord(self, word):
        """Add `word` to the vocabulary, or bump its count if already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip combining marks from `s` after NFD decomposition.

    Characters whose Unicode category is 'Mn' (non-spacing mark) are removed;
    everything else is kept in order.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Normalisation hook. The lowercase / trim / remove-non-letter steps are
# commented out below, so the function currently returns its input untouched.


def normalizeString(s):
    """Return `s` unchanged (identity function).

    The original normalisation steps are kept below as commented-out code
    and are not executed.
    """
#     s = unicodeToAscii(s.lower().strip())
#     s = re.sub(r"([.!?])", r" \1", s)
#     s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [5]:
def readLangs(lang1, lang2, reverse=False):
    """Create input/output `Lang` objects and return them together with `pairs`.

    NOTE(review): this function cannot run as written and looks like an
    unfinished leftover:
      * `pairs` is read (in the comprehension and in the return statement)
        but is never given a value inside this function. Because it is
        assigned in the `reverse` branch it is a local name, so reading it
        raises UnboundLocalError.
      * `Lang.__init__` in this notebook takes no arguments, so
        `Lang(lang1)` / `Lang(lang2)` raise TypeError.
    No caller of this function is visible in this notebook; confirm whether
    it can be removed or should be completed.
    """
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [6]:
segment_len = 100

In [15]:
def prepareData(data_path='FraudedRawData', n_users=40, seg_length=100,
                n_train_lines=5000, n_val_users=10):
    """Read the per-user command files, cut them into segments and build the vocabulary.

    Files ``{data_path}/User{i}`` for ``i in range(n_users)`` are read with one
    command per line. Every complete run of ``seg_length`` consecutive lines
    becomes one segment (a list of command strings); a trailing partial
    segment is discarded.

    Split rule:
      * a segment whose last line index is below ``n_train_lines`` goes to the
        training set (for every user);
      * later segments go to the validation set for users ``i < n_val_users``
        and to the test set for all other users.

    Args:
        data_path: directory containing the ``User{i}`` files.
        n_users: number of user files to read.
        seg_length: number of commands per segment (must be positive).
        n_train_lines: number of leading lines per user used for training.
        n_val_users: users ``0 .. n_val_users-1`` provide validation segments.

    Returns:
        ``(segment_lang, train_segments, val_segments, test_segments)`` where
        ``segment_lang`` is a ``Lang`` built from all segments.

    Raises:
        ValueError: if ``seg_length`` is not positive.
    """
    if seg_length <= 0:
        raise ValueError("seg_length must be a positive integer")

    train_segments = []
    val_segments = []
    test_segments = []

    print("Reading lines...")
    for i in range(n_users):
        fname = f'{data_path}/User{i}'
        # The context manager closes the handle; the previous version leaked
        # one open file per user.
        with open(fname, 'r') as user_file:
            commands = [line.rstrip('\n') for line in user_file]

        # Only complete segments are kept; leftover lines at the end are dropped.
        for seg_start in range(0, len(commands) - seg_length + 1, seg_length):
            segment = commands[seg_start:seg_start + seg_length]
            last_line_ind = seg_start + seg_length - 1
            if last_line_ind < n_train_lines:  # train data
                train_segments.append(segment)
            elif i < n_val_users:  # validation
                val_segments.append(segment)
            else:  # test
                test_segments.append(segment)

    segment_lang = Lang()
    all_segments = train_segments + val_segments + test_segments
    print("Read %s segments" % len(all_segments))
    print("Counting words...")
    for seg in all_segments:
        segment_lang.addSentence(seg)
    print("Counted words:")
    print(segment_lang.n_words)
    return segment_lang, train_segments, val_segments, test_segments


lang, train_segments, val_segments, test_segments = prepareData()
# print(random.choice(pairs))

Reading lines...
Read 6000 segments
Counting words...
Counted words:
767


In [16]:
len(train_segments), len(val_segments), len(test_segments)

(2000, 1000, 3000)

In [17]:
# segments[0]

In [18]:
def indexesFromSentence(lang, segment):
    """Translate each word of `segment` into its index in `lang.word2index`.

    Raises KeyError for a word that is not in the vocabulary.
    """
    lookup = lang.word2index
    indexes = []
    for word in segment:
        indexes.append(lookup[word])
    return indexes


def tensorFromSentence(lang, segment):
    """Encode `segment` as a column tensor of word indexes on `device`.

    The result has dtype long and is reshaped with ``view(-1, 1)``, i.e. one
    row per word. No EOS token is appended.
    """
    token_ids = indexesFromSentence(lang, segment)
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def pairTensorsFromSentence(segment):
    """Return an ``(input_tensor, target_tensor)`` pair for one segment.

    The model is trained as an autoencoder, so the target is the same
    sequence as the input. Each tensor is built separately from the
    module-level `lang` vocabulary.
    """
    input_tensor, target_tensor = (
        tensorFromSentence(lang, segment) for _ in range(2)
    )
    return (input_tensor, target_tensor)

In [19]:
# pairTensorsFromSentence(segments[0])

### Encoder

In [20]:
"""
The encoder of a seq2seq network is a RNN that outputs some value for every word from the input sentence. 
For every input word the encoder outputs a vector and a hidden state, 
and uses the hidden state for the next input word.
"""
class EncoderRNN(nn.Module):
    """Single-step GRU encoder: embeds one token and advances the hidden state.

    `forward` is called once per input token; the embedding is reshaped to
    ``(1, 1, -1)`` before being passed to the GRU.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Attribute names are part of the saved state_dict keys; keep them.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Return ``(output, hidden)`` of the GRU for a single token index."""
        token_vec = self.embedding(input).view(1, 1, -1)
        return self.gru(token_vec, hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

### Attention Decoder

In [12]:
"""
If only the context vector is passed between the encoder and decoder, 
that single vector carries the burden of encoding the entire sentence.

Attention allows the decoder network to “focus” on a different part of the encoder’s outputs for every step 
of the decoder’s own outputs. First we calculate a set of attention weights. 
These will be multiplied by the encoder output vectors to create a weighted combination. 
The result (called attn_applied in the code) should contain information about that specific part of the input sequence, 
and thus help the decoder choose the right output words.
"""

'\nIf only the context vector is passed between the encoder and decoder, \nthat single vector carries the burden of encoding the entire sentence.\n\nAttention allows the decoder network to “focus” on a different part of the encoder’s outputs for every step \nof the decoder’s own outputs. First we calculate a set of attention weights. \nThese will be multiplied by the encoder output vectors to create a weighted combination. \nThe result (called attn_applied in the code) should contain information about that specific part of the input sequence, \nand thus help the decoder choose the right output words.\n'

In [13]:
class AttnDecoderRNN(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    Attention weights are produced by a linear layer over the concatenated
    token embedding and current hidden state, with one weight per encoder
    position (`seg_length` positions). The weighted encoder outputs are
    combined with the embedding, passed through the GRU and projected to
    log-probabilities over `output_size` words by two stacked linear layers.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, seg_length=segment_len):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p

        # Attribute names are part of the saved state_dict keys; keep them.
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(2 * hidden_size, seg_length)
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out1 = nn.Linear(hidden_size, hidden_size // 2)
        self.out2 = nn.Linear(hidden_size // 2, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns ``(log_probs, hidden, attn_weights)`` where `log_probs` is the
        log-softmax over the vocabulary and `attn_weights` is the softmax over
        encoder positions.
        """
        token_vec = self.dropout(self.embedding(input).view(1, 1, -1))[0]

        # One attention score per encoder position, from embedding + hidden state.
        attn_scores = self.attn(torch.cat([token_vec, hidden[0]], dim=1))
        attn_weights = F.softmax(attn_scores, dim=1)
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))[0]

        gru_input = self.attn_combine(torch.cat([token_vec, context], dim=1))
        gru_input = F.relu(gru_input.unsqueeze(0))
        gru_output, hidden = self.gru(gru_input, hidden)

        logits = self.out2(self.out1(gru_output[0]))
        return F.log_softmax(logits, dim=1), hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

### Training

In [14]:
"""
To train we run the input sentence through the encoder, 
and keep track of every output and the latest hidden state. 
Then the decoder is given the <SOS> token as its first input, 
and the last hidden state of the encoder as its first hidden state.
"""

teacher_forcing_ratio = 1#0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, seg_length=segment_len):
    """Run one optimisation step on a single (input, target) pair.

    The input is encoded token by token, then the decoder is unrolled for
    `target_length` steps starting from the SOS token and the encoder's last
    hidden state. With probability `teacher_forcing_ratio` the target token is
    fed as the next decoder input; otherwise the decoder's own top prediction
    is used.

    Returns the summed loss divided by the target length.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Encode: keep every per-step output for the decoder's attention.
    hidden = encoder.initHidden()
    encoder_outputs = torch.zeros(seg_length, encoder.hidden_size, device=device)
    for step in range(input_length):
        step_output, hidden = encoder(input_tensor[step], hidden)
        encoder_outputs[step] = step_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Decode: the encoder's final hidden state seeds the decoder.
    loss = 0
    for step in range(target_length):
        log_probs, hidden, _attention = decoder(decoder_input, hidden, encoder_outputs)
        loss += criterion(log_probs, target_tensor[step])
        if use_teacher_forcing:
            decoder_input = target_tensor[step]
        else:
            # Detach so the fed-back prediction is not part of the graph.
            decoder_input = log_probs.topk(1)[1].squeeze().detach()

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [15]:
"""
This is a helper function to print time elapsed and estimated time remaining given the current time and progress %.
"""
import time
import math


def asMinutes(s):
    """Format a duration of `s` seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for work started at `since`.

    `percent` is the fraction of work completed (must be non-zero); the
    remaining time is extrapolated linearly from the elapsed time.
    """
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [16]:
"""
The whole training process looks like this:

* Start a timer

* Initialize optimizers and criterion

* Create set of training pairs

* Start empty losses array for plotting

Then we call train many times and occasionally print the progress (% of examples, time so far, estimated time) and average loss.
"""
def trainIters(encoder, decoder, segments, epochs=50, print_every=500, plot_every=100, learning_rate=0.001):
    """Train the encoder/decoder on `segments` for `epochs` passes.

    Each segment is converted once to an (input, target) tensor pair and
    `train` is called on every pair, in order, in every epoch. Both models
    are optimised with SGD at `learning_rate` using NLL loss.

    Every `print_every` iterations of an epoch a progress line is printed
    (elapsed / estimated remaining time, iteration, percent of the epoch,
    average loss). Every `plot_every` iterations an average loss is recorded,
    and the recorded averages are passed to `showPlot` at the end.

    Args:
        encoder, decoder: the models to train.
        segments: list of segments (each a list of words known to `lang`).
        epochs: number of passes over `segments`.
        print_every: iterations between progress lines.
        plot_every: iterations between recorded plot points.
        learning_rate: SGD learning rate for both optimisers.
    """
    start = time.time()
    plot_losses = []
    # Running sums together with the number of losses they contain, so the
    # averages stay correct when an epoch length is not a multiple of
    # print_every / plot_every and a remainder carries over to the next epoch.
    print_loss_total = 0
    print_loss_count = 0
    plot_loss_total = 0
    plot_loss_count = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [pairTensorsFromSentence(seg)
                      for seg in segments]
    criterion = nn.NLLLoss()

    n_iters = len(segments)
    total_steps = epochs * n_iters
    for epoch in range(1, epochs + 1):
        print(f'epoch{epoch}/{epochs}:')
        for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            print_loss_count += 1
            plot_loss_total += loss
            plot_loss_count += 1

            if step % print_every == 0:
                print_loss_avg = print_loss_total / print_loss_count
                print_loss_total = 0
                print_loss_count = 0
                # `start` is the beginning of the whole run, so the remaining-time
                # estimate must use progress over all epochs, not just this one.
                steps_done = (epoch - 1) * n_iters + step
                print('%s (%d %d%%) %.4f' % (timeSince(start, steps_done / total_steps),
                                             step, step / n_iters * 100, print_loss_avg))

            if step % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_loss_count
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
                plot_loss_count = 0

    showPlot(plot_losses)

### Plotting results:

In [17]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot `points` as a line chart with y-axis ticks every 0.2.

    A single figure is created via ``plt.subplots()``. The previous extra
    ``plt.figure()`` call opened a second, empty figure on every call that
    was never used or closed.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [18]:
hidden_size = 128
encoder1 = EncoderRNN(lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, lang.n_words, dropout_p=0).to(device)



In [19]:
# fill model with the trained weights
model_path = "attn_decoder1 - 0.2028, 0.1772, 0.1773, 0.2168.pth"
attn_decoder1.load_state_dict(torch.load(model_path,  map_location=torch.device('cpu')))
# attn_decoder1 = model.to(device)

<All keys matched successfully>

In [20]:
# fill model with the trained weights
model_path = "encoder1 - 0.2028, 0.1772, 0.1773, 0.2168.pth"
encoder1.load_state_dict(torch.load(model_path,  map_location=torch.device('cpu')))


<All keys matched successfully>

In [21]:
# trainIters(encoder1, attn_decoder1, segments=train_segments, print_every=500)

In [22]:
# torch.save(encoder1.state_dict(), f"encoder1 - night 64 do1.pth")
# torch.save(attn_decoder1.state_dict(), f"attn_decoder1 - night 64 do1.pth")


### Evaluation:

In [23]:
"""
Evaluation mirrors the training loop, except that the decoder is fed its own predictions
back at every step instead of the target (no teacher forcing).
Decoding always runs for the full segment length; the EOS early-stop is disabled (commented out).
At each step we record the predicted word and the decoder's attention weights, and we score how far down
the decoder's ranked predictions the true input command appears. These per-step ranks are summed into `mistakes`.
"""
def evaluate(encoder, decoder, sentence, seg_length=segment_len, top_k=50, miss_penalty=100):
    """Reconstruct `sentence` with the autoencoder and score the reconstruction.

    The segment is encoded, then decoded for exactly `seg_length` steps,
    feeding the decoder's own top prediction back as the next input. At each
    step the rank (0-based) of the true input word among the decoder's
    `top_k` highest-scoring words is added to `mistakes`; if the true word is
    not among them, `miss_penalty` is added instead.

    Args:
        encoder, decoder: trained models.
        sentence: list of words known to `lang`; it is indexed for
            `seg_length` positions, so it must contain exactly `seg_length`
            words (shorter or longer input raises IndexError).
        seg_length: number of decoding steps / encoder positions.
        top_k: how many ranked predictions are searched for the true word
            (capped at the decoder's output size).
        miss_penalty: score added when the true word is not in the top `top_k`.

    Returns:
        ``(decoded_words, decoder_attentions, mistakes)``: the predicted
        words, a ``(seg_length, seg_length)`` tensor of attention weights and
        the integer mistake score.
    """
    mistakes = 0
    with torch.no_grad():
        input_tensor = tensorFromSentence(lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(seg_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(seg_length, seg_length)

        for di in range(seg_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)

            # Rank of the true word among the best predictions.
            # .tolist() works for CPU and CUDA tensors alike, whereas the
            # previous .numpy() call raised when the model ran on a GPU.
            k = min(top_k, decoder_output.size(-1))
            ranked = decoder_output.data.topk(k)[1].view(-1).tolist()
            desired_i = input_tensor[di].item()
            if desired_i in ranked:
                mistakes += ranked.index(desired_i)
            else:
                mistakes += miss_penalty

            decoded_words.append(lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        # The loop always runs seg_length steps, so this is the full matrix.
        return decoded_words, decoder_attentions[:seg_length], mistakes

In [24]:
import pandas as pd
challengeToFill = pd.read_csv('challengeToFill.csv', index_col=False)
challengeToFillFilledSeq2Seqmodel = challengeToFill.copy()


# Evaluate on validation set:

In [25]:
# Iterate over the validation part of the challenge CSV: rows 0-9 (one per
# validation user) and column positions 51-150, collected user by user so the
# order matches `val_segments`.
y_val = [
    challengeToFillFilledSeq2Seqmodel.iloc[user_ind, segment_ind]
    for user_ind in range(0, 10)
    for segment_ind in range(51, 151)
]

In [39]:
from tqdm import tqdm

# Confusion-matrix counters for the validation set. A segment raises an alert
# when its reconstruction mistake score exceeds the threshold below; label 0
# in `y_val` means the segment is legitimate.
TP = TN = FP = FN = 0
for ind, seg in tqdm(enumerate(val_segments), total=len(val_segments)):
    output_words, attentions, mistakes = evaluate(encoder1, attn_decoder1, seg)
    alert = mistakes > 2500
    if y_val[ind] == 0:  # legitimate
        if alert:
            FP += 1
        else:
            TN += 1
    elif alert:
        TP += 1
    else:
        FN += 1

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:01<00:00,  8.23it/s]


In [29]:
#mistakes>500: 
print(f'TP:{TP}, TN:{TN}, FP:{FP}, FN:{FN}')
grade = 9*TP + TN
print(grade)

TP:95, TN:110, FP:790, FN:5
965


In [38]:
#mistakes>1500: 
print(f'TP:{TP}, TN:{TN}, FP:{FP}, FN:{FN}')
grade = 9*TP + TN
print(grade)

TP:90, TN:189, FP:711, FN:10
999


In [40]:
#mistakes>2500: 
print(f'TP:{TP}, TN:{TN}, FP:{FP}, FN:{FN}')
grade = 9*TP + TN
print(grade)

TP:74, TN:268, FP:632, FN:26
934


# Evaluate on test set:

In [33]:
# Predict every test segment: 1 (alert) when the mistake score exceeds 500,
# otherwise 0.
y_preds = []
for ind, seg in tqdm(enumerate(test_segments), total=len(test_segments)):
    output_words, attentions, mistakes = evaluate(encoder1, attn_decoder1, seg)
    alert = 1 if mistakes > 500 else 0
    y_preds.append(alert)

# Write the predictions into the challenge table: rows 10-39 (test users),
# column positions 51-150, in the same user-major order as `test_segments`.
preds_counter = 0
for user_ind in range(10, 40):
    for segment_ind in range(51, 151):
        prediction = y_preds[preds_counter]
        challengeToFillFilledSeq2Seqmodel.iloc[user_ind, segment_ind] = prediction
        preds_counter += 1
#         print(user_ind, segment_ind-1, label)
        

100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [05:58<00:00,  8.37it/s]


In [34]:
challengeToFillFilledSeq2Seqmodel.to_csv('challengeToFill_filled_Seq2Seq_all_users500mistakes.csv')

In [41]:
# Predict every test segment: 1 (alert) when the mistake score exceeds 2500,
# otherwise 0.
y_preds = []
for ind, seg in tqdm(enumerate(test_segments), total=len(test_segments)):
    output_words, attentions, mistakes = evaluate(encoder1, attn_decoder1, seg)
    alert = 1 if mistakes > 2500 else 0
    y_preds.append(alert)

# Write the predictions into the challenge table: rows 10-39 (test users),
# column positions 51-150, in the same user-major order as `test_segments`.
preds_counter = 0
for user_ind in range(10, 40):
    for segment_ind in range(51, 151):
        prediction = y_preds[preds_counter]
        challengeToFillFilledSeq2Seqmodel.iloc[user_ind, segment_ind] = prediction
        preds_counter += 1
#         print(user_ind, segment_ind-1, label)
        

100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [05:22<00:00,  9.30it/s]


In [42]:
challengeToFillFilledSeq2Seqmodel.to_csv('challengeToFill_filled_Seq2Seq_all_users2500mistakes.csv')

In [43]:
# Predict every test segment: 1 (alert) when the mistake score exceeds 3500,
# otherwise 0.
y_preds = []
for ind, seg in tqdm(enumerate(test_segments), total=len(test_segments)):
    output_words, attentions, mistakes = evaluate(encoder1, attn_decoder1, seg)
    alert = 1 if mistakes > 3500 else 0
    y_preds.append(alert)

# Write the predictions into the challenge table: rows 10-39 (test users),
# column positions 51-150, in the same user-major order as `test_segments`.
preds_counter = 0
for user_ind in range(10, 40):
    for segment_ind in range(51, 151):
        prediction = y_preds[preds_counter]
        challengeToFillFilledSeq2Seqmodel.iloc[user_ind, segment_ind] = prediction
        preds_counter += 1
#         print(user_ind, segment_ind-1, label)
        

100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [05:39<00:00,  8.84it/s]


In [44]:
challengeToFillFilledSeq2Seqmodel.to_csv('challengeToFill_filled_Seq2Seq_all_users3500mistakes.csv')